In [1]:
import math
import pandas as pd
import re
from functools import reduce

In [2]:
def calculate_corpus(general_glossary, glossary):
    v = [0] * len(general_glossary)
    for word in glossary['Termino']:
        v[general_glossary[word]] = 1
    return v

def calculate_corpus_new(general_glossary, new):
    v = [0] * len(general_glossary)
    for word in new:
        try:
            v[general_glossary[word]] += 1
        except:
            pass
    return v
        
def cos_distance(v1, v2):
    n = sum(map(lambda t: t[0]*t[1], zip(v1, v2)))
    dist_v1 = math.sqrt(sum(map(lambda x: x*x, v1)))
    dist_v2 = math.sqrt(sum(map(lambda x: x*x, v2)))
    d = dist_v1 * dist_v2
    return n/d if d != 0 else -1
    
def read_csv(path, sep):
    csv = pd.read_csv(path, sep=sep)
    return csv

def read_file(path):
    return ''.join(str(e) for e in open(path, 'r', encoding='utf-8').readlines())

In [3]:
# We use the news in ./texts for creating the glossaries that allows to classify the texts properly. We create the glossaries by using SimpleExtractor, by DAIL Software, UPM.

# Glossaries reading.
politics_glossary = read_csv('./extractions/politics.csv', ';')
sports_glossary = read_csv('./extractions/sports.csv', ';')
science_glossary = read_csv('./extractions/science.csv', ';')

# We create a list which contains the words of the glossaries.
common_glossary = set(list(politics_glossary['Termino']) + list(sports_glossary['Termino']) + list(science_glossary['Termino']))
only_words =  dict(map(reversed, enumerate(sorted(common_glossary))))

# Creation of corpus of each glossary.
politics_corpus = calculate_corpus(only_words, politics_glossary)
sports_corpus = calculate_corpus(only_words, sports_glossary)
science_corpus = calculate_corpus(only_words, science_glossary)

# We prepare a set of news to classify. They are in ./predict.
news = [read_file('./predict/politics-01.txt'), read_file('./predict/politics-02.txt'),
            read_file('./predict/sports-01.txt'), read_file('./predict/sports-02.txt'),
            read_file('./predict/science-01.txt'), read_file('./predict/science-02.txt')]

# Classification process.
news_classification = dict(
    map(lambda nombre: (nombre, 0), ('politics', 'sports', 'science'))
)

pattern = re.compile(r'\w+')
for new in news:
    new_words = pattern.findall(new)
    new_corpus = calculate_corpus_new(only_words, new_words)
    print(new_corpus)

    cos_politics = cos_distance(politics_corpus, new_corpus)
    cos_sports = cos_distance(sports_corpus, new_corpus)
    cos_science = cos_distance(science_corpus, new_corpus)
    
    if cos_politics > cos_sports:
        if cos_politics > cos_science:
            new_type = 'politics'
        else:
            new_type = 'science'
    else:
        if cos_sports > cos_science:
            new_type = 'sports'
        else:
            new_type = 'science'
            
    news_classification[new_type] += 1
    print('Type: ', new_type)
    
news_classification

[0, 0, 1, 0, 0, 7, 0, 0, 0, 1, 1, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 2, 0, 1, 0, 6, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 0, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 0, 1, 0, 0, 1, 0, 1]
Type:  politics
[0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1

{'politics': 2, 'sports': 2, 'science': 2}