In [1]:
import os

from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from scipy import spatial

from franc_lib import lexical

In [2]:
# Single text-preprocessing helper reused by every cell below (lowercasing,
# sentence splitting and word tokenisation).
# NOTE(review): `file_name` presumably names where the helper persists its
# output when a call is made with saving enabled -- confirm in franc_lib.
normalizer = lexical.Preprocessing(
    file_name='corpus_politics',
)

In [3]:
# Root directory holding one sub-directory of text files per topic.
corpora_path = 'data/corpora'


def _list_topic_files(topic):
    """Return the paths of the files stored under ``corpora_path/topic``.

    The directory listing is sorted because ``os.listdir`` returns entries in
    arbitrary order; later cells index documents by position and tag them by
    their index, so a stable order is needed for reproducible results.
    The macOS ``.DS_Store`` metadata file is skipped.
    """
    topic_dir = '{}/{}/'.format(corpora_path, topic)
    return [
        '{}/{}/{}'.format(corpora_path, topic, name)
        for name in sorted(os.listdir(topic_dir))
        if name != '.DS_Store'
    ]


# NOTE(review): this list is named "tecnologia" but is read from the 'saude'
# directory; the name is kept because later cells reference it.
files_tecnologia = _list_topic_files('saude')
files_politica = _list_topic_files('politica')

# Word2Vec

### Data preparation

In [5]:
# Build the Word2Vec training corpus from the politics files: each line is
# lowercased, split into sentences, and each sentence is split into word
# tokens. `all_sentences` ends up as a list of token lists.
all_sentences = []
for path in files_politica:
    # Explicit UTF-8: the corpus contains accented Portuguese text, and the
    # default encoding of open() depends on the platform locale.
    with open(path, 'r', encoding='utf-8') as text_file:
        # Iterate the file lazily instead of loading every line up front.
        for raw_line in text_file:
            lowered = normalizer.lowercase(raw_line)
            for sentence in normalizer.tokenize_sentences(lowered, save=False):
                all_sentences.append(normalizer.tokenize_words(sentence, save=False))

print("Number of sentences: {}".format(len(all_sentences)))
print(all_sentences[:3])

Number of sentences: 17222
[['sobre', 'o', 'blogbastidores', 'e', 'informações', 'exclusivas', 'sobre', 'política', ',', 'economia', ',', 'negócios', ',', 'esporte', ',', 'cultura', '-', 'e', 'tudo', 'o', 'mais', 'que', 'for', 'relevantesobre', 'o', 'autorlauro', 'jardimcomeçou', 'no', 'jornalismo', 'em', '1989', ',', 'no', 'globo', '.'], ['passou', 'pelas', 'redações', 'de', 'istoé', ',', 'jb', 'e', 'exame', '.'], ['entre', '1998', 'e', 'setembro', 'de', '2015', ',', 'trabalhou', 'em', 'veja', ',', 'onde', 'foi', 'chefe', 'da', 'sucursal', 'do', 'rio', ',', 'redator-chefe', 'e', 'editor', 'da', 'coluna', 'radar', 'desde', '2000.', 'voltou', 'ao', 'globo', 'em', '2015athos', 'mouracomeçou', 'a', 'carreira', 'no', 'globo', 'em', '2009.', 'é', 'amante', 'da', 'história', 'do', 'brasil', ',', 'e', 'se', 'interessa', 'por', 'todo', 'tipo', 'de', 'notícia', ',', 'dos', 'bastidores', 'da', 'política', 'nacional', 'aos', 'times', 'de', 'futebol', 'de', 'várzea', '.']]


### Training

In [5]:
# Word2Vec hyper-parameters:
#   size      - length of the vector that represents each word
#   window    - number of neighbouring words used as context
#   min_count - minimum number of occurrences for a word to be considered
#   workers   - number of worker threads used for training
# NOTE(review): despite the "tecnologia" suffix, this model is trained on
# `all_sentences`, which the data-preparation cell builds from files_politica.
w2vmodel_tecnologia = Word2Vec(
    all_sentences,
    size=200,
    window=5,
    min_count=3,
    workers=4,
)

In [6]:
# Application idea: a search system that finds not only the searched term but
# also words that act as its synonyms.
query_word = 'ministro'
w2vmodel_tecnologia.wv.most_similar(query_word)

[('presidente', 0.9792462587356567),
 ('presidido', 0.9756972789764404),
 ('stf', 0.9743261337280273),
 ('interino', 0.965011715888977),
 ('turismo', 0.9639201760292053),
 ('noronha', 0.9636476635932922),
 ('deputado', 0.9612151384353638),
 ('repúblicao', 0.9611080884933472),
 ('congresso.o', 0.9607368111610413),
 ('moro', 0.9596492052078247)]

# Doc2Vec
Reference: Le & Mikolov, "Distributed Representations of Sentences and Documents" — https://arxiv.org/pdf/1405.4053v2.pdf

### Data preparation

In [7]:
# Build the Doc2Vec corpus: one token list per file, with the files from
# files_tecnologia first and the files from files_politica last.
# Concatenate into a NEW list: aliasing files_tecnologia and calling .extend()
# on it would mutate that list in place, so every re-run of this cell would
# append the politics files again and duplicate documents.
all_files = files_tecnologia + files_politica

all_documents = []
for path in all_files:
    # Explicit UTF-8: the corpus contains accented Portuguese text, and the
    # default encoding of open() depends on the platform locale.
    with open(path, 'r', encoding='utf-8') as text_file:
        document = ' '.join(text_file.readlines())
    document = normalizer.lowercase(document)
    # NOTE(review): unlike the Word2Vec preparation cell, this call does not
    # pass save=False -- confirm whether the default has side effects.
    document_tokens = normalizer.tokenize_words(document)
    all_documents.append(document_tokens)
print("Number of documents: {}".format(len(all_documents)))

# Each document is tagged with its position in `all_documents`, as a string.
tagged_documents = [
    TaggedDocument(words=tokens, tags=[str(index)])
    for index, tokens in enumerate(all_documents)
]

Number of documents: 275


In [8]:
# Train a Doc2Vec model on the tagged documents: 20-dimensional document
# vectors, a context window of 2, no minimum-frequency cut-off beyond a single
# occurrence (min_count=1), and 4 worker threads.
d2vmodel = Doc2Vec(
    tagged_documents,
    vector_size=20,
    window=2,
    min_count=1,
    workers=4,
)

In [9]:
# Infer vectors for the first two and the last two documents. Given how
# `all_documents` is assembled, the first entries come from files_tecnologia
# and the last entries from files_politica.
# Inference order is kept as: first, second, last, second-to-last.
last = len(all_documents) - 1
vector_tec, vector_tec2 = [
    d2vmodel.infer_vector(all_documents[position]) for position in (0, 1)
]
vector_pol, vector_pol2 = [
    d2vmodel.infer_vector(all_documents[position]) for position in (last, last - 1)
]

In [10]:
# Cosine similarity (1 - cosine distance) between pairs of inferred document
# vectors, printed in this order:
#   1. politics vs. "tecnologia" document (different topics)
#   2. politics vs. politics              (same topic)
#   3. "tecnologia" vs. "tecnologia"      (same topic)
# This cell only reads the vectors already computed; to observe run-to-run
# variation, re-run the cell that calls d2vmodel.infer_vector before this one.
# `spatial` is imported in the notebook's top import cell.
vector_pairs = (
    (vector_pol, vector_tec),
    (vector_pol, vector_pol2),
    (vector_tec, vector_tec2),
)
for left, right in vector_pairs:
    print(1 - spatial.distance.cosine(left, right))

-0.35652104020118713
0.9581257700920105
1.0
