In [1]:
import os

from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from scipy import spatial

from nlputils import lexical

In [2]:
# Shared text normalizer; later cells call its lowercase(), tokenize_sentences()
# and tokenize_words() methods on the raw corpus text.
normalizer = lexical.Preprocessing()

In [3]:
corpora_path = 'data/corpora'


def list_corpus_files(category, root=corpora_path):
    """Return the sorted paths of the text files stored under ``root/category``.

    Args:
        category: Name of the corpus sub-directory (e.g. ``'tecnologia'``).
        root: Directory that contains one sub-directory per category.

    Returns:
        A new list of file paths, sorted by file name. ``os.listdir`` makes no
        ordering guarantee, so sorting keeps positional lookups into the
        resulting corpus stable across runs and machines.

    Raises:
        FileNotFoundError: If ``root/category`` does not exist.
    """
    category_dir = os.path.join(root, category)
    paths = []
    for name in sorted(os.listdir(category_dir)):
        # Skip the macOS Finder metadata file.
        if name == '.DS_Store':
            continue
        path = os.path.join(category_dir, name)
        # Keep regular files only: a sub-directory cannot be opened as text.
        if os.path.isfile(path):
            paths.append(path)
    return paths


files_tecnologia = list_corpus_files('tecnologia')
files_politica = list_corpus_files('politica')

# Word2Vec

### Data preparation

In [4]:
# Build the Word2Vec training corpus from the tecnologia files: every line is
# passed through normalizer.lowercase, split with normalizer.tokenize_sentences,
# and each resulting sentence is turned into a token list with
# normalizer.tokenize_words.
all_sentences = []
for path in files_tecnologia:
    with open(path, 'r') as handle:
        for raw_line in handle:
            lowered = normalizer.lowercase(raw_line)
            all_sentences.extend(
                [normalizer.tokenize_words(sentence)
                 for sentence in normalizer.tokenize_sentences(lowered)]
            )

print("Number of sentences: {}".format(len(all_sentences)))

Number of sentences: 6199


### Training

In [5]:
# Train word embeddings on the tokenized tecnologia sentences.
# NOTE(review): `size` is the gensim 3.x keyword for the embedding dimension;
# gensim 4 renamed it to `vector_size` — confirm the installed version before
# changing it.
w2vmodel_tecnologia = Word2Vec(
    all_sentences,
    size=200,
    window=5,
    min_count=3,
    workers=4,
)

In [6]:
# Words closest to 'computador' in the trained embedding space; the bare
# expression is rendered as the cell output as (word, similarity) pairs.
w2vmodel_tecnologia.wv.most_similar('computador')

[('celular', 0.9963942766189575),
 ('acessar', 0.9922749400138855),
 ('reconfigurar', 0.9879945516586304),
 ('livremente', 0.9865565299987793),
 ('estiver', 0.9862324595451355),
 ('transmitido', 0.9858149886131287),
 ('tecla', 0.9855888485908508),
 ('mantenha', 0.9853802919387817),
 ('usuário', 0.9848452806472778),
 ('seu', 0.9838991165161133)]

# Doc2Vec
Reference: Le & Mikolov, "Distributed Representations of Sentences and Documents" — https://arxiv.org/pdf/1405.4053v2.pdf

### Data preparation

In [7]:
# Build the Doc2Vec corpus: one token list per file, tecnologia documents first
# and politica documents after them, then tag each document with its index.
all_documents = []
# Concatenate into a NEW list. Binding all_files to files_tecnologia and
# extending it in place would also append the politica paths to
# files_tecnologia, and would append them again on every re-run of this cell.
all_files = files_tecnologia + files_politica
for file in all_files:
    with open(file, 'r') as text_file:
        # Whole file as a single string (lines joined with a space).
        document = ' '.join(text_file.readlines())
        document = normalizer.lowercase(document)
        document_tokens = normalizer.tokenize_words(document)
        all_documents.append(document_tokens)
print("Number of documents: {}".format(len(all_documents)))
# The tag is the document's position in all_documents, stored as a string.
tagged_documents = [TaggedDocument(words=d, tags=[str(i)]) for i, d in enumerate(all_documents)]

Number of documents: 568


In [8]:
# Train document embeddings on the tagged documents from both corpora.
d2vmodel = Doc2Vec(
    tagged_documents,
    vector_size=20,
    window=2,
    min_count=1,
    workers=4,
)

In [9]:
# Infer vectors for two documents from each end of all_documents: the first two
# entries come from the tecnologia files and the last two from the politica
# files, because the corpus was built in that order.
vector_tec = d2vmodel.infer_vector(all_documents[0])
vector_tec2 = d2vmodel.infer_vector(all_documents[1])
vector_pol = d2vmodel.infer_vector(all_documents[-1])
vector_pol2 = d2vmodel.infer_vector(all_documents[-2])

In [10]:
# Cosine similarity (1 - cosine distance) between the inferred document vectors:
# the cross-topic pair first, then the politica pair, then the tecnologia pair.
# NOTE(review): the vectors are computed in the preceding cell, so re-running
# only this cell reprints the same numbers. Re-run the infer_vector cell as well
# to see how the scores vary between runs (infer_vector appears to be
# non-deterministic — confirm).
print(1 - spatial.distance.cosine(vector_pol, vector_tec))
print(1 - spatial.distance.cosine(vector_pol, vector_pol2))
print(1 - spatial.distance.cosine(vector_tec, vector_tec2))

0.8832724094390869
0.8805717825889587
0.688778281211853
