# Word Embeddings notícias de 2018

Os dados são notícias de 2018, coletadas dos principais jornais do Brasil:
* `Carta Capital`, 
* `El Pais`,
* `Estadao`, 
* `Folha de São Paulo`, 
*  `Gazeta do Povo`,
* `O Antagonista`, 
* `O Globo`, 
* `Veja`

Uma análise detalhada dos dados está disponível [aqui](https://pages.github.com/). Objetivo deste notebook é utilizar o modelo word2vec para gerar embeddings a partir dos textos dessas notícias. A arquitetura utilizada pelo modelo é a skip-gram, cada palavra é representada por um vetor de 300 dimensões.

In [1]:
# importing modules and setting log format
import re
import nltk
import gensim, logging
import pandas as pd
from nltk.corpus import stopwords
from pymongo import MongoClient
nltk.download('stopwords')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
PUNCTUATION = u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ%]' # define news punctuation 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/diogoflorencio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Definindo Lexicons e Funções

In [2]:
# Mapping words in lexicons
map_lexicons = {'a ponto':'a_ponto','ao menos ':'ao_menos ','ate mesmo ':'ate_mesmo ',
                'nao mais que ':'nao_mais_que ','nem mesmo ':'nem_mesmo ','no minimo ':'no_minimo ',
                'o unico ':'o_unico ','a unica ':'a_unica ','pelo menos ':'pelo_menos ',
                'quando menos ':'quando_menos ','quando muito ':'quando_muito ','a par disso ':'a_par_disso ',
                'e nao ':'e_nao ','em suma ':'em_suma ','mas tambem ': 'mas_tambem ','muito menos ':'muito_menos ',
                'nao so ':'nao_so ','ou mesmo ':'ou_mesmo ','por sinal ':'por_sinal ','com isso ':'com_isso ',
                'como consequencia ':'como_consequencia ','de modo que ':'de_modo_que ','deste modo ':'deste_modo ',
                'em decorrencia ':'em_decorrencia ','nesse sentido ':'nesse_sentido ','por causa ':'por_causa ',
                'por conseguinte ':'por_conseguinte ','por essa razao ':'por_essa_razao ','por isso ':'por_isso ',
                'sendo assim ':'sendo_assim ','ou entao ':'ou_entao ','ou mesmo ':'ou_mesmo ','como se ':'como_se ',
                'de um lado ':'de_um_lado ','por outro lado ':'por_outro_lado ','mais que ':'mais_que ',
                'menos que ':'menos_que ','desde que ':'desde_que ','do contrario ':'do_contrario ',
                'em lugar ':'em_lugar ','em vez ':'em_vez','no caso ':'no_caso ','se acaso ':'se_acaso ',
                'de certa forma ':'de_certa_forma ','desse modo ':'desse_modo ','em funcao ':'em_funcao ',
                'isso e ':'isso_e ','ja que ':'ja_que ','na medida que ':'na_medida_que ','nessa direcao ':'nessa_direcao ',
                'no intuito ':'no_intuito ','no mesmo sentido ':'no_mesmo_sentido ','ou seja ':'ou_seja ',
                'uma vez que ':'uma_vez_que ','tanto que ':'tanto_que ','visto que ':'visto_que ','ainda que ':'ainda_que ',
                'ao contrario ':'ao_contrario ','apesar de ':'apesar_de ','fora isso ':'fora_isso ','mesmo que ':'mesmo_que ',
                'nao obstante ':'nao_obstante ','nao fosse isso ':'nao_fosse_isso ','no entanto ':'no_entanto ',
                'para tanto ':'para_tanto ','pelo contrario ':'pelo_contrario ','por sua vez ':'por_sua_vez ','posto que ':'posto_que '
               }

In [3]:
# Convert word from text into lexicons
def word2lexicon(text):
    for k, v in map_lexicons.items():
        text = str(text).replace(k,v)
    return text

In [4]:
# function for processing sentences
def processSentences(text):
    stop_words = stopwords.words('portuguese') # load stop words
    text = re.sub(PUNCTUATION, ' ', str(text)) # remove punctuation from text
    text = str(text).split() # split sentences by words
    text = [word for word in text if word not in stop_words] # Remove stopwords
    return text

### Carregando Notícias

In [5]:
host_ip = '192.168.1.7'
# init mongo client
client = MongoClient(host_ip, 27017)
#select db
db = client['news_2018']
# load data
carta_capital = pd.DataFrame(list(db.get_collection('carta_capital').find()))
el_pais = pd.DataFrame(list(db.get_collection('el_pais').find()))
estadao = pd.DataFrame(list(db.get_collection('estadao').find()))
folha = pd.DataFrame(list(db.get_collection('folha').find()))
gazeta_do_povo = pd.DataFrame(list(db.get_collection('gazeta_do_povo').find()))
oantagonista = pd.DataFrame(list(db.get_collection('oantagonista').find()))
oglobo = pd.DataFrame(list(db.get_collection('oglobo').find()))
veja = pd.DataFrame(list(db.get_collection('veja').find()))

# concat all news
news = pd.concat((carta_capital, el_pais, estadao, folha, gazeta_do_povo, oantagonista, oglobo, veja), sort=False, ignore_index=True)

In [6]:
# processing news text
news['text'] = news['text'].apply(word2lexicon) 
news['text'] = news['text'].apply(processSentences)

### Treinando Word2Vec

In [7]:
# Train word2vec model - settings: approach skip-gram, size embeddings vectors 300 
model = gensim.models.Word2Vec(news['text'], workers=4, size=300, sg=1, window=5, min_count=5)
# Saving model
model.save('embeddings/news_w2v.bin')
# Saving embeddings
model.wv.save_word2vec_format("embeddings/news_vectors.bin")

2020-01-04 13:23:01,814 : INFO : collecting all words and their counts
2020-01-04 13:23:01,815 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-04 13:23:02,609 : INFO : PROGRESS: at sentence #10000, processed 4407206 words, keeping 121977 word types
2020-01-04 13:23:02,718 : INFO : PROGRESS: at sentence #20000, processed 4964151 words, keeping 133083 word types
2020-01-04 13:23:03,198 : INFO : PROGRESS: at sentence #30000, processed 7305701 words, keeping 201099 word types
2020-01-04 13:23:03,732 : INFO : PROGRESS: at sentence #40000, processed 10163400 words, keeping 224359 word types
2020-01-04 13:23:04,174 : INFO : PROGRESS: at sentence #50000, processed 12577159 words, keeping 238519 word types
2020-01-04 13:23:04,689 : INFO : PROGRESS: at sentence #60000, processed 15407162 words, keeping 256496 word types
2020-01-04 13:23:05,332 : INFO : PROGRESS: at sentence #70000, processed 18811506 words, keeping 275933 word types
2020-01-04 13:23:05,944 : IN

2020-01-04 13:24:26,624 : INFO : EPOCH 1 - PROGRESS: at 22.18% examples, 253467 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:24:27,626 : INFO : EPOCH 1 - PROGRESS: at 22.64% examples, 253200 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:24:28,683 : INFO : EPOCH 1 - PROGRESS: at 23.19% examples, 253281 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:24:29,684 : INFO : EPOCH 1 - PROGRESS: at 23.77% examples, 253465 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:24:30,763 : INFO : EPOCH 1 - PROGRESS: at 24.23% examples, 252831 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:24:31,832 : INFO : EPOCH 1 - PROGRESS: at 24.80% examples, 252843 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:24:32,833 : INFO : EPOCH 1 - PROGRESS: at 25.32% examples, 253028 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:24:33,836 : INFO : EPOCH 1 - PROGRESS: at 25.87% examples, 253358 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:24:34,872 : INFO : EPOCH 1 - PROGRESS: at 26.38% examples, 253132 words/s, in_qsiz

2020-01-04 13:25:41,911 : INFO : EPOCH 1 - PROGRESS: at 66.31% examples, 254037 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:25:42,940 : INFO : EPOCH 1 - PROGRESS: at 66.77% examples, 254245 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:25:43,984 : INFO : EPOCH 1 - PROGRESS: at 67.17% examples, 254113 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:25:44,991 : INFO : EPOCH 1 - PROGRESS: at 67.56% examples, 253984 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:25:46,001 : INFO : EPOCH 1 - PROGRESS: at 67.96% examples, 253996 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:25:47,012 : INFO : EPOCH 1 - PROGRESS: at 68.38% examples, 254024 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:25:48,081 : INFO : EPOCH 1 - PROGRESS: at 68.84% examples, 254146 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:25:49,094 : INFO : EPOCH 1 - PROGRESS: at 69.26% examples, 254142 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:25:50,094 : INFO : EPOCH 1 - PROGRESS: at 69.61% examples, 254015 words/s, in_qsiz

2020-01-04 13:26:54,081 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-04 13:26:54,112 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-04 13:26:54,112 : INFO : EPOCH - 1 : training on 49593007 raw words (48639129 effective words) took 192.0s, 253311 effective words/s
2020-01-04 13:26:55,176 : INFO : EPOCH 2 - PROGRESS: at 0.36% examples, 228770 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:26:56,196 : INFO : EPOCH 2 - PROGRESS: at 0.61% examples, 241847 words/s, in_qsize 8, out_qsize 1
2020-01-04 13:26:57,238 : INFO : EPOCH 2 - PROGRESS: at 0.87% examples, 246865 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:26:58,239 : INFO : EPOCH 2 - PROGRESS: at 1.12% examples, 246358 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:26:59,261 : INFO : EPOCH 2 - PROGRESS: at 1.49% examples, 250625 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:27:00,275 : INFO : EPOCH 2 - PROGRESS: at 1.77% examples, 248392 words/s, in_qsize 8, out_qsize 0
2020-

2020-01-04 13:28:07,313 : INFO : EPOCH 2 - PROGRESS: at 34.19% examples, 253279 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:28:08,336 : INFO : EPOCH 2 - PROGRESS: at 34.56% examples, 253173 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:28:09,364 : INFO : EPOCH 2 - PROGRESS: at 34.92% examples, 253184 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:28:10,367 : INFO : EPOCH 2 - PROGRESS: at 35.27% examples, 253143 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:28:11,381 : INFO : EPOCH 2 - PROGRESS: at 35.60% examples, 253081 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:28:12,423 : INFO : EPOCH 2 - PROGRESS: at 35.92% examples, 253032 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:28:13,448 : INFO : EPOCH 2 - PROGRESS: at 36.26% examples, 252994 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:28:14,464 : INFO : EPOCH 2 - PROGRESS: at 36.69% examples, 252918 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:28:15,504 : INFO : EPOCH 2 - PROGRESS: at 37.13% examples, 252998 words/s, in_qsiz

2020-01-04 13:29:22,967 : INFO : EPOCH 2 - PROGRESS: at 77.87% examples, 253472 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:29:24,005 : INFO : EPOCH 2 - PROGRESS: at 78.31% examples, 253494 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:29:25,064 : INFO : EPOCH 2 - PROGRESS: at 78.75% examples, 253501 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:29:26,072 : INFO : EPOCH 2 - PROGRESS: at 79.17% examples, 253519 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:29:27,091 : INFO : EPOCH 2 - PROGRESS: at 79.54% examples, 253393 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:29:28,148 : INFO : EPOCH 2 - PROGRESS: at 79.96% examples, 253334 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:29:29,150 : INFO : EPOCH 2 - PROGRESS: at 80.35% examples, 253354 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:29:30,195 : INFO : EPOCH 2 - PROGRESS: at 80.72% examples, 253187 words/s, in_qsize 8, out_qsize 1
2020-01-04 13:29:31,197 : INFO : EPOCH 2 - PROGRESS: at 81.09% examples, 253047 words/s, in_qsiz

2020-01-04 13:30:34,424 : INFO : EPOCH 3 - PROGRESS: at 14.37% examples, 252146 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:30:35,455 : INFO : EPOCH 3 - PROGRESS: at 14.76% examples, 252053 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:30:36,478 : INFO : EPOCH 3 - PROGRESS: at 15.67% examples, 251812 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:30:37,530 : INFO : EPOCH 3 - PROGRESS: at 16.11% examples, 251590 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:30:38,560 : INFO : EPOCH 3 - PROGRESS: at 16.47% examples, 251586 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:30:39,579 : INFO : EPOCH 3 - PROGRESS: at 16.87% examples, 251710 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:30:40,621 : INFO : EPOCH 3 - PROGRESS: at 17.23% examples, 251346 words/s, in_qsize 8, out_qsize 1
2020-01-04 13:30:41,684 : INFO : EPOCH 3 - PROGRESS: at 17.65% examples, 251372 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:30:42,761 : INFO : EPOCH 3 - PROGRESS: at 18.05% examples, 251234 words/s, in_qsiz

2020-01-04 13:31:50,038 : INFO : EPOCH 3 - PROGRESS: at 45.85% examples, 251697 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:31:51,092 : INFO : EPOCH 3 - PROGRESS: at 46.27% examples, 251762 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:31:52,104 : INFO : EPOCH 3 - PROGRESS: at 46.64% examples, 251814 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:31:53,151 : INFO : EPOCH 3 - PROGRESS: at 47.01% examples, 251714 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:31:54,173 : INFO : EPOCH 3 - PROGRESS: at 47.40% examples, 251859 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:31:55,181 : INFO : EPOCH 3 - PROGRESS: at 47.80% examples, 252106 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:31:56,221 : INFO : EPOCH 3 - PROGRESS: at 48.16% examples, 252092 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:31:57,260 : INFO : EPOCH 3 - PROGRESS: at 51.02% examples, 252198 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:31:58,264 : INFO : EPOCH 3 - PROGRESS: at 53.68% examples, 252293 words/s, in_qsiz

2020-01-04 13:33:05,377 : INFO : EPOCH 3 - PROGRESS: at 93.64% examples, 251598 words/s, in_qsize 6, out_qsize 1
2020-01-04 13:33:06,401 : INFO : EPOCH 3 - PROGRESS: at 94.11% examples, 251553 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:33:07,430 : INFO : EPOCH 3 - PROGRESS: at 94.63% examples, 251613 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:33:08,468 : INFO : EPOCH 3 - PROGRESS: at 95.16% examples, 251659 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:33:09,505 : INFO : EPOCH 3 - PROGRESS: at 95.65% examples, 251592 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:33:10,560 : INFO : EPOCH 3 - PROGRESS: at 96.27% examples, 251562 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:33:11,569 : INFO : EPOCH 3 - PROGRESS: at 96.97% examples, 251599 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:33:12,577 : INFO : EPOCH 3 - PROGRESS: at 97.32% examples, 251498 words/s, in_qsize 8, out_qsize 1
2020-01-04 13:33:13,641 : INFO : EPOCH 3 - PROGRESS: at 97.78% examples, 251506 words/s, in_qsiz

2020-01-04 13:34:17,165 : INFO : EPOCH 4 - PROGRESS: at 28.39% examples, 253266 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:34:18,217 : INFO : EPOCH 4 - PROGRESS: at 28.68% examples, 252899 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:34:19,240 : INFO : EPOCH 4 - PROGRESS: at 29.01% examples, 252889 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:34:20,267 : INFO : EPOCH 4 - PROGRESS: at 29.37% examples, 252868 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:34:21,278 : INFO : EPOCH 4 - PROGRESS: at 29.68% examples, 252735 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:34:22,354 : INFO : EPOCH 4 - PROGRESS: at 30.06% examples, 252648 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:34:23,438 : INFO : EPOCH 4 - PROGRESS: at 30.42% examples, 252561 words/s, in_qsize 8, out_qsize 1
2020-01-04 13:34:24,464 : INFO : EPOCH 4 - PROGRESS: at 30.77% examples, 252672 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:34:25,472 : INFO : EPOCH 4 - PROGRESS: at 31.16% examples, 252684 words/s, in_qsiz

2020-01-04 13:35:33,102 : INFO : EPOCH 4 - PROGRESS: at 71.38% examples, 252724 words/s, in_qsize 6, out_qsize 1
2020-01-04 13:35:34,151 : INFO : EPOCH 4 - PROGRESS: at 71.82% examples, 252757 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:35:35,186 : INFO : EPOCH 4 - PROGRESS: at 72.21% examples, 252666 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:35:36,197 : INFO : EPOCH 4 - PROGRESS: at 72.61% examples, 252625 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:35:37,222 : INFO : EPOCH 4 - PROGRESS: at 73.02% examples, 252627 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:35:38,254 : INFO : EPOCH 4 - PROGRESS: at 73.41% examples, 252600 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:35:39,314 : INFO : EPOCH 4 - PROGRESS: at 73.81% examples, 252541 words/s, in_qsize 8, out_qsize 1
2020-01-04 13:35:40,379 : INFO : EPOCH 4 - PROGRESS: at 74.23% examples, 252526 words/s, in_qsize 8, out_qsize 1
2020-01-04 13:35:41,448 : INFO : EPOCH 4 - PROGRESS: at 74.66% examples, 252518 words/s, in_qsiz

2020-01-04 13:36:45,434 : INFO : EPOCH 5 - PROGRESS: at 3.47% examples, 243318 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:36:46,450 : INFO : EPOCH 5 - PROGRESS: at 3.78% examples, 242766 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:36:47,463 : INFO : EPOCH 5 - PROGRESS: at 4.06% examples, 243530 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:36:48,530 : INFO : EPOCH 5 - PROGRESS: at 4.32% examples, 242866 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:36:49,568 : INFO : EPOCH 5 - PROGRESS: at 4.56% examples, 244791 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:36:50,596 : INFO : EPOCH 5 - PROGRESS: at 4.80% examples, 244086 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:36:51,603 : INFO : EPOCH 5 - PROGRESS: at 6.28% examples, 244417 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:36:52,622 : INFO : EPOCH 5 - PROGRESS: at 8.84% examples, 244714 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:36:53,628 : INFO : EPOCH 5 - PROGRESS: at 11.16% examples, 245134 words/s, in_qsize 7, out

2020-01-04 13:38:01,266 : INFO : EPOCH 5 - PROGRESS: at 39.34% examples, 248747 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:38:02,313 : INFO : EPOCH 5 - PROGRESS: at 39.77% examples, 248640 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:38:03,369 : INFO : EPOCH 5 - PROGRESS: at 40.21% examples, 248733 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:38:04,451 : INFO : EPOCH 5 - PROGRESS: at 40.63% examples, 248749 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:38:05,521 : INFO : EPOCH 5 - PROGRESS: at 41.07% examples, 248804 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:38:06,585 : INFO : EPOCH 5 - PROGRESS: at 41.51% examples, 248874 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:38:07,602 : INFO : EPOCH 5 - PROGRESS: at 41.90% examples, 248844 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:38:08,665 : INFO : EPOCH 5 - PROGRESS: at 42.31% examples, 248798 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:38:09,671 : INFO : EPOCH 5 - PROGRESS: at 42.72% examples, 248897 words/s, in_qsiz

2020-01-04 13:39:17,102 : INFO : EPOCH 5 - PROGRESS: at 83.02% examples, 249669 words/s, in_qsize 7, out_qsize 0
2020-01-04 13:39:18,140 : INFO : EPOCH 5 - PROGRESS: at 83.35% examples, 249699 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:39:19,166 : INFO : EPOCH 5 - PROGRESS: at 84.31% examples, 249652 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:39:20,209 : INFO : EPOCH 5 - PROGRESS: at 84.95% examples, 249596 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:39:21,226 : INFO : EPOCH 5 - PROGRESS: at 87.19% examples, 249819 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:39:22,254 : INFO : EPOCH 5 - PROGRESS: at 87.85% examples, 249857 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:39:23,336 : INFO : EPOCH 5 - PROGRESS: at 88.39% examples, 249839 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:39:24,437 : INFO : EPOCH 5 - PROGRESS: at 88.90% examples, 249795 words/s, in_qsize 8, out_qsize 0
2020-01-04 13:39:25,527 : INFO : EPOCH 5 - PROGRESS: at 89.43% examples, 249784 words/s, in_qsiz

In [8]:
print(model.wv.most_similar(positive=[u'pontes'], negative=[u'presidente']))

2020-01-04 13:40:23,902 : INFO : precomputing L2-norms of word weight vectors


[('viadutos', 0.46708548069000244), ('marcos', 0.31245994567871094), ('ciclovias', 0.29528993368148804), ('piscinão', 0.293051153421402), ('bueiros', 0.2902679443359375), ('asfaltadas', 0.2897021770477295), ('esburacadas', 0.28716373443603516), ('túneis', 0.2824742794036865), ('ciclofaixas', 0.2821462154388428), ('interliga', 0.27868783473968506)]


In [9]:
from gensim.models import Word2Vec
new_model = Word2Vec.load('embeddings/news_w2v.bin')

2020-01-04 13:40:24,682 : INFO : loading Word2Vec object from embeddings/news_w2v.bin
2020-01-04 13:40:25,156 : INFO : loading wv recursively from embeddings/news_w2v.bin.wv.* with mmap=None
2020-01-04 13:40:25,157 : INFO : loading vectors from embeddings/news_w2v.bin.wv.vectors.npy with mmap=None
2020-01-04 13:40:25,479 : INFO : setting ignored attribute vectors_norm to None
2020-01-04 13:40:25,480 : INFO : loading vocabulary recursively from embeddings/news_w2v.bin.vocabulary.* with mmap=None
2020-01-04 13:40:25,481 : INFO : loading trainables recursively from embeddings/news_w2v.bin.trainables.* with mmap=None
2020-01-04 13:40:25,481 : INFO : loading syn1neg from embeddings/news_w2v.bin.trainables.syn1neg.npy with mmap=None
2020-01-04 13:40:25,808 : INFO : setting ignored attribute cum_table to None
2020-01-04 13:40:25,808 : INFO : loaded embeddings/news_w2v.bin


In [10]:
print(new_model.wv.most_similar(positive=[u'guedes']))

2020-01-04 13:40:26,115 : INFO : precomputing L2-norms of word weight vectors


[('Guedes', 0.7349788546562195), ('superministro', 0.6698384881019592), ('ultraliberal', 0.598595380783081), ('guru', 0.5823590755462646), ('arida', 0.5743707418441772), ('giambiagi', 0.573410153388977), ('superministérios', 0.5695195198059082), ('armínio', 0.5653104782104492), ('savona', 0.5627946853637695), ('pérsio', 0.5608222484588623)]
