# Word Embeddings Comment's news 2018

Os dados são comentários de notícias coletadas em 2018 dos seguintes jornais do Brasil:

* `O Antagonista`, 
* `O Globo`, 
* `Veja`.

Uma análise detalhada dos dados está disponível [aqui](https://pages.github.com/). Objetivo deste notebook é utilizar o modelo word2vec para gerar embeddings a partir dos textos desses comentários. A arquitetura utilizada pelo modelo é a skip-gram, cada palavra é representada por um vetor de 300 dimensões.

In [1]:
# importing modules and setting log format
import re
import nltk
import gensim, logging
import pandas as pd
from nltk.corpus import stopwords
from pymongo import MongoClient
nltk.download('stopwords')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
PUNCTUATION = u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ%]' # define news punctuation 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/diogoflorencio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Definindo Lexicons e Funções

In [2]:
# loading abbreviation dic
with open('../dics/AB_dict') as f:
    abbreviation = f.readlines()
# process dic    
abbreviation = [x.split() for x in abbreviation]
abbreviation = {line[0]: line[1] for line in abbreviation}

# loading internet_slang dic
with open('../dics/IN_dict') as f:
    internet_slang = f.readlines()
# process dic    
internet_slang = [x.split() for x in internet_slang]
internet_slang = {line[0]: ' '.join(line[1:]) for line in internet_slang}

# loading foreign_word dic
with open('../dics/ES_dict') as f:
    foreign_word = f.readlines()
# process dic    
foreign_word = [x.split() for x in foreign_word]
foreign_word = [line for line in foreign_word if len(line) > 1] # selecting valid lines
foreign_word = {line[0]: ' '.join(line[1:]) for line in foreign_word}

# Mapping words in lexicons
map_lexicons = {'a ponto':'a_ponto','ao menos ':'ao_menos ','ate mesmo ':'ate_mesmo ',
                'nao mais que ':'nao_mais_que ','nem mesmo ':'nem_mesmo ','no minimo ':'no_minimo ',
                'o unico ':'o_unico ','a unica ':'a_unica ','pelo menos ':'pelo_menos ',
                'quando menos ':'quando_menos ','quando muito ':'quando_muito ','a par disso ':'a_par_disso ',
                'e nao ':'e_nao ','em suma ':'em_suma ','mas tambem ': 'mas_tambem ','muito menos ':'muito_menos ',
                'nao so ':'nao_so ','ou mesmo ':'ou_mesmo ','por sinal ':'por_sinal ','com isso ':'com_isso ',
                'como consequencia ':'como_consequencia ','de modo que ':'de_modo_que ','deste modo ':'deste_modo ',
                'em decorrencia ':'em_decorrencia ','nesse sentido ':'nesse_sentido ','por causa ':'por_causa ',
                'por conseguinte ':'por_conseguinte ','por essa razao ':'por_essa_razao ','por isso ':'por_isso ',
                'sendo assim ':'sendo_assim ','ou entao ':'ou_entao ','ou mesmo ':'ou_mesmo ','como se ':'como_se ',
                'de um lado ':'de_um_lado ','por outro lado ':'por_outro_lado ','mais que ':'mais_que ',
                'menos que ':'menos_que ','desde que ':'desde_que ','do contrario ':'do_contrario ',
                'em lugar ':'em_lugar ','em vez ':'em_vez','no caso ':'no_caso ','se acaso ':'se_acaso ',
                'de certa forma ':'de_certa_forma ','desse modo ':'desse_modo ','em funcao ':'em_funcao ',
                'isso e ':'isso_e ','ja que ':'ja_que ','na medida que ':'na_medida_que ','nessa direcao ':'nessa_direcao ',
                'no intuito ':'no_intuito ','no mesmo sentido ':'no_mesmo_sentido ','ou seja ':'ou_seja ',
                'uma vez que ':'uma_vez_que ','tanto que ':'tanto_que ','visto que ':'visto_que ','ainda que ':'ainda_que ',
                'ao contrario ':'ao_contrario ','apesar de ':'apesar_de ','fora isso ':'fora_isso ','mesmo que ':'mesmo_que ',
                'nao obstante ':'nao_obstante ','nao fosse isso ':'nao_fosse_isso ','no entanto ':'no_entanto ',
                'para tanto ':'para_tanto ','pelo contrario ':'pelo_contrario ','por sua vez ':'por_sua_vez ','posto que ':'posto_que '
               }

In [3]:
# Convert word from text into lexicons
def word2lexicon(text):
    for k, v in map_lexicons.items():
        text = str(text).replace(k,v)
    return text

In [4]:
# Lexical normalization
def lexical_normalization(text):
    for k, v in abbreviation.items():
        text = str(text).replace(k,v)        
    for k, v in internet_slang.items():
        text = str(text).replace(k,v)        
    for k, v in foreign_word.items():
        text = str(text).replace(k,v)
    return text

In [5]:
# function for processing sentences
def process_sentences(text):
    stop_words = stopwords.words('portuguese') # load stop words
    text = re.sub(PUNCTUATION, ' ', str(text)) # remove punctuation from text
    text = str(text).split() # split sentences by words
    text = [word for word in text if word not in stop_words] # Remove stopwords
    return text

### Carregando Comentários

In [6]:
host_ip = '192.168.1.6'
# init mongo client
client = MongoClient(host_ip, 27017)

#select db
db = client['news_2018']

# load data
oantagonistaComments = pd.DataFrame(list(db.get_collection('oantagonistaComments').find()))
ogloboComments = pd.DataFrame(list(db.get_collection('ogloboComments').find()))
vejaComments = pd.DataFrame(list(db.get_collection('vejaComments').find()))

# concat all comments
comments = pd.concat((oantagonistaComments, ogloboComments, vejaComments), sort=False, ignore_index=True)

In [7]:
# processing news text
comments['text'] = comments['text'].apply(lexical_normalization)
comments['text'] = comments['text'].apply(word2lexicon)  
comments['text'] = comments['text'].apply(process_sentences)

### Treinando Word2Vec

In [8]:
# Train word2vec model - settings: approach skip-gram, size embeddings vectors 300 
model = gensim.models.Word2Vec(comments['text'], workers=4, size=300, sg=1, window=5, min_count=5)
# Saving model
model.save('../embeddings/comments_w2v.bin')
# Saving embeddings
model.wv.save_word2vec_format("../embeddings/comments_vectors.bin")

2020-02-09 14:02:41,175 : INFO : collecting all words and their counts
2020-02-09 14:02:41,176 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-02-09 14:02:41,215 : INFO : PROGRESS: at sentence #10000, processed 110858 words, keeping 22669 word types
2020-02-09 14:02:41,252 : INFO : PROGRESS: at sentence #20000, processed 225540 words, keeping 34466 word types
2020-02-09 14:02:41,289 : INFO : PROGRESS: at sentence #30000, processed 342457 words, keeping 43434 word types
2020-02-09 14:02:41,325 : INFO : PROGRESS: at sentence #40000, processed 458816 words, keeping 51499 word types
2020-02-09 14:02:41,358 : INFO : PROGRESS: at sentence #50000, processed 569597 words, keeping 57916 word types
2020-02-09 14:02:41,396 : INFO : PROGRESS: at sentence #60000, processed 677502 words, keeping 63560 word types
2020-02-09 14:02:41,429 : INFO : PROGRESS: at sentence #70000, processed 784847 words, keeping 68568 word types
2020-02-09 14:02:41,463 : INFO : PROGRESS: at 

2020-02-09 14:02:43,920 : INFO : PROGRESS: at sentence #720000, processed 8316321 words, keeping 230942 word types
2020-02-09 14:02:43,957 : INFO : PROGRESS: at sentence #730000, processed 8436363 words, keeping 232500 word types
2020-02-09 14:02:43,994 : INFO : PROGRESS: at sentence #740000, processed 8559259 words, keeping 234115 word types
2020-02-09 14:02:44,032 : INFO : PROGRESS: at sentence #750000, processed 8677758 words, keeping 235635 word types
2020-02-09 14:02:44,065 : INFO : PROGRESS: at sentence #760000, processed 8799671 words, keeping 237255 word types
2020-02-09 14:02:44,108 : INFO : PROGRESS: at sentence #770000, processed 8922258 words, keeping 238840 word types
2020-02-09 14:02:44,145 : INFO : PROGRESS: at sentence #780000, processed 9043249 words, keeping 240461 word types
2020-02-09 14:02:44,183 : INFO : PROGRESS: at sentence #790000, processed 9171427 words, keeping 241882 word types
2020-02-09 14:02:44,218 : INFO : PROGRESS: at sentence #800000, processed 929472

2020-02-09 14:02:46,872 : INFO : PROGRESS: at sentence #1430000, processed 16863748 words, keeping 343525 word types
2020-02-09 14:02:46,912 : INFO : PROGRESS: at sentence #1440000, processed 16990495 words, keeping 345261 word types
2020-02-09 14:02:46,952 : INFO : PROGRESS: at sentence #1450000, processed 17117283 words, keeping 346962 word types
2020-02-09 14:02:46,994 : INFO : PROGRESS: at sentence #1460000, processed 17238306 words, keeping 348572 word types
2020-02-09 14:02:47,065 : INFO : PROGRESS: at sentence #1470000, processed 17359531 words, keeping 350184 word types
2020-02-09 14:02:47,111 : INFO : PROGRESS: at sentence #1480000, processed 17485224 words, keeping 351803 word types
2020-02-09 14:02:47,149 : INFO : PROGRESS: at sentence #1490000, processed 17604631 words, keeping 353320 word types
2020-02-09 14:02:47,190 : INFO : PROGRESS: at sentence #1500000, processed 17726457 words, keeping 354822 word types
2020-02-09 14:02:47,231 : INFO : PROGRESS: at sentence #1510000,

2020-02-09 14:02:49,836 : INFO : PROGRESS: at sentence #2140000, processed 25539552 words, keeping 441232 word types
2020-02-09 14:02:49,891 : INFO : PROGRESS: at sentence #2150000, processed 25668378 words, keeping 442659 word types
2020-02-09 14:02:49,936 : INFO : PROGRESS: at sentence #2160000, processed 25786962 words, keeping 443914 word types
2020-02-09 14:02:49,983 : INFO : PROGRESS: at sentence #2170000, processed 25910285 words, keeping 445262 word types
2020-02-09 14:02:50,024 : INFO : PROGRESS: at sentence #2180000, processed 26036120 words, keeping 446583 word types
2020-02-09 14:02:50,060 : INFO : PROGRESS: at sentence #2190000, processed 26159493 words, keeping 447756 word types
2020-02-09 14:02:50,110 : INFO : PROGRESS: at sentence #2200000, processed 26311093 words, keeping 449247 word types
2020-02-09 14:02:50,169 : INFO : PROGRESS: at sentence #2210000, processed 26517654 words, keeping 450899 word types
2020-02-09 14:02:50,237 : INFO : PROGRESS: at sentence #2220000,

2020-02-09 14:04:02,303 : INFO : EPOCH 1 - PROGRESS: at 47.69% examples, 264857 words/s, in_qsize 8, out_qsize 1
2020-02-09 14:04:03,305 : INFO : EPOCH 1 - PROGRESS: at 48.68% examples, 264887 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:04:04,390 : INFO : EPOCH 1 - PROGRESS: at 49.65% examples, 264448 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:04:05,446 : INFO : EPOCH 1 - PROGRESS: at 50.76% examples, 264189 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:04:06,448 : INFO : EPOCH 1 - PROGRESS: at 51.97% examples, 264761 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:04:07,457 : INFO : EPOCH 1 - PROGRESS: at 53.11% examples, 264730 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:04:08,466 : INFO : EPOCH 1 - PROGRESS: at 54.25% examples, 264713 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:04:09,493 : INFO : EPOCH 1 - PROGRESS: at 55.39% examples, 264953 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:04:10,543 : INFO : EPOCH 1 - PROGRESS: at 56.57% examples, 265078 words/s, in_qsiz

2020-02-09 14:05:13,938 : INFO : EPOCH 2 - PROGRESS: at 18.80% examples, 268771 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:05:15,012 : INFO : EPOCH 2 - PROGRESS: at 20.03% examples, 269668 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:05:16,024 : INFO : EPOCH 2 - PROGRESS: at 21.17% examples, 270312 words/s, in_qsize 6, out_qsize 1
2020-02-09 14:05:17,035 : INFO : EPOCH 2 - PROGRESS: at 22.21% examples, 269991 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:05:18,057 : INFO : EPOCH 2 - PROGRESS: at 23.23% examples, 269564 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:05:19,059 : INFO : EPOCH 2 - PROGRESS: at 24.23% examples, 269372 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:05:20,072 : INFO : EPOCH 2 - PROGRESS: at 25.31% examples, 269875 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:05:21,074 : INFO : EPOCH 2 - PROGRESS: at 26.33% examples, 269705 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:05:22,094 : INFO : EPOCH 2 - PROGRESS: at 27.27% examples, 269002 words/s, in_qsiz

2020-02-09 14:06:28,955 : INFO : EPOCH 2 - PROGRESS: at 93.38% examples, 265118 words/s, in_qsize 5, out_qsize 2
2020-02-09 14:06:29,995 : INFO : EPOCH 2 - PROGRESS: at 94.37% examples, 265014 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:06:31,022 : INFO : EPOCH 2 - PROGRESS: at 95.23% examples, 264956 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:06:32,054 : INFO : EPOCH 2 - PROGRESS: at 95.83% examples, 264993 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:06:33,087 : INFO : EPOCH 2 - PROGRESS: at 96.39% examples, 264929 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:06:34,178 : INFO : EPOCH 2 - PROGRESS: at 96.99% examples, 264809 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:06:35,221 : INFO : EPOCH 2 - PROGRESS: at 97.93% examples, 264802 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:06:36,227 : INFO : EPOCH 2 - PROGRESS: at 98.87% examples, 264893 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:06:37,299 : INFO : EPOCH 2 - PROGRESS: at 99.80% examples, 264721 words/s, in_qsiz

2020-02-09 14:07:40,485 : INFO : EPOCH 3 - PROGRESS: at 65.46% examples, 270614 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:07:41,531 : INFO : EPOCH 3 - PROGRESS: at 66.47% examples, 270348 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:07:42,540 : INFO : EPOCH 3 - PROGRESS: at 67.44% examples, 270245 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:07:43,546 : INFO : EPOCH 3 - PROGRESS: at 68.44% examples, 270300 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:07:44,609 : INFO : EPOCH 3 - PROGRESS: at 69.46% examples, 270131 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:07:45,628 : INFO : EPOCH 3 - PROGRESS: at 70.43% examples, 270005 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:07:46,768 : INFO : EPOCH 3 - PROGRESS: at 71.50% examples, 269688 words/s, in_qsize 6, out_qsize 1
2020-02-09 14:07:47,814 : INFO : EPOCH 3 - PROGRESS: at 72.39% examples, 268931 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:07:48,881 : INFO : EPOCH 3 - PROGRESS: at 73.47% examples, 268655 words/s, in_qsiz

2020-02-09 14:08:52,073 : INFO : EPOCH 4 - PROGRESS: at 40.49% examples, 291282 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:08:53,106 : INFO : EPOCH 4 - PROGRESS: at 41.58% examples, 291389 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:08:54,119 : INFO : EPOCH 4 - PROGRESS: at 42.69% examples, 291644 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:08:55,133 : INFO : EPOCH 4 - PROGRESS: at 43.71% examples, 291634 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:08:56,205 : INFO : EPOCH 4 - PROGRESS: at 44.72% examples, 291200 words/s, in_qsize 8, out_qsize 1
2020-02-09 14:08:57,215 : INFO : EPOCH 4 - PROGRESS: at 45.75% examples, 291468 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:08:58,235 : INFO : EPOCH 4 - PROGRESS: at 46.82% examples, 291656 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:08:59,305 : INFO : EPOCH 4 - PROGRESS: at 47.93% examples, 291500 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:09:00,336 : INFO : EPOCH 4 - PROGRESS: at 49.02% examples, 291380 words/s, in_qsiz

2020-02-09 14:10:03,692 : INFO : EPOCH 5 - PROGRESS: at 17.95% examples, 292900 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:10:04,696 : INFO : EPOCH 5 - PROGRESS: at 19.14% examples, 293000 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:10:05,712 : INFO : EPOCH 5 - PROGRESS: at 20.32% examples, 292319 words/s, in_qsize 8, out_qsize 1
2020-02-09 14:10:06,734 : INFO : EPOCH 5 - PROGRESS: at 21.43% examples, 291625 words/s, in_qsize 6, out_qsize 1
2020-02-09 14:10:07,763 : INFO : EPOCH 5 - PROGRESS: at 22.62% examples, 291861 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:10:08,784 : INFO : EPOCH 5 - PROGRESS: at 23.72% examples, 291717 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:10:09,792 : INFO : EPOCH 5 - PROGRESS: at 24.76% examples, 290843 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:10:10,829 : INFO : EPOCH 5 - PROGRESS: at 25.97% examples, 291363 words/s, in_qsize 8, out_qsize 0
2020-02-09 14:10:11,840 : INFO : EPOCH 5 - PROGRESS: at 27.06% examples, 291387 words/s, in_qsiz

2020-02-09 14:11:18,641 : INFO : EPOCH 5 - PROGRESS: at 97.93% examples, 289720 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:11:19,654 : INFO : EPOCH 5 - PROGRESS: at 98.87% examples, 289524 words/s, in_qsize 7, out_qsize 0
2020-02-09 14:11:20,676 : INFO : EPOCH 5 - PROGRESS: at 99.87% examples, 289413 words/s, in_qsize 4, out_qsize 1
2020-02-09 14:11:20,738 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-09 14:11:20,744 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-09 14:11:20,769 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-09 14:11:20,802 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-09 14:11:20,803 : INFO : EPOCH - 5 : training on 28109334 raw words (26792174 effective words) took 92.5s, 289496 effective words/s
2020-02-09 14:11:20,804 : INFO : training on a 140546670 raw words (133955672 effective words) took 485.9s, 275664 effective words/s
2020-02-09 14:11:20,806 

In [9]:
print(model.wv.most_similar(positive=[u'pontes'], negative=[u'presidente']))

2020-02-09 14:11:50,471 : INFO : precomputing L2-norms of word weight vectors


[('viadutos', 0.3577399253845215), ('esburacadas', 0.3394046425819397), ('calçadas', 0.33073341846466064), ('crateras', 0.3250013589859009), ('túneis', 0.3207547664642334), ('Caminhões', 0.3154582977294922), ('desesperdados', 0.31274741888046265), ('desesperdadas', 0.3082699179649353), ('estradas', 0.2998499274253845), ('hidrelétricas', 0.2980271577835083)]


In [10]:
from gensim.models import Word2Vec
new_model = Word2Vec.load('../embeddings/comments_w2v.bin')

2020-02-09 14:11:51,262 : INFO : loading Word2Vec object from ../embeddings/comments_w2v.bin
2020-02-09 14:11:51,457 : INFO : loading wv recursively from ../embeddings/comments_w2v.bin.wv.* with mmap=None
2020-02-09 14:11:51,458 : INFO : loading vectors from ../embeddings/comments_w2v.bin.wv.vectors.npy with mmap=None
2020-02-09 14:11:51,688 : INFO : setting ignored attribute vectors_norm to None
2020-02-09 14:11:51,689 : INFO : loading vocabulary recursively from ../embeddings/comments_w2v.bin.vocabulary.* with mmap=None
2020-02-09 14:11:51,690 : INFO : loading trainables recursively from ../embeddings/comments_w2v.bin.trainables.* with mmap=None
2020-02-09 14:11:51,691 : INFO : loading syn1neg from ../embeddings/comments_w2v.bin.trainables.syn1neg.npy with mmap=None
2020-02-09 14:11:51,920 : INFO : setting ignored attribute cum_table to None
2020-02-09 14:11:51,921 : INFO : loaded ../embeddings/comments_w2v.bin


In [11]:
print(new_model.wv.most_similar(positive=[u'guedes']))

2020-02-09 14:11:52,142 : INFO : precomputing L2-norms of word weight vectors


[('amaciada', 0.6610254049301147), ('paulo', 0.641289234161377), ('onyx', 0.6115303039550781), ('cpmf', 0.592642068862915), ('arminio', 0.584694504737854), ('lorenzoni', 0.5845917463302612), ('fv', 0.5824950933456421), ('fraga', 0.5822524428367615), ('henrique', 0.5743230581283569), ('paim', 0.5703856945037842)]
