# Word Embeddings for News Comments 2018

The data are comments on news articles, collected in 2018 from the following Brazilian newspapers:

* `O Antagonista`, 
* `O Globo`, 
* `Veja`.

A detailed analysis of the data is available [here](https://pages.github.com/). The objective of this notebook is to use the word2vec model to generate embeddings from the text of these comments. The model uses the skip-gram architecture, and each word is represented by a 300-dimensional vector.

In [None]:
import re
import nltk
import gensim, logging
import pandas as pd
from nltk.corpus import stopwords
from pymongo import MongoClient
# Fetch the NLTK stop-word corpus; process_sentences reads its Portuguese list from it.
nltk.download('stopwords')
# Emit INFO-level log records with a timestamp and level prefix.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# Character whitelist used to strip punctuation: every character NOT listed
# here is replaced by a space in process_sentences.
# Kept: ASCII letters, digits, '%', the accented Portuguese letters (including
# grave à/À) and cedilla. '_' is kept as well, so the multi-word tokens that
# word2lexicon builds (e.g. 'pelo_menos') are not split apart again.
PUNCTUATION = u'[^a-zA-Z0-9_áéíóúÁÉÍÓÚàÀâêîôÂÊÎÔãõÃÕçÇ%]'

### Defining lexicons and loading dicts

In [None]:
# Each lexicon file is read line by line and split on whitespace: the first
# token is the term to look up, the remaining tokens are its replacement.
# Rows with fewer than two tokens (blank or malformed lines) are skipped for
# every lexicon, so a stray empty line cannot raise IndexError or map a term
# to an empty replacement.
# NOTE(review): the files are opened with the platform default encoding --
# confirm their encoding and pass encoding= explicitly if it is known.

# loading abbreviation dic
with open('../dics/AB_dict') as f:
    abbreviation = [line.split() for line in f]
# process dic: abbreviation -> expansion (only the first replacement token is used)
abbreviation = {row[0]: row[1] for row in abbreviation if len(row) > 1}

# loading internet_slang dic
with open('../dics/IN_dict') as f:
    internet_slang = [line.split() for line in f]
# process dic: slang term -> replacement phrase
internet_slang = {row[0]: ' '.join(row[1:]) for row in internet_slang if len(row) > 1}

# loading foreign_word dic
with open('../dics/ES_dict') as f:
    foreign_word = [line.split() for line in f]
# process dic: foreign word -> replacement phrase
foreign_word = {row[0]: ' '.join(row[1:]) for row in foreign_word if len(row) > 1}

# Mapping words in lexicons
# Multi-word expressions -> single underscore-joined tokens, so that each
# expression can be learned as one vocabulary entry.
# Keys are applied in insertion order by word2lexicon, so longer expressions
# that contain a shorter one (e.g. 'nao mais que ' vs 'mais que ') must stay
# listed first.
# Every key that ends with a space maps to a value that also ends with a
# space; otherwise the replacement would glue the token to the next word.
map_lexicons = {'a ponto':'a_ponto','ao menos ':'ao_menos ','ate mesmo ':'ate_mesmo ',
                'nao mais que ':'nao_mais_que ','nem mesmo ':'nem_mesmo ','no minimo ':'no_minimo ',
                'o unico ':'o_unico ','a unica ':'a_unica ','pelo menos ':'pelo_menos ',
                'quando menos ':'quando_menos ','quando muito ':'quando_muito ','a par disso ':'a_par_disso ',
                'e nao ':'e_nao ','em suma ':'em_suma ','mas tambem ': 'mas_tambem ','muito menos ':'muito_menos ',
                'nao so ':'nao_so ','ou mesmo ':'ou_mesmo ','por sinal ':'por_sinal ','com isso ':'com_isso ',
                'como consequencia ':'como_consequencia ','de modo que ':'de_modo_que ','deste modo ':'deste_modo ',
                'em decorrencia ':'em_decorrencia ','nesse sentido ':'nesse_sentido ','por causa ':'por_causa ',
                'por conseguinte ':'por_conseguinte ','por essa razao ':'por_essa_razao ','por isso ':'por_isso ',
                'sendo assim ':'sendo_assim ','ou entao ':'ou_entao ','como se ':'como_se ',
                'de um lado ':'de_um_lado ','por outro lado ':'por_outro_lado ','mais que ':'mais_que ',
                'menos que ':'menos_que ','desde que ':'desde_que ','do contrario ':'do_contrario ',
                # 'em vez ' previously mapped to 'em_vez' (no trailing space),
                # which turned "em vez de" into "em_vezde".
                'em lugar ':'em_lugar ','em vez ':'em_vez ','no caso ':'no_caso ','se acaso ':'se_acaso ',
                'de certa forma ':'de_certa_forma ','desse modo ':'desse_modo ','em funcao ':'em_funcao ',
                'isso e ':'isso_e ','ja que ':'ja_que ','na medida que ':'na_medida_que ','nessa direcao ':'nessa_direcao ',
                'no intuito ':'no_intuito ','no mesmo sentido ':'no_mesmo_sentido ','ou seja ':'ou_seja ',
                'uma vez que ':'uma_vez_que ','tanto que ':'tanto_que ','visto que ':'visto_que ','ainda que ':'ainda_que ',
                'ao contrario ':'ao_contrario ','apesar de ':'apesar_de ','fora isso ':'fora_isso ','mesmo que ':'mesmo_que ',
                'nao obstante ':'nao_obstante ','nao fosse isso ':'nao_fosse_isso ','no entanto ':'no_entanto ',
                'para tanto ':'para_tanto ','pelo contrario ':'pelo_contrario ','por sua vez ':'por_sua_vez ','posto que ':'posto_que '
               }

### Defining Functions

In [None]:
# Convert word from text into lexicons
def word2lexicon(text, lexicons=None):
    """Join known multi-word expressions in ``text`` into single tokens.

    Each key of ``lexicons`` is replaced by its value, in the mapping's
    iteration order. A key only matches when it does not start in the middle
    of a word, so 'e nao ' no longer rewrites "que nao " as "que_nao ".

    Args:
        text: value to process; it is converted with ``str()`` first.
        lexicons: mapping of expression -> replacement token. Defaults to the
            module-level ``map_lexicons``.

    Returns:
        The processed text as a ``str``.
    """
    if lexicons is None:
        lexicons = map_lexicons
    text = str(text)
    for phrase, token in lexicons.items():
        # Cheap substring test first: most expressions are absent from any
        # given comment, so the regex only runs when it can possibly match.
        if phrase not in text:
            continue
        # (?<!\w) rejects matches that begin inside a longer word.
        pattern = r'(?<!\w)' + re.escape(phrase)
        # A callable replacement inserts the token literally (no backslash
        # escapes are interpreted in it).
        text = re.sub(pattern, lambda _match, token=token: token, text)
    return text

# Lexical normalization

# Compiled whole-word patterns, keyed by the tuple of terms they match, so
# the pattern for a lexicon is built once rather than for every comment.
_LEXICON_PATTERNS = {}


def _whole_word_pattern(terms):
    """Return a compiled regex that matches any of ``terms`` as a whole word."""
    pattern = _LEXICON_PATTERNS.get(terms)
    if pattern is None:
        # Longest terms first, so a longer entry wins over one that is its prefix.
        ordered = sorted(terms, key=len, reverse=True)
        alternatives = '|'.join(re.escape(term) for term in ordered)
        # The lookarounds reject matches that start or end inside a longer word.
        pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
        _LEXICON_PATTERNS[terms] = pattern
    return pattern


def lexical_normalization(text, lexicons=None):
    """Replace abbreviations, internet slang and foreign words in ``text``.

    The lexicons are applied one after another, so a replacement produced by
    an earlier lexicon can itself be normalised by a later one. Within one
    lexicon all terms are replaced in a single pass. Terms only match as
    whole words: a short entry such as 'q' no longer rewrites the 'q' inside
    "quadro", which plain ``str.replace`` did.

    Args:
        text: value to normalise; it is converted with ``str()`` first.
        lexicons: iterable of term -> replacement mappings. Defaults to the
            module-level ``abbreviation``, ``internet_slang`` and
            ``foreign_word`` dictionaries, in that order.

    Returns:
        The normalised text as a ``str``.
    """
    if lexicons is None:
        lexicons = (abbreviation, internet_slang, foreign_word)
    text = str(text)
    for lexicon in lexicons:
        if not lexicon:
            # An empty alternation would match everywhere; nothing to replace.
            continue
        pattern = _whole_word_pattern(tuple(lexicon))
        # A callable replacement inserts the value literally.
        text = pattern.sub(lambda match, lexicon=lexicon: lexicon[match.group(0)], text)
    return text

# function for processing sentences

# Portuguese stop words, loaded from NLTK on the first call and reused after.
_STOP_WORDS_CACHE = None


def process_sentences(text, stop_words=None):
    """Tokenise ``text`` and drop stop words.

    Characters matched by ``PUNCTUATION`` are replaced by spaces, the result
    is split on whitespace, and tokens found in ``stop_words`` are removed.

    Args:
        text: value to tokenise; it is converted with ``str()`` first.
        stop_words: collection of tokens to drop. Defaults to NLTK's
            Portuguese stop-word list, which is loaded once and cached as a
            frozenset instead of being re-read and scanned as a list on
            every call.

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    global _STOP_WORDS_CACHE
    if stop_words is None:
        if _STOP_WORDS_CACHE is None:
            _STOP_WORDS_CACHE = frozenset(stopwords.words('portuguese'))
        stop_words = _STOP_WORDS_CACHE
    # remove punctuation from text
    text = re.sub(PUNCTUATION, ' ', str(text))
    # NOTE(review): the membership test is case-sensitive and the text is not
    # lower-cased here -- confirm capitalised stop words are meant to be kept.
    return [word for word in text.split() if word not in stop_words]

### Loading Comments

In [None]:
# Address of the MongoDB server that stores the collected comments
HOST_IP = '192.168.1.7'
client = MongoClient(HOST_IP, 27017)

# database with one comments collection per newspaper
db = client['news_2018']

# read every document of each collection into its own DataFrame
oantagonistaComments, ogloboComments, vejaComments = (
    pd.DataFrame(list(db.get_collection(collection_name).find()))
    for collection_name in ('oantagonistaComments', 'ogloboComments', 'vejaComments')
)

# stack the three sources into one frame with a fresh 0..n-1 index
comments = pd.concat(
    (oantagonistaComments, ogloboComments, vejaComments),
    ignore_index=True,
    sort=False,
)

### Processing Comments Text

In [None]:
# Run the text-cleaning stages on every comment, in this order:
#   1. lexical_normalization - replace abbreviations, slang and foreign words
#   2. word2lexicon          - join multi-word expressions into single tokens
#   3. process_sentences     - strip punctuation, tokenise, drop stop words
# After this, comments['text'] holds a list of tokens per comment.
comments['text'] = (
    comments['text']
    .apply(lexical_normalization)
    .apply(word2lexicon)
    .apply(process_sentences)
)

### Training Word2Vec

In [None]:
# Fit a skip-gram (sg=1) word2vec model on the tokenised comments:
# 300-dimensional vectors, window of 5, min_count of 5, 4 worker threads.
model = gensim.models.Word2Vec(
    comments['text'],
    sg=1,
    size=300,
    window=5,
    min_count=5,
    workers=4,
)
# Persist the complete model so it can be reloaded with Word2Vec.load
model.save('../embeddings/comments_w2v.bin')
# Export the word vectors alone in word2vec format.
# NOTE(review): `binary` is not passed, so gensim's default applies --
# confirm whether the '.bin' file is meant to be written in binary format.
model.wv.save_word2vec_format("../embeddings/comments_vectors.bin")

#### Example

In [None]:
# Words closest to "pontes" and farthest from "presidente" in the trained model
print(model.wv.most_similar(negative=[u'presidente'], positive=[u'pontes']))

# Reload the saved model from disk and repeat the same query on it
from gensim.models import Word2Vec
new_model = Word2Vec.load('../embeddings/comments_w2v.bin')
print(new_model.wv.most_similar(negative=[u'presidente'], positive=[u'pontes']))