# Learning embeddings (Word2Vec/Doc2Vec)

In [1]:
import random
import numpy as np
import re
import time
import pickle
import unidecode

from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/maiapolo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Preparing: opening auxiliary files and defining functions

Function that cleans texts:

In [2]:
# Ordered literal substitutions applied by clean(). Order is significant:
# several rules rewrite text that a later rule then matches
# (e.g. "fl. " -> "fls. " -> "fls_").
_CLEAN_TOKEN_RULES = (
    # Law citations: the specific "lei ... nº " forms must run BEFORE the
    # generic "lei " rule, otherwise "lei " is rewritten to "lei_" first and
    # the specific forms can never match.
    ("lei nº ", "lei_"),
    ("lei n.º", "lei_"),
    ("lei estadual nº ", "lei_"),
    ("lei federal nº ", "lei_"),
    ("lei municipal nº ", "lei_"),
    ("lei ", "lei_"),
    # Page / sheet references.
    ("fl. ", "fls. "),
    ("fls. ", "fls_"),
    ("p. ", "pp. "),
    ("pp. ", "pp_"),
    # Articles, clauses and ordinal markers.
    ("art. ", "art_"),
    ("artigo ", "art_"),
    ("inciso ", "inciso_"),
    ("nº ", "nº_"),
    ("n° ", "nº_"),
    ("º ", "º"),
    ("ª ", "ª"),
    # Bar registration and currency prefixes.
    ("oab ", "oab_"),
    ("r$ ", "r$_"),
    # Line breaks become spaces here (before the "dr" rules), so that an
    # abbreviation ending a line can still match its "<abbr> " pattern.
    ("\n", " "),
    ("dr ", "dr_"),
    ("dr. ", "dr_"),
    ("dra ", "dr_"),
    ("dra. ", "dr_"),
    ("adv: ", "adv_"),
)

# Single characters blanked out before / after the isolated " - " and " _ "
# separators are dropped (the relative order of these three steps is kept).
_CLEAN_PUNCT_BEFORE_DASH = "/|+.,:;!?>=§"
_CLEAN_PUNCT_AFTER_DASH = "&*()ªº%[]{}'\"“”"


def clean(resulta):
    """Normalize a raw text into a lowercase, space-separated string.

    Steps, in order:
      1. lowercase and prepend a space;
      2. replace every digit with a space;
      3. glue common legal abbreviations to the following word with "_"
         (see ``_CLEAN_TOKEN_RULES``) and turn newlines into spaces;
      4. replace punctuation with spaces and drop isolated "-" / "_";
      5. collapse runs of spaces and append one space.

    Parameters
    ----------
    resulta : str
        Text to clean. It is not modified (strings are immutable).

    Returns
    -------
    str
        The cleaned text. It always starts with a space and always ends with
        at least one space (two when the text already ended in a space after
        collapsing).
    """
    # Leading space so that word-start patterns also match at the beginning.
    result = " " + resulta.lower()

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    result = re.sub(r'\d', ' ', result)

    for old, new in _CLEAN_TOKEN_RULES:
        result = result.replace(old, new)

    for char in _CLEAN_PUNCT_BEFORE_DASH:
        result = result.replace(char, " ")

    # Drop dashes / underscores that stand alone between spaces.
    result = result.replace(" - ", " ")
    result = result.replace(" _ ", " ")

    for char in _CLEAN_PUNCT_AFTER_DASH:
        result = result.replace(char, " ")

    result = re.sub(' +', ' ', result)

    # Trailing space appended after collapsing, as the original did.
    return result + " "

Tokenizer:

In [3]:
# Text tokenizer
stop_words = set(stopwords.words('portuguese'))

def tokenize(txt):
    """Split a cleaned text on single spaces and drop Portuguese stop words.

    Empty strings are discarded as well: splitting on ' ' yields an empty
    token for every leading, trailing or doubled space, and texts produced by
    `clean` always start and end with a space.

    Parameters
    ----------
    txt : str
        Text whose tokens are separated by single space characters.

    Returns
    -------
    list of str
        Non-empty tokens, in order, that are not in `stop_words`.
    """
    return [token for token in txt.split(' ') if token and token not in stop_words]

Opening unlabeled motions dataset:

In [4]:
# Load the pickled corpus of unlabeled motions into `mov`.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# run this against a file from a trusted source.
with open("bases/mov_treino.txt", "rb") as fp:   # Unpickling
    mov = pickle.load(fp)

Cleaning texts and storing them in a list:

In [5]:
start_time = time.time()

# Flatten `mov` into one list of cleaned texts. Element [1] of every inner
# record is transliterated to ASCII with unidecode and then passed to clean().
# NOTE(review): presumably mov[i] is one case and mov[i][j][1] the text of
# its j-th motion; confirm against the code that builds the pickle.
texts_mov = []
for i in range(len(mov)):
    texts_mov.extend(
        clean(unidecode.unidecode(mov[i][j][1])) for j in range(len(mov[i]))
    )

print(round((time.time() - start_time)/60,2),"minutos")

3.36 minutos


In [6]:
# Sanity check: total number of cleaned texts collected above.
len(texts_mov)

2904255

Tokenizing texts:

In [7]:
start_time = time.time()
# One token list per cleaned text. Iterate over texts_mov directly: slicing
# it with [:] would first build a full copy of the list for no benefit.
sentence_stream = [tokenize(doc) for doc in texts_mov]
print(round((time.time() - start_time)/60,2),"minutos")

0.33 minutos


Learning which combinations of words should be treated as single tokens:

In [8]:
start_time = time.time()
# First pass: learn phrase statistics over the tokenized texts, using the
# library defaults (the `threshold=1` override is left commented out).
bigrams=Phrases(sentence_stream)#, threshold=1)
# Second pass: learn phrases over the stream already transformed by
# `bigrams`, so that merged pairs can themselves be merged again.
bibigrams=Phrases(bigrams[sentence_stream])# , threshold=1)
print(round((time.time() - start_time)/60,2),"minutos")

6.6 minutos


And then, transforming the texts...

In [9]:
start_time = time.time()
# Overwrite every token list, in place, with its phrase-merged version
# (bigrams first, then bibigrams on top of that result).
# Note: this cell is not idempotent; running it again re-applies both models
# to tokens that were already merged.
for i, tokens in enumerate(sentence_stream):
    sentence_stream[i] = bibigrams[bigrams[tokens]]
print(round((time.time() - start_time)/60,2),"minutos")

6.42 minutos


Saving the phrase models:

In [10]:
# Persist both phrase models under modelos/ (presumably so the same token
# merging can be re-applied to other texts later).
bigrams.save('modelos/bigrams_mov')
bibigrams.save('modelos/bibigrams_mov')

Learning Word2Vec/Doc2Vec representations:

In [11]:
# Hyperparameter grid: one Doc2Vec model is trained for every
# (vector_size, window) combination of the two lists below.
sizes=[100, 150, 200] 
windows=[5, 10]

In [12]:
# Wrap each token list in a TaggedDocument whose single tag is the text's
# position in sentence_stream.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentence_stream)]

Running the model:

In [13]:
start_time = time.time()

# Train and save one Doc2Vec model per (vector_size, window) pair; the file
# name encodes both values. `model` keeps the last trained model afterwards.
# NOTE(review): seed=1 alone may not make runs reproducible when training
# uses several worker threads; check the gensim Doc2Vec documentation
# (workers=1, PYTHONHASHSEED) if exact reproducibility is required.
for s, w in ((size, win) for size in sizes for win in windows):
    model = Doc2Vec(documents, vector_size=s, window=w, seed=1)
    model.save('modelos/doc2vec_mov_{}_{}_V5'.format(s, w))

print(round((time.time() - start_time)/60,2),"minutos")

528.65 minutos
