# Practico 5 (parte 4)

## Entrenar word embeddings

## Importación de módulos y librerías

In [17]:
# Inclusion de librerias y módulos
import os
import logging
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Use the stop words defined in nltk plus a few punctuation tokens of our own.
# The corpus reader is imported under an alias so that the `stopwords` list
# built below does not shadow it (the reader stays reachable afterwards).
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('english') + [',', "’", '.', ':', '-', ';']

# Algunas utilidades
from utiles import print_some_info

# Nos permite convertir str a list
from ast import literal_eval

# Importamos word2vec de la librería gensim
from gensim.models import Word2Vec

# Configure logging so that INFO-level status messages are shown
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Colour palette
BLUE, RED, ORANGE = '#5DADE2', '#ff7043', '#F5B041'
GREEN, YELLOW = '#58D68D', '#F4D03F'
pltcolors = [BLUE, RED, ORANGE, GREEN, YELLOW]

# Plot defaults: bold axis labels and titles, shadowed legends
plt.rcParams["axes.labelweight"] = "bold"
plt.rcParams["axes.titleweight"] = "bold"
plt.rcParams["figure.titleweight"] = "bold"
plt.rcParams["legend.shadow"] = True

# Directory holding the dataset files, relative to the working directory
data_dir = os.path.join('..', 'dataset')

# NOTE(review): not used in the visible cells; presumably toggles writing the
# curated dataset to disk -- confirm against the rest of the notebook.
SAVE_CURATED_DATASET = True

## Lectura del archivo de mensajes
Utilizamos únicamente el archivo de mensajes, dado que vamos a entrenar un word embedding como word2vec. Entendemos que, para el propósito del análisis y porque no estamos empleando ningún modelo de clasificación o regresión, podemos usar el conjunto de datos completo.

In [2]:
# Read the preprocessed messages file from the dataset directory
filename = 'dev_yup_messages_preprocessed.csv'
messages_path = os.path.join(data_dir, filename)
df = pd.read_csv(messages_path)

# Report which file was loaded, then summarise the resulting frame
print(f'El conjunto de datos utilizado es {filename}')
print_some_info(df)


El conjunto de datos utilizado es dev_yup_messages_preprocessed.csv
El conjunto de datos posee 234375 filas y 6 columnas
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234375 entries, 0 to 234374
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   session_id    234375 non-null  int64 
 1   created_at    234375 non-null  object
 2   sent_from     234375 non-null  object
 3   sent_to       234375 non-null  object
 4   content_type  234375 non-null  object
 5   text          234375 non-null  object
dtypes: int64(1), object(5)
memory usage: 10.7+ MB
None


In [3]:
import re
import emot
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

# Convert emojis to words
def convert_emojis(text, emoji_map=None):
    """Replace the emojis found in ``text`` with their textual names.

    Every character of ``text`` that lies outside the Basic Multilingual
    Plane (i.e. one that UTF-8 encodes with four bytes) is looked up in
    ``emoji_map``. If at least one lookup succeeds, the names found are
    returned in order of appearance, joined by single spaces; everything else
    in ``text`` is discarded. If nothing is found, ``text`` is returned
    unchanged.

    Parameters
    ----------
    text : str
        Token (or longer string) to inspect.
    emoji_map : mapping of str to str, optional
        Maps a single emoji character to its replacement. Defaults to
        ``UNICODE_EMO`` from ``emot.emo_unicode``.

    Returns
    -------
    str
        The space-joined emoji names, or ``text`` itself when no known emoji
        is present (or when the joined names are empty).
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMO

    # Inspect code points directly instead of pattern-matching the escaped
    # byte representation: detection then does not depend on which other
    # non-ASCII characters happen to sit next to the emoji.
    names = [
        emoji_map[char]
        for char in text
        if ord(char) > 0xFFFF and char in emoji_map
    ]
    joined = ' '.join(names)
    return joined if joined else text

# Lookup table from each emoticon in EMOTICONS to a ':label:' token. The label
# is the text before the first comma of the lower-cased description, with
# 'or ' removed and spaces replaced by underscores.
OUREMOTIC = {
    emoticon: f":{meaning.lower().split(',')[0].replace('or ','').replace(' ','_')}:"
    for emoticon, meaning in EMOTICONS.items()
}
def convert_emoticons(text):
    """Replace ``text`` with a ':meaning:' token when it holds an emoticon.

    ``emot.emoticons`` is run on ``text``. When the result's ``'flag'`` entry
    is truthy, the first entry of its ``'mean'`` list is lower-cased, its
    spaces are replaced by underscores and it is wrapped in colons. Otherwise,
    or if anything raises along the way, ``text`` is returned unchanged
    (best-effort conversion).
    """
    try:
        detected = emot.emoticons(text)
        if not detected['flag']:
            return text
        meaning = detected['mean'][0]
        return ':' + meaning.replace(' ', '_').lower() + ':'
    except Exception:
        return text

In [4]:
# 1-2. Keep only the columns that may be useful (preliminary: `text` alone
#      could be enough) and only the rows sent by a student or a tutor.
#      The explicit .copy() makes dfclean an independent frame, so the column
#      assignments below do not operate on a slice of `df`
#      (avoids SettingWithCopyWarning).
dfclean = df.loc[
    df.sent_from.isin(['student', 'tutor']),
    ['session_id', 'sent_from', 'text'],
].copy()

# 3. Parse the string stored in `text` into a list of strings
dfclean['text'] = dfclean.text.apply(literal_eval)

# 4. Replace emojis by tokens
dfclean['text'] = dfclean.text.apply(lambda tokens: [convert_emojis(w) for w in tokens])

# 5. Replace emoticons by words
#    Disabled for now because it needs a better treatment: the regular use of
#    parentheses, braces and brackets seems to interfere with emoticon handling.
# dfclean['text'] = dfclean.text.apply(lambda x: [convert_emoticons(w) for w in x])

# 6. Lower-case every token so they are treated uniformly
dfclean['text'] = dfclean.text.apply(lambda tokens: [w.lower() for w in tokens])

# 7. Remove the stop words (a set gives constant-time membership tests)
stopword_set = set(stopwords)
dfclean['text'] = dfclean.text.apply(lambda tokens: [w for w in tokens if w not in stopword_set])

print_some_info(dfclean)

El conjunto de datos posee 210242 filas y 3 columnas
<class 'pandas.core.frame.DataFrame'>
Int64Index: 210242 entries, 0 to 234374
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   session_id  210242 non-null  int64 
 1   sent_from   210242 non-null  object
 2   text        210242 non-null  object
dtypes: int64(1), object(2)
memory usage: 6.4+ MB
None


In [12]:
# Train word2vec on the cleaned token lists (one list of tokens per message).
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` -- confirm the installed version before upgrading.
model = Word2Vec(
    sentences=dfclean.text.tolist(),
    size=100,      # dimensionality of the word vectors
    window=5,
    min_count=1,
    workers=4,
)

2020-10-06 19:30:34,939 : INFO : collecting all words and their counts
2020-10-06 19:30:34,940 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-10-06 19:30:34,959 : INFO : PROGRESS: at sentence #10000, processed 42888 words, keeping 3498 word types
2020-10-06 19:30:34,973 : INFO : PROGRESS: at sentence #20000, processed 84921 words, keeping 5278 word types
2020-10-06 19:30:34,988 : INFO : PROGRESS: at sentence #30000, processed 126737 words, keeping 6775 word types
2020-10-06 19:30:35,004 : INFO : PROGRESS: at sentence #40000, processed 167356 words, keeping 8082 word types
2020-10-06 19:30:35,021 : INFO : PROGRESS: at sentence #50000, processed 208696 words, keeping 9217 word types
2020-10-06 19:30:35,037 : INFO : PROGRESS: at sentence #60000, processed 247775 words, keeping 10307 word types
2020-10-06 19:30:35,053 : INFO : PROGRESS: at sentence #70000, processed 288680 words, keeping 11328 word types
2020-10-06 19:30:35,067 : INFO : PROGRESS: at sentenc

In [16]:
# Query the trained vectors through `model.wv`: calling most_similar directly
# on the Word2Vec model is deprecated in gensim 3.x and removed in gensim 4.0.
model.wv.most_similar('thanks')

2020-10-06 19:37:45,025 : INFO : precomputing L2-norms of word weight vectors


[('thank', 0.8911322355270386),
 ('thx', 0.7975543737411499),
 ('appreciate', 0.7357184886932373),
 ('glad', 0.7197262048721313),
 ('thks', 0.7142689228057861),
 ('thankyou', 0.7096688747406006),
 ('bye', 0.6871362924575806),
 ('love', 0.6796211004257202),
 ('accuracy', 0.679126501083374),
 ('hear', 0.6663104295730591)]