# Práctico 5 (parte 4)

## Entrenar word embeddings

## Importación de módulos y librerías

In [1]:
# Inclusion de librerias y módulos
import os
import logging
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Stop-word list: NLTK's English stop words plus a few punctuation tokens of our own.
# Note: the assignment rebinds the name `stopwords` from the imported NLTK object to a plain
# list, so after this line `stopwords.words(...)` is no longer available under that name.
from nltk.corpus import stopwords
stopwords = stopwords.words('english') + [',', "’", '.', ':', '-', ';']

# Algunas utilidades
from utiles import print_some_info
from utiles import convert_emojis
from utiles import convert_emoticons

# Nos permite convertir str a list
from ast import literal_eval

# Import Word2Vec from the gensim library
from gensim.models import Word2Vec

# Configure the root logger so INFO-level status messages (e.g. training progress)
# are shown with a timestamp and level prefix.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

# Colour palette for plots; `pltcolors` keeps the colours in a fixed order.
BLUE, RED, ORANGE = '#5DADE2', '#ff7043', '#F5B041'
GREEN, YELLOW = '#58D68D', '#F4D03F'
pltcolors = [BLUE, RED, ORANGE, GREEN, YELLOW]

# Matplotlib defaults: bold axis labels and titles, shadowed legends.
plt.rcParams.update({
    "axes.labelweight":   "bold",
    "axes.titleweight":   "bold",
    "legend.shadow":      True,
    "figure.titleweight": "bold",
})

# Location and name of the pre-processed messages file.
data_dir = os.path.join('..', 'dataset')
filename = 'dev_yup_messages_preprocessed.csv'

# True  -> load the raw CSV, curate it and write the curated copy to disk.
# False -> skip curation and load the previously written curated copy.
SAVE_CURATED_DATASET = False

## Lectura del archivo de mensajes
Utilizamos únicamente el archivo de mensajes, dado que vamos a entrenar un word embedding como word2vec. Entendemos que, para el propósito del análisis y porque no estamos empleando ningún modelo de clasificación o regresión, podemos usar el conjunto de datos completo.

In [2]:
# The raw messages are only needed when the curated dataset has to be rebuilt;
# otherwise this step is skipped and `df` is left undefined.
if not SAVE_CURATED_DATASET:
    print('Curación evitada')
else:
    df = pd.read_csv(os.path.join(data_dir, filename))

    # Report which file was loaded, followed by a summary of the frame.
    print(f'El conjunto de datos utilizado es {filename}')
    print_some_info(df)

Curación evitada


## Curación del dataset

In [3]:
# Path of the curated copy of the dataset: same name as the raw CSV with a '_curated' suffix.
fn = os.path.join(data_dir, filename.replace('.csv','_curated.csv'))
if SAVE_CURATED_DATASET:
    # Set membership is a constant-time lookup; `stopwords` is a list and would be
    # scanned linearly for every token of every message.
    stopword_set = set(stopwords)

    #1-2. Keep only the rows sent by a student or a tutor (`sent_from`) and only the columns
    #     that may be useful (preliminary choice; `text` alone might be enough).
    #     `.copy()` makes `dfclean` an independent frame, so the column assignments below
    #     do not write into a slice of `df` (avoids pandas' SettingWithCopyWarning).
    dfclean = df.loc[
        df.sent_from.isin(['student', 'tutor']),
        ['session_id', 'sent_from', 'text'],
    ].copy()

    #3. Parse the `text` column from its string representation into a list of strings
    dfclean['text'] = dfclean.text.apply(literal_eval)

    #4. Replace emojis by tokens (behaviour defined by `utiles.convert_emojis`)
    dfclean['text'] = dfclean.text.apply(lambda x: [convert_emojis(w) for w in x])

    #5. Replacing emoticons by words is intentionally skipped for now: the regular use of
    #   parentheses, braces and brackets seems to interfere with emoticon handling.
    # TODO: revisit once emoticons can be handled reliably.
    # dfclean['text'] = dfclean.text.apply(lambda x: [convert_emoticons(w) for w in x])

    #6. Lower-case every token so that words are treated uniformly
    dfclean['text'] = dfclean.text.apply(lambda x: [w.lower() for w in x])

    #7. Remove stop words
    dfclean['text'] = dfclean.text.apply(lambda x: [w for w in x if w not in stopword_set])

    # Persist the curated frame so later runs can skip the curation steps
    dfclean.to_csv(fn, index=False)
else:
    #8. Load the curated file; it must have been written by a previous run
    #   with SAVE_CURATED_DATASET = True
    dfclean = pd.read_csv(fn)

    #9. The token lists were stored as text in the CSV; parse them back into lists of strings
    dfclean['text'] = dfclean.text.apply(literal_eval)

print_some_info(dfclean)

El conjunto de datos posee 210242 filas y 3 columnas
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210242 entries, 0 to 210241
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   session_id  210242 non-null  int64 
 1   sent_from   210242 non-null  object
 2   text        210242 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.8+ MB
None


In [24]:
# Word2Vec hyperparameters: vector dimensionality, context window, minimum word frequency
# and training algorithm flag (per the gensim docs, sg=0 is CBOW and sg=1 is skip-gram).
# NOTE(review): `size` looks like the pre-4.0 gensim keyword (later renamed `vector_size`) —
# confirm against the installed gensim version.
size, window, min_count, sg = 100, 5, 1, 0

# Encode the hyperparameters and a timestamp in the file name so each run gets its own file.
params = '-'.join(str(value) for value in (size, window, min_count, sg))
fnmodel = '{}_model_{}.bin'.format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"), params)

# Train on the curated token lists (one list of tokens per message); compute_loss=True
# asks gensim to keep track of the training loss so it can be queried afterwards.
model = Word2Vec(
    list(dfclean.text),
    size=size,
    window=window,
    min_count=min_count,
    sg=sg,
    compute_loss=True,
    workers=4,
)
model.save(fnmodel)

2020-10-06 20:42:15,641 : INFO : collecting all words and their counts
2020-10-06 20:42:15,644 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-10-06 20:42:15,689 : INFO : PROGRESS: at sentence #10000, processed 42888 words, keeping 3498 word types
2020-10-06 20:42:15,714 : INFO : PROGRESS: at sentence #20000, processed 84921 words, keeping 5278 word types
2020-10-06 20:42:15,743 : INFO : PROGRESS: at sentence #30000, processed 126737 words, keeping 6775 word types
2020-10-06 20:42:15,773 : INFO : PROGRESS: at sentence #40000, processed 167356 words, keeping 8082 word types
2020-10-06 20:42:15,808 : INFO : PROGRESS: at sentence #50000, processed 208696 words, keeping 9217 word types
2020-10-06 20:42:15,828 : INFO : PROGRESS: at sentence #60000, processed 247775 words, keeping 10307 word types
2020-10-06 20:42:15,847 : INFO : PROGRESS: at sentence #70000, processed 288680 words, keeping 11328 word types
2020-10-06 20:42:15,874 : INFO : PROGRESS: at sentenc

In [15]:
# Sanity check of the embeddings: list the words most similar to 'thanks' with their scores.
# NOTE(review): this cell's execution count (15) is lower than the training cell's (24), so the
# stored output may belong to an earlier model — re-run it after training to refresh the result.
model.wv.most_similar('thanks')

[('thank', 0.8510875701904297),
 ('appreciate', 0.6383005380630493),
 ('pleasure', 0.6031033992767334),
 ('glad', 0.5878739953041077),
 ('bye', 0.585172712802887),
 ('efforts', 0.5836899876594543),
 ('today', 0.5703210830688477),
 ('yay', 0.5505203008651733),
 ('course', 0.5434335470199585),
 ('night', 0.5270049571990967)]

In [25]:
# Query the training loss tracked by gensim (the model was built with compute_loss=True).
# NOTE(review): gensim appears to report this as a running total rather than a per-epoch or
# per-example value — confirm before comparing it across runs with different settings.
model.get_latest_training_loss()

3283660.5