# Word Embeddings para clasificación binaria

In [1]:
import spacy
import sklearn
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import KeyedVectors
# Inspiración:
'''https://maxhalford.github.io/blog/unsupervised-text-classification/'''

2021-11-19 14:38:52.665492: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-19 14:38:52.665573: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


'https://maxhalford.github.io/blog/unsupervised-text-classification/'

In [2]:
# Fetch NLTK's stopword corpus; the Spanish list is turned into a set further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/echao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Cargar la Data: reseñas de Amazon en español

In [3]:
# Load the JSON-lines reviews file. The test split is used only because of
# its size, not because it plays a held-out role here.
df = pd.read_json('../data/dataset_es_test.json', lines=True, orient='records')

In [4]:
# One text field per review: title, a single space, then the body.
df['review'] = df['review_title'] + ' ' + df['review_body']

In [5]:
df = df[df['stars'] != 3]  # drop neutral (3-star) reviews

#### Cargar Word Embeddings (Ahora usando "Spanish Billion Word Corpus")

In [6]:
# Load the binary word2vec-format vectors; `limit` caps how many entries are
# read from the file.
we = KeyedVectors.load_word2vec_format('../data/sbw_vectors.bin', limit=200000, binary=True)

In [7]:
## Sanity checks on `we`

we.doesnt_match(['perro', 'gato', 'conejo', 'humano'])

'humano'

In [129]:
we.similarity("asco", "malo")   # very dissimilar?

0.4132543

In [9]:
# Similarity of "maravilla" to a positive word versus a negative one.
print(we.similarity("maravilla", "feliz"))
print(we.similarity("maravilla", "malo"))

0.50715035
0.4186036


In [128]:
# Note the capitalized "Bueno": clean_text lowercases review text before any
# lookup, so this spelling is not what the classifier would query.
print(we.similarity("Bueno", "bien"))

0.3905842


In [10]:
print(we.similarity("como", "alimento"))  # these embeddings do not seem to capture meaning well

0.17277518


#### Functions to clean and tokenize documents

In [31]:
# Spanish stopwords from NLTK, stored as a set for the membership tests done
# by clean_tokenize and embedWE.
Stopwords = set(stopwords.words('spanish'))

In [32]:
def clean_text(text):
    """Normalize a raw string for word-vector lookup.

    The text is lowercased (word vectors work better in lowercase), ASCII
    punctuation plus the Spanish opening marks are deleted, and every run of
    whitespace (newlines included) collapses to a single space with no
    leading or trailing blanks.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    unwanted = string.punctuation + '¿¡'
    stripped = text.lower().translate(str.maketrans('', '', unwanted))
    # split() with no argument breaks on any whitespace run, newlines too,
    # so re-joining with a single space does all the whitespace cleanup.
    return ' '.join(stripped.split())

In [33]:
clean_text("¡Hola! ¿¿Cómo estás?!")

'hola cómo estás'

In [34]:
def clean_tokenize(text):
    """Split cleaned text into the tokens worth embedding.

    The text is normalized with ``clean_text`` and split on whitespace;
    stopwords and tokens shorter than two characters are dropped.

    Args:
        text: the raw string to tokenize.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    kept = []
    for word in clean_text(text).split():
        if len(word) > 1 and word not in Stopwords:
            kept.append(word)
    return kept

In [17]:
clean_tokenize("¡¡hola! nunca pensé   en eso, ¿me podrías indicar cómo?  ")

['hola', 'nunca', 'pensé', 'podrías', 'indicar', 'cómo']

#### Function to compute a document's word embedding centroid

In [35]:
def embedWE(tokens, we):
    """Return the centroid (mean vector) of the embeddings of ``tokens``.

    Applies the same filter as ``clean_tokenize`` (no stopwords, at least two
    characters) and additionally requires the token to be in the vocabulary
    of ``we``.

    Args:
        tokens: iterable of token strings.
        we: embedding lookup supporting ``token in we`` and ``we[token]``,
            e.g. a gensim ``KeyedVectors``.

    Returns:
        A 1-D array: the mean of the kept vectors, or a zero vector when no
        token qualifies. The zero vector's width comes from
        ``we.vector_size`` so it matches the embedding actually in use; it
        falls back to 300 when ``we`` does not expose that attribute.
    """
    vectors = np.asarray([
        we[token]
        for token in tokens
        if token in we
        and len(token) > 1
        and token not in Stopwords
    ])

    if len(vectors) > 0:
        # Average over the token axis, leaving one value per dimension.
        return vectors.mean(axis=0)

    # No usable token: return a neutral vector of the right width rather
    # than a hard-coded 300, which breaks with other embedding sizes.
    width = getattr(we, 'vector_size', 300)
    return np.zeros(width)

#### Obtain Labels and their Centroids

In [36]:
def get_label_centroids(labels, we):
    """Embed each class name with the word-embedding model.

    A class name may contain several whitespace-separated words; each name is
    split and passed to ``embedWE``, which returns one centroid per name.

    Args:
        labels: iterable of class-name strings.
        we: embedding lookup forwarded to ``embedWE``.

    Returns:
        An array holding one centroid per class name, in input order.
    """
    centroids = []
    for label in labels:
        words = label.split()
        centroids.append(embedWE(words, we))
    return np.asarray(centroids)

### Selección de Clusters (labels)

In [120]:
# Candidate label sets tried so far; the choice is very influential on the result.
#label_names = ['feliz', 'decepcionado']
#label_names = ['feliz bueno maravilla', 'decepcionado enojado malo']
#label_names = ['bien', 'mal']
label_names = ['excelente', 'decepcionante']   # very influential on the result


# Ground-truth labels, for measurement only: more than 3 stars maps to the
# first label name, everything else to the second.
df['sentiment'] = np.where(df['stars'] > 3, label_names[0], label_names[1])
df.head(2)

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category,review,sentiment,prediction,spacy_pred
0,es_0038754,product_es_0113523,reviewer_es_0580071,1,no me llego el articulo me lo mando por correo...,no me llego,es,wireless,no me llego no me llego el articulo me lo mand...,decepcionante,feliz bueno maravilla,decepcionado enojado malo
1,es_0748979,product_es_0017036,reviewer_es_0819733,1,"la mensajería horrible, no compro mas",amazon sigue sin cumplir en las entregas,es,home,amazon sigue sin cumplir en las entregas la me...,decepcionante,feliz bueno maravilla,decepcionado enojado malo


In [121]:
from sklearn.neighbors import NearestNeighbors

# Label centroids in the word2vec (`we`) embedding space.
labels_we = get_label_centroids(label_names, we)

# `predict` embeds documents with `embedWE`, so the nearest-neighbour index
# must be fit on label centroids from that same space. Fitting it on spaCy
# label vectors compares points from two unrelated embedding spaces.
# The spaCy label embeddings are built in the spaCy section further down,
# once `nlp` and `get_spacy_labels` exist.
nb = NearestNeighbors(n_neighbors=1)
nb.fit(labels_we)

NearestNeighbors(n_neighbors=1)

In [88]:
# CLASSIFY
def predict(doc, we, nb, label_names):
    """Return the label name whose centroid is nearest to ``doc``.

    The document is tokenized with ``clean_tokenize`` (all surviving tokens
    are used; nothing is truncated), reduced to one centroid with ``embedWE``
    and looked up in the fitted nearest-neighbour index.

    Args:
        doc: raw review text.
        we: embedding lookup forwarded to ``embedWE``.
        nb: fitted index exposing ``kneighbors``; it is expected to have been
            fit on the label vectors in the same order as ``label_names``.
        label_names: sequence of class names.

    Returns:
        The entry of ``label_names`` at the nearest neighbour's index.
    """
    doc_vector = embedWE(clean_tokenize(doc), we)
    # kneighbors takes a batch of queries, so the single centroid is wrapped
    # in a list; only the neighbour indices are requested.
    nearest = nb.kneighbors([doc_vector], return_distance=False)
    return label_names[nearest[0][0]]

In [89]:
# Three reviews for spot-checking, looked up by index label 0, 1 and 2.
example0 = df['review'][0]
example1 = df['review'][1]
example2 = df['review'][2]

In [90]:
predict(example0, we, nb, label_names)

'feliz bueno maravilla'

In [91]:
predict(example1, we, nb, label_names)

'feliz bueno maravilla'

In [92]:
predict(example2, we, nb, label_names)

'feliz bueno maravilla'

#### Probar predicciones

In [93]:
# Classify every review with the word2vec pipeline, one row at a time.
df['prediction'] = df['review'].apply(lambda x: predict(x, we, nb, label_names))

In [82]:
def get_accuracy(predictions, label_col):
    """Return the fraction of predictions equal to their reference label.

    Args:
        predictions: sequence (list, array, Series) of predicted labels.
        label_col: sequence of reference labels, aligned by position with
            ``predictions``.

    Returns:
        The share of positions where both sequences agree, as a float
        between 0 and 1.

    Raises:
        ValueError: if the two inputs do not have the same shape; comparing
            them position by position would silently drop or overrun items.
        ZeroDivisionError: if ``predictions`` is empty.
    """
    predictions = np.asarray(predictions)
    label_col = np.asarray(label_col)
    if predictions.shape != label_col.shape:
        raise ValueError(
            f"predictions and label_col must have the same shape, "
            f"got {predictions.shape} and {label_col.shape}"
        )

    # Element-wise comparison replaces the Python-level index loop.
    num_correct = int(np.count_nonzero(predictions == label_col))
    return num_correct / len(predictions)

In [94]:
get_accuracy(df['prediction'], df['sentiment'])

# poor accuracy!!!!

0.50275

### SpaCy es model

Descargamos el modelo preentrenado de spaCy para español, `es_core_news_lg`.
Este modelo contiene embeddings tanto para palabras como para lexemas en español, accesibles a través de la clase `Vocab`.
Siguiendo el ejemplo en inglés, la función `embed` usa los vectores de los lexemas (`nlp.vocab[token]`).
No obstante, los tokens devueltos por el objeto `nlp` también tienen una propiedad `vector`, que es la que usa `embedText`.


In [95]:
# Load spaCy's large Spanish pipeline; its vectors feed embed and embedText.
nlp = spacy.load('es_core_news_lg')

In [96]:
def embed(tokens, nlp):
    """Return the centroid of the lexeme vectors of ``tokens``.

    Each token string is looked up in ``nlp.vocab``. Lexemes without a
    vector, stopwords and one-character lexemes are skipped.

    Args:
        tokens: iterable of token strings.
        nlp: loaded spaCy pipeline (only ``vocab`` and ``meta`` are read).

    Returns:
        The mean of the kept vectors, or a zero vector of the pipeline's
        vector width when no token qualifies.
    """
    kept = []
    for token in tokens:
        lexeme = nlp.vocab[token]
        if not lexeme.has_vector or lexeme.is_stop or len(lexeme.text) <= 1:
            continue
        kept.append(lexeme.vector)

    if not kept:
        # Vector width as reported by the pipeline's metadata.
        return np.zeros(nlp.meta['vectors']['width'])

    return np.asarray(kept).mean(axis=0)

In [84]:
def embedText(text, nlp):
    """Return the centroid of the token vectors of ``text``.

    The raw text is run through ``nlp`` and the resulting tokens are
    filtered: tokens without a vector, stopwords and one-character tokens are
    skipped.

    Args:
        text: raw string to embed.
        nlp: loaded spaCy pipeline; it is called on ``text`` and its ``meta``
            is read for the vector width.

    Returns:
        The mean of the kept vectors, or a zero vector of the pipeline's
        vector width when no token qualifies.
    """
    kept = []
    for tok in nlp(text):
        if not tok.has_vector or tok.is_stop or len(tok.text) <= 1:
            continue
        kept.append(tok.vector)

    if not kept:
        # Vector width as reported by the pipeline's metadata.
        return np.zeros(nlp.meta['vectors']['width'])

    return np.asarray(kept).mean(axis=0)

In [97]:
# Hand-written sentences for spot-checking the spaCy classifier; these rebind
# example0..example2, which earlier held reviews taken from df.
example0 = "Jamás había estado tan enojado por haberme olvidado del cumpleaños de mi novia."
example1 = "Los perros son criaturas hermosas, yo pienso que hacen una muy buena compañía."
example2 = "No todos los bares de café son buenos. La semana pasada fui a Quentin y estuvo terrible!"

In [122]:
label_names

['excelente', 'decepcionante']

In [108]:
def get_spacy_labels(label_names, nlp):
    """Embed each class name with the spaCy vocabulary.

    A class name may contain several whitespace-separated words; each name is
    split and passed to ``embed``, which returns one centroid per name.

    Args:
        label_names: iterable of class-name strings.
        nlp: loaded spaCy pipeline forwarded to ``embed``.

    Returns:
        An array holding one embedding per class name, in input order.
    """
    embeddings = []
    for label in label_names:
        words = label.split()
        embeddings.append(embed(words, nlp))
    return np.asarray(embeddings)

In [111]:
def predictSpaCy(doc, nlp, nb, label_names):
    """Return the label name whose spaCy embedding is nearest to ``doc``.

    The document is tokenized with ``clean_tokenize`` (all surviving tokens
    are used; nothing is truncated), reduced to one centroid with ``embed``
    and looked up in the fitted nearest-neighbour index.

    Args:
        doc: raw text to classify.
        nlp: loaded spaCy pipeline forwarded to ``embed``.
        nb: fitted index exposing ``kneighbors``; it is expected to have been
            fit on the label vectors in the same order as ``label_names``.
        label_names: sequence of class names.

    Returns:
        The entry of ``label_names`` at the nearest neighbour's index.
    """
    doc_vector = embed(clean_tokenize(doc), nlp)
    # kneighbors takes a batch of queries, so the single centroid is wrapped
    # in a list; only the neighbour indices are requested.
    nearest = nb.kneighbors([doc_vector], return_distance=False)
    return label_names[nearest[0][0]]

In [113]:
# Label embeddings in the spaCy vector space.
labels_s = get_spacy_labels(label_names, nlp)

# Rebind the shared `nb` to an index fit on the spaCy label embeddings; from
# here on `nb` is only valid for queries embedded with spaCy.
nb = NearestNeighbors(n_neighbors=1)
nb.fit(labels_s)

NearestNeighbors(n_neighbors=1)

In [123]:
predictSpaCy(example0, nlp, nb, label_names)

'decepcionante'

In [124]:
predictSpaCy(example1, nlp, nb, label_names)

'decepcionante'

In [125]:
# Third spot check: example2 (example0 and example1 are covered just above).
predictSpaCy(example2, nlp, nb, label_names)

'decepcionante'

In [126]:
# Classify every review with the spaCy pipeline, one row at a time.
df['spacy_pred'] = df['review'].apply(lambda x: predictSpaCy(x, nlp, nb, label_names))


# Share of spaCy predictions that match the star-derived sentiment label.
get_accuracy(df['spacy_pred'], df['sentiment'])

0.51625

# Clasificación no supervisada con spaCy

In [91]:
label_names = ["excelente", "decepcionante"]
# get_spacy_labels is the function defined in this notebook for embedding
# class names with spaCy; no `get_label_embeddings` is defined here.
label_embeddings = get_spacy_labels(label_names, nlp)

In [108]:
from sklearn.neighbors import NearestNeighbors

# With n_neighbors=2 and two label names, each query gets both labels back;
# presumably they come ordered nearest-first — confirm against scikit-learn docs.
nb = NearestNeighbors(n_neighbors=2)
nb.fit(label_embeddings)  # label centroids are training data for clusters

NearestNeighbors(n_neighbors=2)

In [109]:
# "Jamás había estado tan enojado por haberme olvidado del cumpleaños de mi novia"
# centroid0 was never defined in this notebook; embed example0 with spaCy so
# the lookup below has a query vector in the same space as the label vectors.
centroid0 = embedText(example0, nlp)
closest_label = nb.kneighbors([centroid0], return_distance=False)[0, 0]
label_names[closest_label]

'decepcionante'

### Otro intento: ahora con 500-Dimensional vectors, 5-window [Bilbao, Almeida]

In [13]:
import gensim.models
# NOTE(review): machine-specific absolute path; this cell will not run elsewhere as is.
FILE_PATH = '/mnt/c/Users/ericj/Downloads/keyed_vectors/complete.kv'

# NOTE(review): this load fails with the ModuleNotFoundError recorded below.
# Presumably the file was saved by an older gensim release whose
# `gensim.models.deprecated` module is absent from the installed version —
# TODO confirm, then re-export the vectors or load them with a matching gensim.
we3 = KeyedVectors.load(FILE_PATH)

ModuleNotFoundError: No module named 'gensim.models.deprecated'