<a href="https://colab.research.google.com/github/gmauricio-toledo/NLP-MCD/blob/main/09-WordEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Word and Document Embeddings</h1>

En esta notebook exploraremos el uso de distintos embeddings para resolver algunas tareas del NLP.

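Los puntos principales de esta notebook son:

* Comparar representaciones dispersas (BOW) con embeddings densos (word2vec).
* Usar modelos de word2vec preentrenados y explorar su geometría (similitud y analogías).
* Entrenar un modelo de word2vec en un corpus propio.
* Construir vectores de documentos a partir de los vectores de palabras (promedio y promedio pesado con TF-IDF).
* Usar los embeddings como features en una tarea de clasificación (IMDB).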

In [None]:
import nltk
import pandas as pd
import numpy as np
from nltk import word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
from string import punctuation

# Fetch the NLTK resources this notebook needs: the 'punkt_tab' tokenizer data
# and the 'stopwords' corpus.
for nltk_resource in ('punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Spanish stop-word list; it is passed to CountVectorizer further down.
stopwords = nltk.corpus.stopwords.words('spanish')

In [None]:
!pip install -qq umap-learn
!pip install gensim

In [None]:
def normalizar_vector(v):
    """Scale ``v`` to unit Euclidean norm.

    Parameters
    ----------
    v : array-like
        Vector to normalize.

    Returns
    -------
    The input divided by its norm, or ``v`` itself (unchanged, same object)
    when its norm is exactly zero, so that no division by zero happens.
    """
    # Compute the norm once and reuse it for both the test and the division.
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm

# Preprocesamiento y limpieza del texto

In [None]:
# Spanish Wikipedia corpus stored as a CSV in the course repository.
url = "https://raw.githubusercontent.com/gmauricio-toledo/NLP-MCD/main/data/spanish-wikipedia-dataframe.csv"
# Read it using the first column as index and drop 'doc_id' in the same
# expression, instead of mutating the frame in place.
df = pd.read_csv(url, index_col=0).drop(columns='doc_id')
df

In [None]:
docs_raw = df['Texto'].tolist()

# Every run of digits becomes a single space before tokenizing.
digit_run = re.compile(r'\d+')
tokenized_docs = [word_tokenize(digit_run.sub(' ', raw_text)) for raw_text in docs_raw]

# Each document is stored again as one string with its tokens separated by
# single spaces (the form the vectorizers below consume).
docs = [' '.join(tokens) for tokens in tokenized_docs]
docs[:3]

In [None]:
# Upper bound on the vocabulary size kept by the bag-of-words model.
max_features = 2500

# Bag-of-words counts over the cleaned documents, ignoring the stop words.
cv = CountVectorizer(stop_words=stopwords, max_features=max_features)
X_bow = cv.fit_transform(docs)
X_bow.shape

**Observación** Sobre las clases sparse de scipy

In [None]:
# Print the document-term matrix returned by CountVectorizer.fit_transform
# to inspect how it is stored.
print(X_bow)

Analicemos la naturaleza sparse de estas representaciones

In [None]:
import numpy as np

# Pick one random column (word) of the BoW matrix. The index is drawn from the
# real number of columns, because the fitted vocabulary can be smaller than the
# requested `max_features`. It is kept as a plain int so `word` is a single
# string rather than a one-element array.
idx = int(np.random.choice(X_bow.shape[1]))

word = cv.get_feature_names_out()[idx]
word_dim = X_bow.shape[0]  # one entry per document
# Slice the sparse matrix first and densify only that column, instead of
# converting the whole matrix to a dense array.
bow_vector = X_bow[:, idx].toarray().ravel()
print(f"palabra: {word}")
zeros = np.where(bow_vector==0)[0].shape[0]
print(f"Número de entradas cero: {zeros}/{word_dim}={round(100*zeros/word_dim,2)}%")

# word2vec

Usaremos la implementación de gensim: https://radimrehurek.com/gensim/models/word2vec.html.

Tensorflow también tiene una implementación ([tutorial](https://www.tensorflow.org/text/tutorials/word2vec)).

El artículo original: https://arxiv.org/pdf/1301.3781

## Usar un modelo pre-entrenado

### Un modelo de gensim

Gensim tiene varios modelos de word2vec preentrenados:

In [None]:
import gensim.downloader

# One model name per line, taken from the 'models' section of the catalogue.
available_models = gensim.downloader.info()['models']
for model_name in available_models:
    print(model_name)

Descarguemos alguno de estos modelos.

Tarda alrededor de 20 minutos

In [None]:
import gensim.downloader

# Download (or load from gensim's cache) the pretrained vectors by name.
# The commented line is a smaller alternative model.
# pt_w2v_model = gensim.downloader.load('glove-wiki-gigaword-50')
pt_w2v_model = gensim.downloader.load('word2vec-google-news-300')

Obtengamos los vectores

In [None]:
# Raw embedding matrix of the pretrained model; its shape is displayed.
vectors = pt_w2v_model.vectors
vectors.shape

Realicemos algunas tareas de similitud:

Resolvamos algunas analogías:

### Un modelo *externo*

Descarguemos un modelo externo y experimentemos con él

In [None]:
# Download a file from Google Drive by id with gdown.
# NOTE(review): presumably this is 'GoogleNews-vectors-negative300.bin.gz',
# the file name the next cell loads - confirm.
!gdown 0B7XkCwpI5KDYNlNUTTlSS21pQmM

Leamos el modelo, **tarda alrededor de 2 minutos**

In [None]:
from gensim.models import KeyedVectors

# Path of the binary word2vec file, relative to the working directory.
pretrained_model_path = 'GoogleNews-vectors-negative300.bin.gz'

# Load the vectors in word2vec binary format. This rebinds `pt_w2v_model`,
# replacing the model loaded through gensim.downloader above.
pt_w2v_model = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

### Experimentación

Dimensión de los embeddings

In [None]:
# Number of components of each word vector.
pt_w2v_model.vector_size

In [None]:
# List of the keys (words) known to the model; `vocabulary` is reused by the
# random-word experiment further down.
vocabulary = pt_w2v_model.index_to_key
print(f"Tamaño del vocabulario: {len(vocabulary)}")
print(vocabulary[:20])

In [None]:
# Query word for the nearest-neighbour search.
word = "king"

# The 15 entries most similar to `word`.
pt_w2v_model.most_similar(word,topn=15)

In [None]:
# Analogy query: words added as `positive`, words subtracted as `negative`
# (woman + king - man). The commented line is an alternative query.
pt_w2v_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)
# pt_w2v_model.most_similar(positive=['woman', 'actor'], negative=['man'], topn=5)

Veamos la similitud coseno entre palabras _similares_ y _no similares_

In [None]:
# Similarity of three word pairs, printed one per line in this order:
# unrelated, loosely related, closely related.
word_pairs = [
    ("dream", "technology"),  # unrelated words
    ("computer", "pencil"),   # loosely related words
    ("boy", "girl"),          # closely related words
]
for first_word, second_word in word_pairs:
    print(pt_w2v_model.similarity(first_word, second_word))

Exploremos la geometría de estos embeddings:

In [None]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    It is the dot product of the two vectors divided by the product of their
    Euclidean norms. No zero-norm check is performed.
    """
    inner_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return inner_product / norm_product

In [None]:
# Dense embedding of one word: how many components are exactly zero?
word = 'spain'
word_vector = pt_w2v_model[word]
dim = pt_w2v_model.vector_size

zeros = int(np.count_nonzero(word_vector == 0))

print(f"palabra: {word}")
print(f"Número de entradas 0: {zeros}/{dim}={round(100*zeros/dim,2)}%")

In [None]:
# Display the raw embedding of 'spain'.
pt_w2v_model['spain']

Veamos las direcciones entre pares de palabras, las cuales codifican la relación semántica entre ellas

In [None]:
def _similitudes_entre_direcciones(model, pairs):
    """Cosine similarities between the difference vectors of three word pairs.

    Parameters
    ----------
    model : mapping from word to vector (e.g. gensim KeyedVectors)
    pairs : sequence of three ``(word_a, word_b)`` tuples

    Returns
    -------
    tuple of three values: similarity of directions (1,2), (1,3) and (2,3),
    where direction k is ``model[word_a] - model[word_b]`` of the k-th pair.
    """
    dif_1, dif_2, dif_3 = (model[a] - model[b] for a, b in pairs)
    return (cosine_similarity(dif_1, dif_2),
            cosine_similarity(dif_1, dif_3),
            cosine_similarity(dif_2, dif_3))


# Country-capital pairs: the three difference vectors are compared pairwise.
country_capital_pairs = [('sweden', 'stockholm'),
                         ('france', 'paris'),
                         ('germany', 'berlin')]

print("Palabras a analizar:\n" + "\n".join(f"{a}-{b}" for a, b in country_capital_pairs))
print("Similitud entre las diferencias:")
print(*_similitudes_entre_direcciones(pt_w2v_model, country_capital_pairs))

# Same analysis with six random words. Sample *indices* rather than passing the
# vocabulary list to np.random.choice: that call would first convert the whole
# list into a fixed-width NumPy string array, which is slow and memory hungry
# for a large vocabulary.
random_idxs = np.random.choice(len(vocabulary), size=6, replace=False)
six_random_words = [vocabulary[i] for i in random_idxs]
print(f"\nEl mismo análisis con 6 palabras aleatorias:\n{six_random_words}")

random_pairs = [(six_random_words[0], six_random_words[1]),
                (six_random_words[2], six_random_words[3]),
                (six_random_words[4], six_random_words[5])]
print("Similitud entre las diferencias:")
print(*_similitudes_entre_direcciones(pt_w2v_model, random_pairs))

## Entrenar un modelo en el corpus

Es importante considerar que el modelo depende mucho de los datos con los que se entrena. Para muchas tareas generales basta con utilizar un modelo preentrenado, pero algunas aplicaciones específicas (por ejemplo, específicas de un dominio especializado) pueden requerir entrenar un modelo en un corpus específico.

In [None]:
from gensim.models import Word2Vec

# Train word2vec on the tokenized Wikipedia documents: 100-dimensional vectors,
# context window of 5, words seen fewer than 5 times are dropped, 4 workers.
# NOTE(review): no seed is passed, so results may differ between runs.
w2v_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=5, workers=4)

Veamos el vocabulario obtenido

In [None]:
# Vocabulary learned by the model trained above. This rebinds `vocabulary`,
# which previously held the pretrained model's vocabulary.
vocabulary = w2v_model.wv.index_to_key
print(f"Tamaño del vocabulario: {len(vocabulary)}")
print(vocabulary[:20])

Los vectores:

In [None]:
# Embedding matrix of the trained model; its shape is displayed.
word_vectors = w2v_model.wv.vectors
word_vectors.shape

In [None]:
# Guardar todo el modelo
w2v_model.save("word2vec.model")

# Guardar sólo los vectores
w2v_model.wv.save("word2vec.wordvectors")

In [None]:
# Same zero-entry count as before, now for a vector of the model trained here.
# `word_vector` is reused by the imshow comparison below.
word = 'amplia'
word_vector = w2v_model.wv[word]
dim = w2v_model.wv.vector_size

zeros = int(np.count_nonzero(word_vector == 0))

print(f"palabra: {word}")
print(f"Número de entradas 0: {zeros}/{dim}={round(100*zeros/dim,2)}%")

Veamos qué pasa con las palabras **OOV**

In [None]:
# Looking up an out-of-vocabulary (OOV) word raises KeyError. The exception is
# the point of this cell, so it is caught and displayed instead of being left
# to abort a "Restart & Run All".
oov_word = 'holonomia'
try:
    oov_vector = w2v_model.wv[oov_word]
except KeyError as err:
    oov_vector = None
    print(f"KeyError: {err}")
# Shows the vector if the word happens to be in the vocabulary, nothing otherwise.
oov_vector

In [None]:
import matplotlib.pyplot as plt

# Dense word2vec vector drawn as a single row of pixels.
plt.figure(figsize=(10,1))
plt.suptitle("Word2Vec")
plt.imshow(word_vector.reshape(1,-1))
plt.yticks([])
plt.show()

# BoW vector drawn as a grid of `n_rows` rows. The vector is padded with NaN
# up to the next multiple of `n_rows`, so the reshape no longer requires the
# number of documents to be divisible by 27. When it already is, nothing is
# padded and the image is the same as before.
n_rows = 27
bow_flat = np.ravel(bow_vector).astype(float)
padding = (-bow_flat.size) % n_rows
bow_grid = np.pad(bow_flat, (0, padding), constant_values=np.nan).reshape(n_rows, -1)

plt.figure(figsize=(10,3))
plt.suptitle("BOW")
plt.imshow(bow_grid)
plt.yticks([])
plt.show()

In [None]:
#@title Grafiquemos la reducción de dimensionalidad 3d t-SNE

# from sklearn.manifold import TSNE
# import plotly.graph_objects as go
# import plotly

# vocabulary = w2v_model.wv.index_to_key
# word_vectors = w2v_model.wv.vectors

# tsne = TSNE(n_components=3, metric='cosine')
# X_tsne = tsne.fit_transform(word_vectors)

# plotly.offline.init_notebook_mode()

# trace = go.Scatter3d(
#     x=X_tsne[:,0],
#     y=X_tsne[:,1],
#     z=X_tsne[:,2],
#     mode='markers',
#     marker={
#         'size': 3,
#         'opacity': 0.75,
#         'color': 'black'
#     },
#     hovertemplate='%{text}<extra></extra>',
#     text = [f"{vocabulary[j]}" for j in range(X_tsne.shape[0])]
# )

# layout = go.Layout(
#     margin={'l': 0, 'r': 0, 'b': 0, 't': 0}
# )

# data = [trace]

# plot_figure = go.Figure(data=data, layout=layout)

# plot_figure.update_layout(
#     title = 'Wikipedia Words',
#     scene = dict(
#         xaxis = dict(visible=False),
#         yaxis = dict(visible=False),
#         zaxis =dict(visible=False)
#         )
#     )

# plotly.offline.plot(plot_figure, filename='/content/drive/MyDrive/Colab Notebooks/NLP/Mi curso/wiki-w2v-tsne3d-words.html')

## Vectores de documentos

Algunas técnicas para obtener vectores de documentos:

* Promediar los vectores de word2vec. Según Le y Mikolov, este enfoque no funciona bien para tareas de análisis de sentimientos, porque «pierde el orden de las palabras del mismo modo que los modelos estándar de bolsa de palabras» y «no reconoce muchos fenómenos lingüísticos sofisticados, como el sarcasmo». Por otro lado, según Kenter et al. 2016, «promediar simplemente las incrustaciones de palabras de todas las palabras de un texto ha demostrado ser una línea de base o característica sólida en multitud de tareas», como las tareas de similitud de textos cortos.
* Ponderar los vectores de palabras con su TF-IDF para disminuir la influencia de las palabras más comunes.
* Concatenar los vectores de palabras.

Observar que la operación de sumar vectores ignora el orden de las palabras por lo que caemos en una representación tipo BOW.

Gensim permite obtener un promedio de vectores

In [None]:
# Mean word vector per document. `get_mean_vector` expects a list of keys, so
# it must receive the token lists: `docs` holds plain strings, and iterating a
# string yields single characters, which would average the embeddings of the
# one-character tokens only.
# Documents without tokens get a zero vector, because `get_mean_vector` raises
# ValueError on empty input.
doc_vectors = np.array([
    w2v_model.wv.get_mean_vector(tokens) if len(tokens) > 0
    else np.zeros(w2v_model.wv.vector_size, dtype=w2v_model.wv.vectors.dtype)
    for tokens in tokenized_docs
])
doc_vectors.shape

In [None]:
# Histogram of the Euclidean norm of every document vector (one row each).
fig, ax = plt.subplots()
fig.suptitle("Normas de los vectores de documentos")
ax.hist(np.linalg.norm(doc_vectors, axis=1))
plt.show()

In [None]:
# Persist the document vectors in the working directory.
np.save('wikipedia_w2v_mean_doc_vectors.npy',doc_vectors)

In [None]:
# Reload the vectors saved by the previous cell, which avoids recomputing them.
doc_vectors = np.load('wikipedia_w2v_mean_doc_vectors.npy')

In [None]:
# Rescale every row of `doc_vectors` in place with `normalizar_vector`.
for row_idx in range(len(doc_vectors)):
    doc_vectors[row_idx] = normalizar_vector(doc_vectors[row_idx])

In [None]:
#@title Grafiquemos la reducción de dimensionalidad 3d UMAP

# from umap import UMAP
# import matplotlib.pyplot as plt
# import plotly
# import plotly.graph_objs as go

# umap = UMAP(metric='cosine',n_components=3)
# X_umap = umap.fit_transform(doc_vectors)

# plotly.offline.init_notebook_mode()

# trace = go.Scatter3d(
#     x=X_umap[:,0],
#     y=X_umap[:,1],
#     z=X_umap[:,2],
#     mode='markers',
#     marker={
#         'size': 3,
#         'opacity': 0.75,
#         'color': 'blue'
#     },
#     hovertemplate='%{text}<extra></extra>',
#     text = [f"{docs_raw[j][:75]}" for j in range(X_umap.shape[0])]
# )

# layout = go.Layout(
#     margin={'l': 0, 'r': 0, 'b': 0, 't': 0}
# )

# data = [trace]

# plot_figure = go.Figure(data=data, layout=layout)

# plot_figure.update_layout(
#     title = 'Wikipedia Docs',
#     scene = dict(
#         xaxis = dict(visible=False),
#         yaxis = dict(visible=False),
#         zaxis =dict(visible=False)
#         )
#     )

# plotly.offline.plot(plot_figure, filename='/content/drive/MyDrive/Colab Notebooks/NLP/Mi curso/wiki-w2v-norm-umap3d-docs.html')

# Un ejemplo de uso

## El corpus

In [None]:
# From here on the corpus is in English. This rebinds the module-level
# `stopwords`, which held the Spanish list until now, so re-running earlier
# cells after this one would use the English list.
stopwords = nltk.corpus.stopwords.words('english')

### Embeddings como features para tareas de Machine Learning

In [None]:
# Download a file from Google Drive by id with gdown.
# NOTE(review): presumably this is 'IMDB.csv', which the next cell reads - confirm.
!gdown 18kGdlhOiQNS61wUK7uPbdquKL3XJrgzf

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the IMDB reviews and show the frame.
imdb_df = pd.read_csv('IMDB.csv')
display(imdb_df)

# Encode the 'sentiment' column as integer class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(imdb_df['sentiment'].values)

# 80/20 train/test split of the raw review texts with a fixed seed.
reviews = imdb_df['review'].values
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    reviews, y, test_size=0.2, random_state=642
)

In [None]:
import numpy as np

# Five distinct random row labels of the IMDB frame.
random_idxs = np.random.choice(imdb_df.shape[0], 5, replace=False)

# Show the first 80 characters of each sampled review with its label.
for row_label in random_idxs:
    sampled_row = imdb_df.loc[row_label]
    text = sampled_row['review']
    sentiment = sampled_row['sentiment']
    print(f"{text[:80]}...:\n\t{sentiment}")


El preprocesamiento y limpieza tarda alrededor de 2 minutos

In [None]:
def _limpiar_y_tokenizar(raw_docs, stop_set):
    """Clean and tokenize a collection of raw documents.

    Parameters
    ----------
    raw_docs : iterable of str
        Raw texts.
    stop_set : container of str
        Tokens to discard. A set gives constant-time membership tests.

    Returns
    -------
    (sin_digitos, tokenized, joined) : three lists
        The texts with every run of digits replaced by a space, the kept tokens
        of each text, and those tokens joined back with single spaces.
    """
    sin_digitos = [re.sub(r'\d+', ' ', doc) for doc in raw_docs]
    # `tok not in punctuation` is a substring test against the punctuation
    # string, exactly as in the original filtering.
    tokenized = [[tok for tok in word_tokenize(doc)
                  if tok not in stop_set and tok not in punctuation]
                 for doc in sin_digitos]
    joined = [' '.join(tokens) for tokens in tokenized]
    return sin_digitos, tokenized, joined


# Build the stop-word set once: a list would be scanned linearly per token.
_stop_set = set(stopwords)

# The same pipeline is applied to both splits.
X_train_raw, train_tokenized_docs, train_docs = _limpiar_y_tokenizar(X_train_raw, _stop_set)
X_test_raw, test_tokenized_docs, test_docs = _limpiar_y_tokenizar(X_test_raw, _stop_set)

In [None]:
# TF-IDF features limited to 2000 terms. The vectorizer is fitted on the
# training documents only and then applied to the test documents.
# `.toarray()` turns both matrices into dense arrays of shape
# (n_documents, n_terms).
vectorizer = TfidfVectorizer(max_features=2000, stop_words=stopwords)
X_train_tfidf = vectorizer.fit_transform(train_docs).toarray()
X_test_tfidf = vectorizer.transform(test_docs).toarray()

## El modelo de embeddings

Entrenemos un modelo de word2vec en el corpus IMDB. **Tarda alrededor de 2 minutos**

In [None]:
from gensim.models import Word2Vec

# Train word2vec on the tokenized IMDB training reviews only.
# NOTE(review): the variable is named `w2v_20ng_model` although it is trained
# on IMDB here; the name is kept because later cells reference it.
w2v_20ng_model = Word2Vec(sentences=train_tokenized_docs, vector_size=100, window=5, min_count=5, workers=4)

In [None]:
# Embedding learned for 'ball' (raises KeyError if the word is not in the vocabulary).
w2v_20ng_model.wv['ball']

In [None]:
# The 10 nearest neighbours of 'ball' in the IMDB-trained model.
w2v_20ng_model.wv.most_similar('ball',topn=10)

In [None]:
# The 10 nearest neighbours of 'space' in the IMDB-trained model.
w2v_20ng_model.wv.most_similar('space',topn=10)

### Usando el promedio de embeddings de palabras

* IMDB: 35 minutos

In [None]:
def _promediar_embeddings(docs_texto, wv):
    """Represent each document by the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    docs_texto : sequence of str
        Documents whose tokens are separated by whitespace.
    wv : gensim KeyedVectors
        Word vectors used for the lookup.

    Returns
    -------
    numpy.ndarray of shape (len(docs_texto), wv.vector_size)
        Row i is the mean vector of document i. Documents with no known word
        keep an all-zero row.
    """
    doc_matrix = np.zeros((len(docs_texto), wv.vector_size))
    for i, doc in enumerate(docs_texto):
        # `key_to_index` is a dict, so membership is a hash lookup. Testing
        # against the `index_to_key` list scans the vocabulary for every token.
        words = [w for w in doc.split() if w in wv.key_to_index]
        if len(words) > 0:
            these_vectors = np.array([wv[w] for w in words])
            doc_matrix[i] = np.mean(these_vectors, axis=0)
    return doc_matrix


train_doc_vectors = _promediar_embeddings(train_docs, w2v_20ng_model.wv)
print(train_doc_vectors[:3,:5])

test_doc_vectors = _promediar_embeddings(test_docs, w2v_20ng_model.wv)

In [None]:
# Persist both document-vector matrices in the working directory.
np.save('IMDB_w2v_train_doc_vectors.npy',train_doc_vectors)
np.save('IMDB_w2v_test_doc_vectors.npy',test_doc_vectors)

In [None]:
# Download two files from Google Drive by id.
# NOTE(review): presumably these are precomputed copies of the two .npy files
# loaded below - confirm the downloaded file names match.
!gdown 1DUJuV6Dl5-eOj6VRgZ-Q27mLk9Gj0GLS
!gdown 1lO4I-RiB2Xd6wQmMH7Q5P0qAFMxUrcMW

# Loading rebinds both variables, replacing any vectors computed in this session.
train_doc_vectors = np.load('IMDB_w2v_train_doc_vectors.npy')
test_doc_vectors = np.load('IMDB_w2v_test_doc_vectors.npy')

Veamos cuántos vectores nulos hay, **¿por qué podría ocurrir esto?**

In [None]:
# A row is "null" when the sum of the absolute values of its entries is zero.
null_train = int(np.count_nonzero(np.abs(train_doc_vectors).sum(axis=1) == 0))
null_test = int(np.count_nonzero(np.abs(test_doc_vectors).sum(axis=1) == 0))
print(f"Vectores nulos en el conjunto train: {null_train}")
print(f"Vectores nulos en el conjunto test: {null_test}")

In [None]:
# Histogram of the Euclidean norm of every training document vector.
fig, ax = plt.subplots()
fig.suptitle("Normas de los vectores de documentos")
ax.hist(np.linalg.norm(train_doc_vectors, axis=1))
plt.show()

Probar normalizando y sin normalizar.

Entrenamos un modelo de machine learning para la tarea de clasificación usando los embeddings. Evaluamos usando el accuracy.

IMDB: Sin normalizar $\approx$ 82%-85%

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Fixed seed for the stochastic estimators so accuracies are reproducible
# between runs (same value as the train/test split).
seed = 642

clfs = [SVC(), LogisticRegression(),
        RandomForestClassifier(random_state=seed),
        MLPClassifier(hidden_layer_sizes=(50,50), random_state=seed)]

# Fit each classifier on the document embeddings and report its test accuracy,
# labelled with the estimator name so each score can be told apart.
for clf in clfs:
    clf.fit(train_doc_vectors, y_train)
    print(f"{type(clf).__name__}: {clf.score(test_doc_vectors, y_test)}")

### ⚡ Usando el promedio pesado con tf-idf de palabras

Esta es una técnica híbrida para representar documentos: consiste en re-escalar los vectores de las palabras usando como pesos los coeficientes TF-IDF.

Consideremos ambos vocabularios

In [None]:
# Vocabulary of the word2vec model (a list of keys).
vocabulary_w2v = w2v_20ng_model.wv.index_to_key
print(len(vocabulary_w2v))
print(vocabulary_w2v[:20])

# Vocabulary of the TF-IDF vectorizer (feature names in column order).
vocabulary_tfidf = vectorizer.get_feature_names_out()
print(len(vocabulary_tfidf))
print(vocabulary_tfidf[:20])

In [None]:
def _suma_pesada_tfidf(docs_texto, X_tfidf, wv, tfidf_vocab):
    """Combine word vectors of each document using its TF-IDF weights.

    Parameters
    ----------
    docs_texto : sequence of str
        Documents whose tokens are separated by whitespace.
    X_tfidf : 2-D array
        TF-IDF matrix whose row i corresponds to ``docs_texto[i]``.
    wv : gensim KeyedVectors
        Word vectors used for the lookup.
    tfidf_vocab : dict
        Maps a term to its column in ``X_tfidf`` (``vectorizer.vocabulary_``).

    Returns
    -------
    numpy.ndarray of shape (len(docs_texto), wv.vector_size)
        Row i is the sum over the tokens of document i (known to both
        vocabularies) of ``tfidf_weight * word_vector``. Documents with no such
        token keep an all-zero row.
    """
    doc_matrix = np.zeros((len(docs_texto), wv.vector_size))
    for i, doc in enumerate(docs_texto):
        # Both membership tests are dict lookups, which replaces a scan of the
        # whole vocabulary list/array for every token.
        words = [w for w in doc.split() if w in wv.key_to_index and w in tfidf_vocab]
        if len(words) > 0:
            these_weights = np.array([X_tfidf[i, tfidf_vocab[w]] for w in words])
            these_vectors = np.array([wv[w] for w in words])
            # NOTE(review): this is a weighted sum, not divided by the total
            # weight, and a token repeated in the document contributes once
            # per occurrence - confirm this is intended.
            doc_matrix[i] = np.sum(these_vectors * these_weights.reshape(-1,1), axis=0)
    return doc_matrix


train_doc_vectors = _suma_pesada_tfidf(train_docs, X_train_tfidf,
                                       w2v_20ng_model.wv, vectorizer.vocabulary_)
test_doc_vectors = _suma_pesada_tfidf(test_docs, X_test_tfidf,
                                      w2v_20ng_model.wv, vectorizer.vocabulary_)

In [None]:
# Persist both TF-IDF weighted document-vector matrices in the working directory.
np.save('imdb_w2v_tfidf_train_doc_vectors.npy',train_doc_vectors)
np.save('imdb_w2v_tfidf_test_doc_vectors.npy',test_doc_vectors)

In [None]:
import os

# Optional backup copy in Google Drive. The folder exists only when Drive is
# mounted in Colab, so check first instead of failing with FileNotFoundError.
drive_dir = '/content/drive/MyDrive/Colab Notebooks'
if os.path.isdir(drive_dir):
    np.save(f'{drive_dir}/imdb_w2v_tfidf_train_doc_vectors.npy',train_doc_vectors)
    np.save(f'{drive_dir}/imdb_w2v_tfidf_test_doc_vectors.npy',test_doc_vectors)
else:
    print(f"No se encontró la carpeta de Google Drive ({drive_dir}); se omite la copia de respaldo.")

In [None]:
# A row is "null" when the sum of the absolute values of its entries is zero.
null_train = int(np.count_nonzero(np.abs(train_doc_vectors).sum(axis=1) == 0))
null_test = int(np.count_nonzero(np.abs(test_doc_vectors).sum(axis=1) == 0))
print(f"Vectores nulos en el conjunto train: {null_train}")
print(f"Vectores nulos en el conjunto test: {null_test}")

In [None]:
# Histogram of the Euclidean norm of every training document vector.
fig, ax = plt.subplots()
fig.suptitle("Normas de los vectores de documentos")
ax.hist(np.linalg.norm(train_doc_vectors, axis=1))
plt.show()

In [None]:
# Rescale every row of both matrices in place with `normalizar_vector`.
for doc_matrix in (train_doc_vectors, test_doc_vectors):
    for row_idx in range(len(doc_matrix)):
        doc_matrix[row_idx] = normalizar_vector(doc_matrix[row_idx])

# Norms of the training vectors after normalization.
fig, ax = plt.subplots()
fig.suptitle("Normas de los vectores de documentos")
ax.hist(np.linalg.norm(train_doc_vectors, axis=1))
plt.show()

* IMDB: Sin normalizar, 79-82%. Normalizando, 81-82%

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Fixed seed for the stochastic estimators so accuracies are reproducible
# between runs (same value as the train/test split).
seed = 642

clfs = [SVC(), LogisticRegression(),
        RandomForestClassifier(random_state=seed),
        MLPClassifier(hidden_layer_sizes=(50,50), random_state=seed)]

# Fit each classifier on the TF-IDF weighted document embeddings and report its
# test accuracy, labelled with the estimator name.
for clf in clfs:
    clf.fit(train_doc_vectors, y_train)
    print(f"{type(clf).__name__}: {clf.score(test_doc_vectors, y_test)}")