In [0]:
%matplotlib inline 
%load_ext autoreload
%autoreload 2
from collections import defaultdict
import re
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd   
from gensim import corpora, models, similarities, matutils
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from gensim.matutils import cossim
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pickle
import random
import sys
from tqdm import tqdm
from sklearn import manifold
import tensorflow_datasets as tfds


# Word2vec tutorial
## Corpus Cleaning

In [0]:
# Load the IMDB reviews dataset. The TRAIN split is divided 60% / 40%, so we
# end up with 15,000 examples for training, 10,000 examples for validation
# and 25,000 examples for testing.
# The split is expressed with percent-slicing strings because
# `tfds.Split.TRAIN.subsplit(...)` was removed from tensorflow_datasets.
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews",
    split=["train[:60%]", "train[60%:]", "test"],
    as_supervised=True)

# as_supervised=True is requested above, so every example is unpacked into
# the two columns below (review text, class label).
columnas = ['texto', 'clase']
df_train = pd.DataFrame(list(tfds.as_numpy(train_data)), columns=columnas)
df_dev = pd.DataFrame(list(tfds.as_numpy(validation_data)), columns=columnas)
df_test = pd.DataFrame(list(tfds.as_numpy(test_data)), columns=columnas)

In [0]:
# Inspect the first raw review of the training frame.
df_train.texto[0]

In [0]:
# Build the training corpus: one list of lowercase alphabetic tokens per
# sentence. Each review is decoded from UTF-8, lowercased, stripped of
# HTML-like tags, split into sentences and then into words.
html_tag = re.compile('<.*?>')
trainset = [
    [token for token in word_tokenize(oracion) if token.isalpha()]
    for review in df_train.texto
    for oracion in sent_tokenize(html_tag.sub(' ', review.decode('utf-8').lower()))
]

In [0]:
# Show the first three tokenized sentences.
trainset[:3]

In [0]:
# Report corpus size: number of sentences and total number of tokens.
total_oraciones = len(trainset)
total_palabras = sum(len(oracion) for oracion in trainset)
print("el corpus tiene", total_oraciones, "oraciones y", total_palabras, "palabras")

In [0]:
# Learn bigram collocations from the tokenized sentences, scored with NPMI.
collocations = Phrases(sentences=trainset, min_count=10,threshold=0.5,scoring='npmi') # threshold: minimum accepted score

In [0]:
# Wrap the trained Phrases model in a Phraser, used below to transform the corpus.
to_collocations = Phraser(collocations)

In [0]:
# Collect what export_phrases yields for the corpus into a two-column frame
# (bigram, score) and show its shape.
df_collocations =pd.DataFrame([x for x in collocations.export_phrases(trainset)],columns=["bigram","score"])
df_collocations.shape

In [0]:
# The 50 highest-scoring distinct rows.
df_collocations.drop_duplicates().sort_values(by="score",ascending=False).head(50)

In [0]:
# Apply the collocation model to the whole corpus; this is the input used to train Word2Vec.
trainset_ngrams = to_collocations[trainset]

In [0]:
# Display the transformed corpus object.
trainset_ngrams

In [0]:
# Word2Vec hyper-parameters (see https://radimrehurek.com/gensim/models/word2vec.html):
#   sg        - training algorithm: 1 = skip-gram, 0 = CBOW
#   size      - dimensionality (length) of the word vectors
#   window    - context window size; window=10 uses up to 10 words on the
#               left and 10 words on the right
#   min_count - words with fewer occurrences than this are ignored
#   sample    - threshold above which frequent words are down-sampled
#   negative  - number of noise words drawn for negative sampling
#   workers   - number of worker threads (Cython must be installed to
#               benefit from parallel training)
w2v_model = Word2Vec(
    trainset_ngrams,
    sg=1,
    size=20,
    window=10,
    min_count=10,
    sample=1e-3,
    negative=5,
    workers=4,
)
#w2v_model.save("word2vec_20dim")  # save model
#w2v_model = Word2Vec.load("word2vec_20dim")  # load model

In [0]:
# Indexing the model's `wv` with a word returns that word's embedding as a numpy array.
w2v_model.wv["awesome"]

In [0]:
# Cosine similarity between the "jim_carrey" collocation and "silly".
w2v_model.wv.n_similarity(["jim_carrey"], ["silly"])

In [0]:
# Print the similarity of each actor to each concept; the comparisons are
# grouped per concept and a "\n" is printed after every group.
comparaciones = [
    [
        ("morgan_freeman-god similarity:", "morgan_freeman", "god"),
        ("jim_carrey-god similarity:", "jim_carrey", "god"),
    ],
    [
        ("morgan_freeman-silly similarity:", "morgan_freeman", "silly"),
        ("jim_carrey-silly similarity:", "jim_carrey", "silly"),
    ],
]
for grupo_comparado in comparaciones:
    for etiqueta, nombre, concepto in grupo_comparado:
        print(etiqueta, w2v_model.wv.n_similarity([nombre], [concepto]))
    print("\n")


In [0]:
# 25 nearest neighbours of "jim_carrey".
# The query goes through the model's KeyedVectors (`wv`), like every other
# lookup in this notebook: calling `most_similar` on the Word2Vec model
# itself is deprecated in gensim 3.x and no longer exists in gensim 4.
# The empty `negative=[]` argument was dropped; it is equivalent to the default.
w2v_model.wv.most_similar(positive=["jim_carrey"], topn=25)

In [0]:
# Rank a list of actors by their similarity to a target word and plot the
# ranking as a bar chart (ascending).
target_word = "funny"
actores = [
    "adam_sandler", "jim_carrey", "ben_stiller", "eddie_murphy", "mike_myers", "chris_rock",
    "stallone", "willis", "jackie_chan", "jet_li", "van_damme", "harrison_ford",
    "schwarzenegger", "chuck_norris", "dicaprio", "viggo_mortensen",
]
fun_score = [w2v_model.wv.n_similarity([target_word], [actor]) for actor in actores]

ranking = pd.DataFrame(fun_score, index=actores, columns=[target_word])
ranking.sort_values(by=target_word).plot(kind="bar", figsize=(15, 5), fontsize=20)

In [0]:
# Probe words grouped by theme; each theme is drawn in its own color.
p_movies = ["horror", "comedy", "drama", "science_fiction", "western", "documentary"]
p_actors_action = ["stallone", "willis", "jackie_chan", "jet_li", "van_damme",
                   "harrison_ford", "schwarzenegger", "chuck_norris"]
p_actors_comedy = ["adam_sandler", "jim_carrey", "ben_stiller", "eddie_murphy",
                   "mike_myers", "chris_rock"]
p_superheroes = ["batman", "superman", "spiderman", "robocop", "hulk", "wolverine"]
p_colors = ["blue", "green", "yellow", "red", "orange"]

# (color, group) pairs drive both flat lists so words and colors stay aligned.
temas_coloreados = [
    ("black", p_movies),
    ("blue", p_actors_action),
    ("green", p_actors_comedy),
    ("red", p_superheroes),
    ("purple", p_colors),
]
palabras = [palabra for _, tema in temas_coloreados for palabra in tema]
colores = [color for color, tema in temas_coloreados for _ in tema]

In [0]:
# Build the pairwise distance matrix: entry (i, j) is |1 - similarity|
# between word i and word j of `palabras`.
lado = len(palabras)
distancias = np.zeros((lado, lado))  # square matrix
for fila, palabra_fila in enumerate(palabras):
    distancias[fila, :] = [
        abs(1 - w2v_model.wv.similarity(palabra_fila, palabra_columna))
        for palabra_columna in palabras
    ]
print(distancias.shape)
distancias

In [0]:
# Reduccion de la dimensionalidad y visualizacion 
from sklearn.manifold import MDS
from sklearn.manifold import TSNE 
def _plot_labeled_points(coords, palabras, colores, titulo):
    """Scatter 2-D ``coords`` on the current axes and write each word at its point."""
    plt.scatter(coords[:, 0], coords[:, 1], color="black", s=3)
    for label, x, y, color in zip(palabras, coords[:, 0], coords[:, 1], colores):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), color=color,
                     textcoords='offset points', size=13)
    plt.title(titulo)


def visualize_embeddings(distancias,palabras,colores,perplexity):
    """Reduce a precomputed distance matrix to 2-D with MDS and t-SNE and plot both.

    Parameters
    ----------
    distancias : square array of pairwise distances; it is handed to both
        reducers as a precomputed dissimilarity matrix.
    palabras : labels, one per row of ``distancias``, written next to the points.
    colores : matplotlib colors, one per label, used for the label text.
    perplexity : perplexity passed to t-SNE.

    Returns
    -------
    None. A 20x10 figure with two panels is drawn: MDS on the left, t-SNE on
    the right.
    """
    plt.figure(figsize=(20,10))

    # Left panel: metric MDS on the precomputed distances.
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=123,
              dissimilarity="precomputed", n_jobs=4)
    plt.subplot(1,2,1)
    _plot_labeled_points(mds.fit(distancias).embedding_, palabras, colores, "MDS")

    # Right panel: t-SNE on the same distances.
    # init="random" is explicit because PCA initialisation (the scikit-learn
    # default since 1.2) is rejected when metric="precomputed"; "random" was
    # the default before that, so older versions behave as they did.
    tsne = TSNE(n_components=2, metric="precomputed", init="random",
                learning_rate=1000, random_state=123, perplexity=perplexity)
    # Kept from the original: switches numpy printing to non-scientific
    # notation for the whole session.
    np.set_printoptions(suppress=True)
    plt.subplot(1,2,2)
    _plot_labeled_points(tsne.fit_transform(distancias), palabras, colores, "TSNE")

In [0]:
# Plot the MDS and t-SNE projections of the Word2Vec distance matrix.
visualize_embeddings(distancias,palabras,colores,perplexity=4)

# Latent Semantic Analysis (LSA)

In [0]:
# Create a dictionary which maps each token to an integer id
dictionary = corpora.Dictionary(trainset)
# Filter out low-frequency words: keep tokens present in at least 10
# sentences, with no upper document-frequency cut (no_above=1), and at most
# 100,000 tokens overall.
dictionary.filter_extremes(no_below=10, no_above=1, keep_n=100000)

In [0]:
# Show the first 15 entries of the dictionary.
list(dictionary.iteritems())[:15]

In [0]:
# Keep every token except the "bad_ids", or alternatively keep only the "good_ids"
# stopwords_id =np.array(dictionary.doc2bow(stopwords.words('english')))[:,0]
# dictionary.filter_tokens(bad_ids=stopwords_id, good_ids=None) 
# dictionary.save("diarios_dictionary.dict")

In [0]:
# Turn every tokenized sentence into a bag-of-words vector.
corpus = [dictionary.doc2bow(line) for line in trainset]
tfidf = models.TfidfModel(corpus)  # fit the tf-idf transformation on the bag-of-words corpus
# Apply the tf-idf weighting to the corpus.
corpus_tfidf = tfidf[corpus]

In [0]:
# Display the tf-idf corpus object.
corpus_tfidf

In [0]:
# Number of latent topics (dimensions) of the LSA space.
n_topics = 100
lsa_tfidf = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics = n_topics)  # fit an LSI transformation on the tf-idf corpus
#lsa_tfidf.save("LSA_movies.lsi") # Save LSA
#lsa_tfidf = models.LsiModel.load("LSA_movies.lsi")

In [0]:
# Vector representation of a single token: its one-word bag-of-words is
# projected through the LSI model.
vect_zombie = lsa_tfidf[dictionary.doc2bow(["zombie"])] 
vect_zombie[:10] # shows only the first 10 elements


In [0]:
# Show the 15 highest-weighted terms of topic 60
lsa_tfidf.show_topic(60, topn=15)

In [0]:
# Build the pairwise distance matrix in LSA space: entry (i, j) is
# |1 - cosine similarity| between the LSA vectors of word i and word j.
# Each word is projected through the LSI model once, up front, instead of
# being re-projected for every pair inside the double loop.
vectores_lsa = [lsa_tfidf[dictionary.doc2bow([palabra])] for palabra in palabras]

distancias_lsa = np.zeros((len(palabras), len(palabras)))  # square matrix
for i, vector_i in enumerate(vectores_lsa):
    for j, vector_j in enumerate(vectores_lsa):
        distancias_lsa[i, j] = abs(1 - cossim(vector_i, vector_j))
print( distancias_lsa.shape )


In [0]:
# Plot the MDS and t-SNE projections of the LSA distance matrix.
visualize_embeddings(distancias_lsa,palabras,colores,perplexity=5)

# Pretrained Word-embedding

In [0]:
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

In [0]:
# Load the downloaded vectors (binary word2vec format) as a KeyedVectors object.
w2v_model2 = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

In [0]:
# Probe words for the pretrained vectors, grouped by theme.
p_robos_en = ["robberies", "weapons", "murder", "thieves", "robbery", "assault"]
p_ciencias_en = ["biology", "chemistry", "mathematics", "philosophy", "psychology",
                 "science", "engineering"]
p_tiempo_en = ["rainy", "sunny", "heat", "cloudy", "snow", "storm"]
p_paises_en = ["Switzerland", "Sweden", "France", "Netherlands", "Peru", "Bolivia",
               "Paraguay", "Uruguay", "Argentine", "Brazil", "Colombia"]
p_comida_en = ["bread", "noodles", "cookies", "cheese", "pizza", "beer", "wine"]
p_tecno_en = ["technology", "computer", "internet", "web", "hackers", "monitor", "mouse"]
p_hogar_en = ["kitchen", "bathroom", "dining_room", "armchairs", "wardrobe", "chairs",
              "tables", "tableware"]

grupos = [p_robos_en, p_ciencias_en, p_tiempo_en, p_paises_en, p_comida_en,
          p_tecno_en, p_hogar_en]
# One color per group, in the same order as `grupos`.
colores_por_grupo = ["black", "blue", "green", "red", "purple", "orange", "cyan"]

# Flatten the groups into parallel lists of words and label colors.
palabras = [palabra for grupo in grupos for palabra in grupo]
colores = [color for color, grupo in zip(colores_por_grupo, grupos) for _ in grupo]

In [0]:
# Build the pairwise distance matrix for the pretrained vectors: entry
# (i, j) is |1 - similarity| between word i and word j of `palabras`.
# `w2v_model2` is already a KeyedVectors object, so `similarity` is called on
# it directly; its `.wv` alias is deprecated in gensim 3.x and removed in
# gensim 4.
distancias_pre = np.zeros((len(palabras), len(palabras)))  # square matrix
for i, ti in enumerate(palabras):
    for j, tj in enumerate(palabras):
        distancias_pre[i, j] = abs(1 - w2v_model2.similarity(ti, tj))
print (distancias_pre.shape)

In [0]:
# Plot the MDS and t-SNE projections of the pretrained-vector distance matrix.
visualize_embeddings(distancias_pre,palabras,colores,perplexity=4)

In [0]:
# Compare how close "woman" and "man" are to a few concepts in the
# pretrained vectors.
# `w2v_model2` is already a KeyedVectors object, so `n_similarity` is called
# on it directly; its `.wv` alias is deprecated in gensim 3.x and removed in
# gensim 4.
print ("woman-kitchen similarity:",w2v_model2.n_similarity(["woman"], ["kitchen"]))
print ("man-kitchen similarity:",w2v_model2.n_similarity(["man"], ["kitchen"]) )
print ("\n")
print ("woman-wife similarity:",w2v_model2.n_similarity(["woman"], ["wife"]) )
print ("man-husband similarity:",w2v_model2.n_similarity(["man"], ["husband"]) )
print("\n")
print ("woman-children similarity:",w2v_model2.n_similarity(["woman"], ["children"]) )
print ("man-children similarity:",w2v_model2.n_similarity(["man"], ["children"]) )
print("\n")

# Ganando un poco de intuición

In [0]:
# Similarity of "bank" to "money" and to "park" in the pretrained vectors.
# `n_similarity` is called directly on the KeyedVectors object; the `.wv`
# alias is deprecated in gensim 3.x and removed in gensim 4.
print ("bank-money similarity:",w2v_model2.n_similarity(["bank"], ["money"]) )
print ("bank-park similarity:",w2v_model2.n_similarity(["bank"], ["park"]) )

In [0]:
# This cell and the eight after it each print the similarity between one
# pair of words in the pretrained vectors, followed by a newline.
print(w2v_model2.n_similarity(["Argentina"], ["park"]),'\n')

In [0]:
print(w2v_model2.n_similarity(["Argentina"], ["money"]),'\n')

In [0]:
print(w2v_model2.n_similarity(["run"], ["monkey"]),'\n')

In [0]:
print(w2v_model2.n_similarity(["wizard"], ["happy"]),'\n')

In [0]:
print(w2v_model2.n_similarity(["money"], ["wealthy"]),'\n')

In [0]:
print(w2v_model2.n_similarity(["money"], ["bankrupt"]),'\n')

In [0]:
print(w2v_model2.n_similarity(["good"], ["happy"]),'\n')

In [0]:
print(w2v_model2.n_similarity(["good"], ["bad"]),'\n')

In [0]:
print(w2v_model2.n_similarity(["black"], ["white"]),'\n')

# volviendo a los reviews

In [0]:
# Same "black" / "white" pair, this time in the model trained on the reviews.
w2v_model.wv.n_similarity(["black"],["white"])

In [0]:
# Similarity between "hand" and "orange" in the model trained on the reviews.
w2v_model.wv.n_similarity(["hand"],["orange"])