 **Représentations vectorielles denses (les plongements de mots)**

 

Nous allons voir comment représenter un document avec des embeddings pré-entraînés. Nous utilisons pour cela les représentations Word2Vec telles qu'implémentées dans spaCy puis dans la librairie Gensim.


In [24]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pprint
import re
from gensim.models import word2vec
from sklearn.manifold import TSNE

In [9]:
import gensim
import gensim.downloader
 
# Print the name of every pre-trained model listed by gensim.downloader.
for model_name in gensim.downloader.info()['models']:
    print(model_name)

# Pre-trained Google News vectors: 3 million word vectors of 300 dimensions.
google_news_vectors = gensim.downloader.load('word2vec-google-news-300')


fasttext-wiki-news-subwords-300
conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200
__testing_word2vec-matrix-synopsis


In [None]:
# Look up the vector stored for the token 'paris'; as the last expression of
# the notebook cell, its value is what the cell displays.
google_news_vectors['paris']

**Les similarités**

In [None]:
# Analogy query: "Paris" and "Britain" are the added terms, "France" the
# subtracted one. Passing topn=1 would keep only the best match.
capital = google_news_vectors.most_similar(
    positive=["Paris", "Britain"],
    negative=["France"],
)
print(capital)

In [None]:
# Nearest neighbours of the single query word 'intelligence'.
sim_words = google_news_vectors.most_similar(positive=['intelligence'])
print(sim_words)

In [None]:
# Query once with 'girl' as a positive term and once as a negative term,
# then print both result lists, one per line.
most_similar = google_news_vectors.most_similar(positive=['girl'])
less_similar = google_news_vectors.most_similar(negative=['girl'])

print(most_similar, less_similar, sep='\n')

**Affichage des vecteurs en 2D grâce à TSNE**




In [None]:
def affichage_similarite_2D(model, word):
    """Plot `word` and its nearest neighbours in 2D using t-SNE.

    Args:
        model: Either a gensim ``KeyedVectors`` object (e.g. vectors loaded
            with ``gensim.downloader``) or a trained ``Word2Vec`` model,
            whose vectors are read from its ``wv`` attribute.
        word: Query word; it must be present in the vocabulary.

    The function shows a matplotlib scatter plot and returns nothing.
    """
    # A Word2Vec model exposes its vectors as `.wv`; a KeyedVectors object
    # is used directly. This keeps both call sites of the notebook working.
    vectors = getattr(model, 'wv', model)

    # similar_by_word yields (word, score) pairs; only the words are needed.
    word_labels = [word]
    word_labels.extend(hit[0] for hit in vectors.similar_by_word(word))

    # One row per label, query word first. The dimensionality comes from the
    # vectors themselves instead of being hard-coded to 300.
    arr = np.array([vectors[label] for label in word_labels], dtype='f')

    # t-SNE requires perplexity < number of samples; the default (30) is too
    # large for the handful of points plotted here, so cap it.
    tsne = TSNE(
        n_components=2,
        random_state=0,
        perplexity=min(30.0, len(word_labels) - 1),
    )
    np.set_printoptions(suppress=True)
    coords = tsne.fit_transform(arr)

    x_coords = coords[:, 0]
    y_coords = coords[:, 1]

    # Scatter plot with each point annotated by its word.
    plt.scatter(x_coords, y_coords)
    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    # Pad the axes on both sides so the extreme points are not clipped
    # (the lower bounds used to be shifted inward instead of outward).
    margin = 0.00005
    plt.xlim(x_coords.min() - margin, x_coords.max() + margin)
    plt.ylim(y_coords.min() - margin, y_coords.max() + margin)
    plt.show()

 

# Plot 'girl' and its closest words using the pre-trained Google News vectors.
affichage_similarite_2D(google_news_vectors, 'girl')



**Les analogies (cf. slides 55 à 59)**

In [None]:
def pp(obj):
    """Print `obj` rendered as a pandas DataFrame."""
    frame = pd.DataFrame(obj)
    print(frame)
    
def analogy(worda, wordb, wordc):
    """Return the top-ranked word for the query ``wordb - worda + wordc``.

    The query runs against the module-level ``google_news_vectors``, with
    `wordb` and `wordc` as positive terms and `worda` as the negative term.
    """
    ranked = google_news_vectors.most_similar(
        positive=[wordb, wordc],
        negative=[worda],
    )
    top_hit = ranked[0]
    return top_hit[0]

countries = ['australia', 'canada', 'germany', 'ireland', 'italy']

# For each country, evaluate "hamburger - us + country", i.e.
# "us is to hamburger as <country> is to ?".
foods = [analogy('us', 'hamburger', place) for place in countries]
pp(list(zip(countries, foods)))




In [None]:
#on plot
from sklearn.decomposition import PCA


def plot_data(orig_data, labels):
    """Project vectors onto two PCA components and plot them with labels.

    Every projected point is drawn as a dot and annotated with the label at
    the same index. The points are then treated as two halves: for each
    index ``i`` in the first half, an arrow annotation is added with
    ``xy`` at point ``i`` and ``xytext`` at point ``i + len(data) // 2``.

    Args:
        orig_data: Vectors to project, one per label.
        labels: Text labels, indexed like `orig_data`.
    """
    projected = PCA(n_components=2).fit_transform(orig_data)

    plt.figure(figsize=(7, 5), dpi=100)
    plt.plot(projected[:, 0], projected[:, 1], '.')

    for idx, point in enumerate(projected):
        plt.annotate(labels[idx], xy=point)

    half = len(projected) // 2
    # zip stops after `half` pairs, so a trailing unpaired point gets no arrow.
    for first, second in zip(projected[:half], projected[half:]):
        plt.annotate(
            "",
            xy=first,
            xytext=second,
            arrowprops=dict(arrowstyle="->", connectionstyle="arc3"),
        )
       
# Countries first, then their foods: plot_data links item i of the first
# half with item i of the second half.
labels = [*countries, *foods]
data = [google_news_vectors[token] for token in labels]
plot_data(data, labels)

In [None]:
#try other biased analogies....
#try with the analogies of the form a is to b as c is to d


**Création de vecteurs à partir d'un corpus de textes**

In [30]:
from sklearn.datasets import fetch_20newsgroups
import re


# Fetch the 'train' subset of the 20 Newsgroups corpus; the raw posts are
# read from `train.data` further down.
train = fetch_20newsgroups(subset='train')



In [None]:
def clean(text):
    """Split a raw post into sentences of lower-case, letters-only tokens.

    Newlines are flattened to spaces, everything up to and including the
    last ``Lines: <number>`` header is removed, and the remainder is split
    into sentences wherever ``?``, ``!``, ``.`` or ``:`` is followed by
    whitespace. Within a sentence every non-letter character acts as a
    separator.

    Args:
        text: Raw message text, headers included.

    Returns:
        A list with one list of lower-case tokens per sentence. A sentence
        that contains no letters yields an empty list.
    """
    flat = text.replace('\n', ' ')
    # Raw strings keep `\d` and `\s` as regex escapes rather than (invalid)
    # Python string escapes.
    body = re.sub(r'^.*Lines: \d+', '', flat)
    lines = re.split(r'[?!.:]\s', body)
    return [re.sub(r'[^a-zA-Z]', ' ', line).lower().split() for line in lines]

# Flatten the corpus into one token list per sentence, across all posts.
sentences = [line for text in train.data for line in clean(text)]

print('raw data:\n\n', train.data[0])
print('example input:\n', sentences[0])

# Only the `word2vec` module is imported in this notebook, so the class is
# reached through it (a bare `Word2Vec` raises NameError).
# See the Word2Vec documentation for the meaning of each parameter.
w2v_params = dict(workers=4, min_count=50, window=10, sample=1e-3)
try:
    # gensim >= 4.0 calls the embedding dimensionality `vector_size`.
    model = word2vec.Word2Vec(sentences, vector_size=300, **w2v_params)
except TypeError:
    # Older gensim releases name the same parameter `size`.
    model = word2vec.Word2Vec(sentences, size=300, **w2v_params)

affichage_similarite_2D(model, 'sport')

