# Word Embeddings

## Let's train a simple Word2Vec model

In [75]:
!sudo apt update && sudo apt install -y gcc g++

Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Reading package lists... Done
Building dependency tree       
Reading state information... Done
56 packages can be upgraded. Run 'apt list --upgradable' to see them.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
g++ is already the newest version (4:7.4.0-1ubuntu2.3).
gcc is already the newest version (4:7.4.0-1ubuntu2.3).
0 upgraded, 0 newly installed, 0 to remove and 56 not upgraded.


In [76]:
!pip install gensim sklearn bs4 pandas matplotlib fasttext pandas



In [77]:
from IPython.display import display
import pandas

# Read the message text (second tab-separated field) of every record in data.txt.
# - The line terminator is stripped so it does not end up inside the text.
# - Blank lines are skipped instead of raising IndexError on the missing field.
# - The encoding is given explicitly rather than relying on the platform default
#   (assumes the file is UTF-8 -- TODO confirm).
lines = []
with open("data.txt", encoding="utf-8") as f:
    for raw_line in f:
        record = raw_line.rstrip("\r\n")
        if not record.strip():
            continue
        lines.append(record.split("\t")[1])
pandas.DataFrame(lines[:3], columns=["line"])

Unnamed: 0,line
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...\n
2,Free entry in 2 a wkly comp to win FA Cup fina...


Tokenize the lines

In [78]:
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords, preprocess_documents

# Turn every message into a list of normalised tokens with gensim's default
# preprocessing pipeline, then show the first three messages next to their tokens.
sentences = preprocess_documents(lines)
pandas.DataFrame({"line": lines[:3], "tokens": sentences[:3]})

Unnamed: 0,line,tokens
0,"Go until jurong point, crazy.. Available only ...","[jurong, point, crazi, avail, bugi, great, wor..."
1,Ok lar... Joking wif u oni...\n,"[lar, joke, wif, oni]"
2,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entri, wkly, comp, win, cup, final, tkt..."


Train the model

In [79]:
from gensim.models import Word2Vec

# Train Word2Vec on the preprocessed token lists; min_count=2 drops tokens
# that occur only once in the corpus.
# workers=1 removes the thread-scheduling jitter of gensim's default
# multi-threaded training so repeated runs give the same vectors; seed=1 is
# gensim's default, spelled out here for clarity. Exact reproducibility across
# interpreter launches additionally needs a fixed PYTHONHASHSEED.
w2v_model = Word2Vec(sentences=sentences, min_count=2, seed=1, workers=1)

What does the vocabulary look like?

In [80]:
# Peek at the first five entries of the trained model's vocabulary.
pandas.DataFrame({"term": w2v_model.wv.index_to_key[:5]})

Unnamed: 0,term
0,come
1,dai
2,free
3,know
4,love


Let's see some similarities

In [81]:
from gensim.parsing.preprocessing import preprocess_string

# Preprocess the query string into tokens before looking up its ten nearest
# neighbours in the embedding space.
query_tokens = preprocess_string("nokia")
w2v_neighbours = w2v_model.wv.most_similar(query_tokens, topn=10)
pandas.DataFrame(w2v_neighbours, columns=["term", "similarity"])

Unnamed: 0,term,similarity
0,txt,0.999484
1,week,0.999467
2,mobil,0.999453
3,free,0.999399
4,tone,0.999377
5,repli,0.999301
6,www,0.999225
7,text,0.999203
8,com,0.999185
9,msg,0.99916


Word2Vec cannot handle unknown (out-of-vocabulary) words:

In [82]:
# Demonstrate the failure mode: looking up a word the model never saw raises
# KeyError, which is caught and shown instead of aborting the notebook.
try:
    w2v_model.wv.similar_by_word("blubbergurken")
except KeyError as missing_key:
    display(missing_key)

KeyError("Key 'blubbergurken' not present")

## fastText

In [None]:
from gensim.models import FastText
from gensim.utils import tokenize
from gensim.parsing.preprocessing import preprocess_string

# workers=1 removes the thread-scheduling jitter of gensim's default
# multi-threaded training so repeated runs give the same vectors; seed=1 is
# gensim's default, spelled out here for clarity. Single-threaded training is
# slower, which is acceptable for a corpus of this size.
ft_model = FastText(seed=1, workers=1)

# Lower-cased, accent-stripped tokens for every message.
corpus = [list(tokenize(line, lowercase=True, deacc=True)) for line in lines]

# Build the vocabulary first, then train for 100 passes over the corpus.
ft_model.build_vocab(corpus_iterable=corpus)
ft_model.train(corpus_iterable=corpus, total_examples=len(corpus), epochs=100)

In [None]:
# Ten nearest neighbours of "nokia" according to the fastText model.
ft_neighbours = ft_model.wv.most_similar("nokia", topn=10)
pandas.DataFrame(ft_neighbours, columns=["term", "similarity"])

## fastText can also be used for classification

In [None]:
from itertools import islice

# Show the first five raw records of the training file.
# - Stored under its own name so the `lines` corpus built earlier in the
#   notebook is not overwritten (re-running earlier cells stays safe).
# - islice stops cleanly if the file has fewer than five lines, where
#   repeated next(f) would raise StopIteration.
# - Encoding is explicit (assumes the file is UTF-8 -- TODO confirm).
with open("data.txt", encoding="utf-8") as f:
    raw_preview_lines = list(islice(f, 5))
pandas.DataFrame(raw_preview_lines, columns=["line"])

In [None]:
import fasttext

# Fit a supervised fastText classifier straight from the labelled text file.
model = fasttext.train_supervised(input="data.txt")

In [None]:
# Evaluate the classifier on a labelled file.
# NOTE(review): this is the same file the classifier was trained on, so the
# reported figures are training scores, not a held-out estimate.
model.test(path="data.txt")

In [None]:
# Classify one unseen message and tabulate the first label with its score.
prediction = model.predict("Congratulations YOU'VE Won. You're a Winner in our August £1000 Prize Draw. Call 09066660100 NOW. Prize Code 2309.")
top_entries = [field[0] for field in prediction]  # first element of each returned sequence
pandas.DataFrame([top_entries], columns=["label", "confidence"])

## A more complex model

Gensim provides a lot of pretrained models

In [None]:
import gensim.downloader

# List the names of all pretrained models offered by the gensim downloader.
pretrained_catalogue = gensim.downloader.info()['models']
pandas.DataFrame(list(pretrained_catalogue), columns=["model"])

In [None]:
# Fetch (downloading on first use) the 'glove-wiki-gigaword-50' pretrained vectors.
wiki_model = gensim.downloader.load(name='glove-wiki-gigaword-50')

In [None]:
# Nearest neighbours of 'twitter' in the pretrained embedding space.
twitter_neighbours = wiki_model.most_similar('twitter')
pandas.DataFrame(twitter_neighbours, columns=["term", "similarity"])

In [None]:
# Vector arithmetic analogy: 'king' - 'man' + 'woman'.
analogy_hits = wiki_model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
)
pandas.DataFrame(analogy_hits, columns=["term", "similarity"])

In [None]:
# Ask the model which of the four words fits least with the others.
wiki_model.doesnt_match(["breakfast", "cereal", "dinner", "lunch"])

In [None]:
# Similarity score between the vectors of 'woman' and 'man'.
wiki_model.similarity(w1='woman', w2='man')

In [None]:
# Nearest neighbours of "cat" in the pretrained embedding space.
cat_neighbours = wiki_model.similar_by_word("cat")
pandas.DataFrame(cat_neighbours, columns=["term", "similarity"])

In [None]:
from gensim.parsing.preprocessing import preprocess_string

# For each seed word, collect the terms of its five nearest neighbours
# (dropping the similarity scores), then show one row per seed word.
words = ["machine", "learning", "information", "retrieval", "computer", "science"]
semantically_similar_words = {}
for word in words:
    nearest = wiki_model.most_similar([word], topn=5)
    semantically_similar_words[word] = [entry[0] for entry in nearest]
pandas.DataFrame(
    [[term, similars] for term, similars in semantically_similar_words.items()],
    columns=["term", "similar terms"],
)

In [None]:
from sklearn.decomposition import PCA

# Flatten the mapping into one list: each seed word followed by its neighbours.
# A single comprehension replaces sum(list_of_lists, []), which copies the
# accumulated list on every step; the resulting list and order are unchanged.
all_similar_words = [
    term
    for seed_word, neighbours in semantically_similar_words.items()
    for term in [seed_word, *neighbours]
]

pandas.DataFrame(all_similar_words, columns=["term"])

Project the embedding vectors onto a 2D space

In [None]:
import matplotlib.pyplot as plt

# Look up the embedding vector of every collected word.
word_vectors = wiki_model[all_similar_words]

# Reduce the embeddings to their first two principal components for plotting.
pca = PCA(n_components=2)
p_comps = pca.fit_transform(word_vectors)
word_names = all_similar_words

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(18, 10))
ax.scatter(p_comps[:, 0], p_comps[:, 1], c='red')
ax.set(
    title="Word embeddings projected onto two principal components",
    xlabel="Principal component 1",
    ylabel="Principal component 2",
)

# Label every point. The loop variable has its own name so that the
# `word_names` list is not overwritten by its last element after the loop.
for word_name, x, y in zip(word_names, p_comps[:, 0], p_comps[:, 1]):
    ax.annotate(word_name, xy=(x+0.06, y+0.03), xytext=(0, 0), textcoords='offset points')