# Lab 05

Student: John Wu

In [None]:
import nltk, sys, gensim, unicodedata
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

## Analysis of small.txt file

The file contains sentences delimited by line breaks. However, each sentence needs to be tokenized before it can be processed. To do this, we use the [Tok-Tok](https://github.com/jonsafari/tok-tok) tokenizer, a fast (though simple) sentence tokenizer that works on different Romance languages.

Since we need to remove punctuation as well, a function is implemented that checks whether a token has length 1 and, if so, whether its [unicode category](https://unicodebook.readthedocs.io/unicode.html#categories) is punctuation or symbol (major class `P` or `S`).

In [None]:
punctCatgs = {'P', 'S'}  # Unicode major classes: Punctuation and Symbol
def isPunct(tkn):
    """Return True when ``tkn`` is a single punctuation or symbol character.

    Only tokens of exactly one character are considered; anything longer
    (or empty) is reported as non-punctuation without further inspection.
    """
    if len(tkn) != 1:
        return False
    # The first letter of the Unicode category is its major class.
    majorClass = unicodedata.category(tkn)[0]
    return majorClass in punctCatgs

# Instantiate the Tok-Tok tokenizer via its public home in nltk.tokenize;
# the bare `nltk.toktok` attribute only exists as a side effect of NLTK's
# internal star-imports.
toktok = nltk.tokenize.ToktokTokenizer()

def tokSentNoPunc(txt):
    """Tokenize one sentence with Tok-Tok and drop punctuation tokens.

    Returns the list of tokens for which ``isPunct`` is false, in their
    original order.
    """
    rawTokens = toktok.tokenize(txt)
    return [tok for tok in rawTokens if not isPunct(tok)]

The file is read as UTF-8 text since it contains non-ASCII punctuation and letters. The code below reads the file and prints a sample of the tokenized result.

In [None]:
def readFileAndTokenize(filePath):
    """Read a UTF-8 text file and tokenize it line by line.

    Every line is case-folded and then passed through ``tokSentNoPunc``.

    Returns a list holding one token list per line of the file, in order.
    """
    with open(filePath, encoding='utf-8') as src:
        foldedLines = map(str.casefold, src)
        # Build the list inside the ``with`` so the file is still open
        # while the lazy maps are consumed.
        return list(map(tokSentNoPunc, foldedLines))

# Tokenize the small corpus: one token list per line of the file.
sents = readFileAndTokenize('./data/small.txt')
# Show four tokenized sentences (indices 100-103) as a sanity check.
print(sents[100:104])

__Build a word2vec model__

Using the gensim library, a 100-dimensional word2vec model is built and the word embeddings are calculated.

In [None]:
# Train word2vec on the tokenized sentences: 100-dimensional vectors, context
# window of 5, words seen fewer than 2 times dropped, skip-gram (sg=1) with
# 8 negative samples.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` -- confirm the installed version before re-running.
mdl = Word2Vec(sents, size=100, window=5, min_count=2, sg=1, negative=8)

To create a visualization of embedding vectors, we get the first two principal components of several terms.

In [None]:
# Probe words: cities, countries, animals, vehicles and numbers.
wrds = ['paris', 'istanbul', 'moscow', 'france', 'turkey', 'russia', 'cat', 
        'dog', 'truck', 'train', 'two', 'three', 'four']
# One embedding per row, then transposed so that each word is a column.
v = np.vstack([mdl.wv[s] for s in wrds]).T # stack vectors from all words
pc2 = PCA(n_components=2) # model for PC1 and PC2
# NOTE(review): because of the transpose, PCA treats the embedding dimensions
# as samples and the words as features, so components_ holds one loading per
# word. The more common approach is fit_transform on the untransposed
# matrix -- confirm which is intended.
pc2.fit(v); # get the PCs (trailing ';' suppresses the notebook's output echo)

We plot PC1 on the x-axis and PC2 on the y-axis, along with a text annotation for each data point.

In [None]:
# Scatter the per-word values: first component on x, second component on y.
plt.scatter(pc2.components_[0], pc2.components_[1])
# Label each point; column n of components_ corresponds to wrds[n].
for n,w in enumerate(wrds):
    plt.text(pc2.components_[0,n], pc2.components_[1,n], w)
plt.show()

___PUT IN ANALYSIS HERE___

## Google News Pre-trained Embeddings

In [None]:
# Location of the pre-trained Google News vectors (word2vec binary file).
googFile = './data/GoogleNews-vectors-negative300.bin'
# Load the vectors only (no trainable model); binary=True matches the .bin file.
googMdl = KeyedVectors.load_word2vec_format(googFile, binary=True)

In [None]:
# Nearest neighbours of 'fascinating' (gensim returns the top 10 by default).
googMdl.most_similar('fascinating')

In [None]:
# Nearest neighbours of 'cultivate' (gensim returns the top 10 by default).
googMdl.most_similar('cultivate')

In [None]:
# Cosine distance from 'Vietnam' to each listed country (smaller = closer).
googMdl.distances('Vietnam', ['Spain', 'China', 'Egypt'])

In [None]:
# Cosine distance from 'mother' to each listed word (smaller = closer).
googMdl.distances('mother', ['father', 'teacher', 'ocean'])

In [None]:
# Analogy dog : puppies :: cat : ?  -- positives ['puppies', 'cat'],
# negative ['dog'], multiplicative (CosMul) objective, single best answer.
googMdl.most_similar_cosmul(['puppies', 'cat'], ['dog'], 1)

In [None]:
# Analogy book : read :: music : ?  -- positives ['read', 'music'],
# negative ['book'], multiplicative (CosMul) objective, single best answer.
googMdl.most_similar_cosmul(['read', 'music'], ['book'], 1)

In [None]:
# Analogy summer : hot :: winter : ?  -- positives ['hot', 'winter'],
# negative ['summer'], multiplicative (CosMul) objective, single best answer.
googMdl.most_similar_cosmul(['hot', 'winter'], ['summer'], 1)

In [None]:
# Cosine distance from 'small' to a synonym, an antonym and an unrelated word.
googMdl.distances('small', ['tiny', 'large', 'ice'])

In [None]:
# Drop the reference to the Google News vectors so their memory can be
# reclaimed before the next models are trained.
del googMdl

## Word Embedding for Translations

Reading in text files for both languages

In [None]:
# Tokenize the English and Spanish corpora (one token list per line each).
engSents = readFileAndTokenize('./data/eng.txt')
spaSents = readFileAndTokenize('./data/spa.txt')

Training models and saving results

In [None]:
# Train one skip-gram model per language with identical hyperparameters
# (100 dimensions, window 5, min_count 2, 8 negative samples).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` -- confirm the installed version before re-running.
engModel = Word2Vec(engSents, size=100, window=5, min_count=2, sg=1, negative=8)
spaModel = Word2Vec(spaSents, size=100, window=5, min_count=2, sg=1, negative=8)

In [None]:
# Write only the word vectors, in word2vec text format (binary defaults to
# False); these files are the inputs to the vecmap step that follows.
engModel.wv.save_word2vec_format('eng.w2v.model')
spaModel.wv.save_word2vec_format('spa.w2v.model')

Run vecmap code to learn bilingual projections

In [None]:
# Supervised vecmap run: arguments are the training dictionary, the source
# (Spanish) and target (English) embeddings, then the two mapped output files.
%run ./vecmap/map_embeddings.py --supervised data/es-en.train.txt spa.w2v.model eng.w2v.model \
    spa_mapped.emb eng_mapped.emb 

Load the bilingual embeddings

In [None]:
# Naming note: `spa2eng` holds the mapped *Spanish* vectors and `eng2spa`
# holds the mapped *English* vectors, both loaded from the vecmap outputs.
spa2eng = gensim.models.KeyedVectors.load_word2vec_format('spa_mapped.emb')
eng2spa = gensim.models.KeyedVectors.load_word2vec_format('eng_mapped.emb') 

Test out the various translations

In [None]:
def printSpaToEng(words):
    """Print the nearest English neighbour of each Spanish word.

    Each Spanish word is looked up in the mapped Spanish embeddings
    (``spa2eng``); its vector is then matched against the mapped English
    embeddings (``eng2spa``), and the closest English entry is printed
    together with its similarity score.

    Args:
        words: a single Spanish word, or an iterable of Spanish words.
    """
    if isinstance(words, str):
        words = [words]  # accept a bare string as a one-item list
    for w in words:
        try:
            vec = spa2eng[w]
        except KeyError:
            # A word without a vector (e.g. dropped by min_count during
            # training) should not abort the rest of the list.
            print("Sp: %s = En: <not in Spanish vocabulary>" % w)
            continue
        # Only the best match is used, so request a single neighbour.
        trans = eng2spa.similar_by_vector(vec, topn=1)[0]
        print("Sp: %s = En: %s, (%f)" % (w, trans[0], trans[1]))

In [None]:
# Mixed everyday vocabulary (mostly nouns, plus a couple of verb forms).
l = ['playa', 'villa', 'perros', 'naufragio', 'islas', 'cantar', 
     'calles', 'naranjas', 'bomberos', 'escalera', 'nadó','frontera',
     'pasaporte', 'fábrica']
printSpaToEng(l)

In [None]:
# Inflected forms of the verb "jugar" (to play), to see how conjugations map.
l2 = ['jugar', 'juego', 'juegas', 'juega', 'jugamos', 'juegan']
printSpaToEng(l2)

In [None]:
# Nationality / demonym adjectives.
l3 = ['europeo', 'español', 'cubano', 'ecuatoriano', 'francés', 'alemán', 
      'chino', 'japonés', 'americano', 'estadounidense',  'egipcio',
      'turco', 'nigeriano']
printSpaToEng(l3)