In [1]:
import pdb
import pickle
import string
import sys
import time

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import scipy
import sklearn
from gensim.models import KeyedVectors
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer

sys.path.append('..')
from utils import cosine_similarity, get_dict, process_tweet

# 1. The word embeddings data for English and French words
Write a program that translates English to French.

- Eng embeddings: https://code.google.com/archive/p/word2vec/ (GoogleNews-vectors-negative300.bin.gz)
- Fr embeddings:
```
curl -o ./wiki.multi.fr.vec https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec
```

In [2]:
DATA = '../../../../data'

In [3]:
en_embeddings_subset = pickle.load(
    open(f'{DATA}/en_embeddings.pkl', 'rb'))
fr_embeddings_subset = pickle.load(
    open(f'{DATA}/fr_embeddings.pkl', 'rb'))

In [4]:
type(en_embeddings_subset)

dict

In [5]:
print(len(en_embeddings_subset['the']))
en_embeddings_subset['the'][:10]

300


array([ 0.08007812,  0.10498047,  0.04980469,  0.0534668 , -0.06738281,
       -0.12060547,  0.03515625, -0.11865234,  0.04394531,  0.03015137],
      dtype=float32)

In [6]:
# loading the english to french dictionaries
en_fr_train = get_dict(f'{DATA}/en-fr.train.txt')
print('The length of the English to French training dictionary is', 
      len(en_fr_train))
en_fr_test = get_dict(f'{DATA}/en-fr.test.txt')
print('The length of the English to French test dictionary is', 
      len(en_fr_train))

The length of the English to French training dictionary is 5000
The length of the English to French test dictionary is 5000


In [7]:
type(en_fr_train)

dict

In [8]:
en_fr_train['dog']

'chienne'

## 1.1 Generate embedding and transform matrices
#### Exercise 01: Translating English dictionary to French by using embeddings

implement a function `get_matrices`, which takes the loaded data
and returns matrices `X` and `Y`.

Inputs:
- `en_fr` : English to French dictionary
- `en_embeddings` : English to embeddings dictionary
- `fr_embeddings` : French to embeddings dictionary

Returns:
- Matrix `X` and matrix `Y`, where each row in X is the word embedding for an
english word, and the same row in Y is the word embedding for the French
version of that English word.

Use the `en_fr` dictionary to ensure that the ith row in the `X` matrix
corresponds to the ith row in the `Y` matrix.

In [9]:
def get_matrices(en_fr, french_vecs, english_vecs):
    """
    Input:
        en_fr: English to French dictionary
        french_vecs: French words to their corresponding word embeddings.
        english_vecs: English words to their corresponding word 
          embeddings.
    Output: 
        X: a matrix where the columns are the English embeddings.
        Y: a matrix where the columns correspong to the French embeddings.
        R: the projection matrix that minimizes the F norm ||X R -Y||^2.
    """
    X = []
    Y = []
    english_set = set(english_vecs.keys())
    french_set = set(french_vecs.keys())
    french_words = set(en_fr.values())
    for en_word, fr_word in en_fr.items():
        if fr_word in french_set and en_word in english_set:
            en_vec = english_vecs[en_word]
            fr_vec = french_vecs[fr_word]
            X.append(en_vec)
            Y.append(fr_vec)
    X = np.array(X)
    Y = np.array(Y)
    return X, Y

In [10]:
X_train, Y_train = get_matrices(
    en_fr_train, fr_embeddings_subset, en_embeddings_subset)

# 2. Translations

## 2.1 Translation as linear transformation of embeddings

Given dictionaries of English and French word embeddings, create a transformation matrix `R`
* Given an English word embedding, **e**, you can multiply **eR** to get a new word embedding **f**.
    * Both **e** and **f** are [row vectors](https://en.wikipedia.org/wiki/Row_and_column_vectors).
* You can then compute the nearest neighbors to `f` in the french embeddings and recommend the word that is most similar to the transformed word embedding.

### Describing translation as the minimization problem

Find a matrix `R` that minimizes the following equation. 

$$\arg \min _{\mathbf{R}}\| \mathbf{X R} - \mathbf{Y}\|_{F}\tag{1} $$

### Frobenius norm

The Frobenius norm of a matrix $A$ (assuming it is of dimension $m,n$) is defined as the square root of the sum of the absolute squares of its elements:

$$\|\mathbf{A}\|_{F} \equiv \sqrt{\sum_{i=1}^{m} \sum_{j=1}^{n}\left|a_{i j}\right|^{2}}\tag{2}$$

### Actual loss function
In the real world applications, the Frobenius norm loss:

$$\| \mathbf{XR} - \mathbf{Y}\|_{F}$$

is often replaced by it's squared value divided by $m$:

$$ \frac{1}{m} \|  \mathbf{X R} - \mathbf{Y} \|_{F}^{2}$$

where $m$ is the number of examples (rows in $\mathbf{X}$).

* The same R is found when using this loss function versus the original Frobenius norm.
* The reason for taking the square is that it's easier to compute the gradient of the squared Frobenius.
* The reason for dividing by $m$ is that we're more interested in the average loss per embedding than the  loss for the entire training set.
    * The loss for all training set increases with more words (training examples),
    so taking the average helps us to track the average loss regardless of the size of the training set.

#### Step 1: Computing the loss
* The loss function will be squared Frobenoius norm of the difference between
matrix and its approximation, divided by the number of training examples $m$.
* Its formula is:
$$ L(X, Y, R)=\frac{1}{m}\sum_{i=1}^{m} \sum_{j=1}^{n}\left( a_{i j} \right)^{2}$$

where $a_{i j}$ is value in $i$th row and $j$th column of the matrix $\mathbf{XR}-\mathbf{Y}$.

In [11]:
def compute_loss(X, Y, R):
    '''
    Inputs: 
        X: a matrix of dimension (m, n) where the columns are the English 
          embeddings.
        Y: a matrix of dimension (m, n) where the columns corresponding to 
          the French embeddings.
        R: a matrix of dimension (n, n) - transformation matrix from 
          English to French vector space embeddings.
    Outputs:
        L: a matrix of dimension (m, n) - the value of the loss function 
          for given X, Y and R.
    '''
    m = X.shape[0]
    err = X @ R - Y
    loss = (err ** 2).sum() / m
    return loss

In [12]:
X = np.array([[0, 1, 2],
              [1, 2, 3],
              [2, 3, 4],
              [3, 4, 5]])
Y = np.array([[0, 1, 3],
              [1, 3, 5],
              [2, 3, 5],
              [3, 5, 7]])
R = np.array([[1, 0, 0],
              [0, 1, 0],
              [0, 0, 1]])

In [13]:
compute_loss(X, Y, R)

3.0

In [14]:
compute_loss(X, X, R)

0.0

### Step 2: Computing the gradient of loss wrt transform matrix R

* Calculate the gradient of the loss with respect to transform matrix `R`.
* The gradient is a matrix that encodes how much a small change in `R`
affect the change in the loss function.
* The gradient gives us the direction in which we should decrease `R`
to minimize the loss.
* $m$ is the number of training examples (number of rows in $X$).
* The formula for the gradient of the loss function $𝐿(𝑋,𝑌,𝑅)$ is:

$$\frac{d}{dR}𝐿(𝑋,𝑌,𝑅)=\frac{d}{dR}\Big(\frac{1}{m}\| X R -Y\|_{F}^{2}\Big) = \frac{2}{m}X^{T} (X R - Y)$$

In [15]:
def compute_gradient(X, Y, R):
    '''
    Inputs: 
        X: a matrix of dimension (m, n) where the columns are the English 
          embeddings.
        Y: a matrix of dimension (m, n) where the columns corresponding to
          the French embeddings.
        R: a matrix of dimension (n, n) - transformation matrix from 
          English to French vector space embeddings.
    Outputs:
        g: a matrix of dimension (n, n) - gradient of the loss function L
          for given X, Y and R.
    '''
    m = X.shape[0]
    gradient = (2/m) * X.T @ (X @ R - Y)
    return gradient

In [16]:
compute_gradient(X, Y, R)

array([[  0.,  -2.,  -5.],
       [  0.,  -3.,  -8.],
       [  0.,  -4., -11.]])

In [17]:
compute_gradient(X, X, R)

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

### Step 3: Finding the optimal R with gradient descent algorithm

#### Gradient descent

[Gradient descent](https://ml-cheatsheet.readthedocs.io/en/latest/gradient_descent.html) is an iterative algorithm which is used in searching for the optimum of the function. 
* Earlier, we've mentioned that the gradient of the loss with respect to the matrix encodes how much a tiny change in some coordinate of that matrix affect the change of loss function.
* Gradient descent uses that information to iteratively change matrix `R` until we reach a point where the loss is minimized. 

Pseudocode:
1. Calculate gradient $g$ of the loss with respect to the matrix $R$.
2. Update $R$ with the formula:
$$R_{\text{new}}= R_{\text{old}}-\alpha g$$

Where $\alpha$ is the learning rate, which is a scalar.

In [18]:
def align_embeddings(X, Y, train_steps=100, learning_rate=0.0003):
    '''
    Inputs:
        X: a matrix of dimension (m, n) where the columns are the English 
          embeddings.
        Y: a matrix of dimension (m, n) where the columns corresponding to
          the French embeddings.
        train_steps: positive int - describes how many steps will gradient
          descent algorithm do.
        learning_rate: positive float - describes how big steps will
          gradient descent algorithm do.
    Outputs:
        R: a matrix of dimension (n, n) - the projection matrix that 
          minimizes the F norm ||XR - Y||^2
    '''
    #np.random.seed(129)

    # the number of columns in X is the number of dimensions for a word vector (e.g. 300)
    # R is a square matrix with length equal to the number of dimensions in th  word embedding
    R = np.random.rand(X.shape[1], X.shape[1])

    for i in range(train_steps):
        if i % 25 == 0:
            print(
                f'loss at iteration {i} is: {compute_loss(X, Y, R):.4f}')
        ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
        # use the function that you defined to compute the gradient
        gradient = compute_gradient(X, Y, R)

        # update R by subtracting the learning rate times gradient
        R -= (learning_rate * gradient)
        ### END CODE HERE ###
    return R

In [19]:
R_train = align_embeddings(
    X_train, Y_train, train_steps=1000, learning_rate=0.8)

loss at iteration 0 is: 962.9849
loss at iteration 25 is: 97.8230
loss at iteration 50 is: 26.8209
loss at iteration 75 is: 9.7898
loss at iteration 100 is: 4.3806
loss at iteration 125 is: 2.3311
loss at iteration 150 is: 1.4506
loss at iteration 175 is: 1.0359
loss at iteration 200 is: 0.8268
loss at iteration 225 is: 0.7157
loss at iteration 250 is: 0.6543
loss at iteration 275 is: 0.6192
loss at iteration 300 is: 0.5986
loss at iteration 325 is: 0.5861
loss at iteration 350 is: 0.5785
loss at iteration 375 is: 0.5737
loss at iteration 400 is: 0.5706
loss at iteration 425 is: 0.5687
loss at iteration 450 is: 0.5674
loss at iteration 475 is: 0.5665
loss at iteration 500 is: 0.5659
loss at iteration 525 is: 0.5656
loss at iteration 550 is: 0.5653
loss at iteration 575 is: 0.5651
loss at iteration 600 is: 0.5650
loss at iteration 625 is: 0.5649
loss at iteration 650 is: 0.5648
loss at iteration 675 is: 0.5648
loss at iteration 700 is: 0.5648
loss at iteration 725 is: 0.5647
loss at ite

## 2.2 Testing the translation

### k-Nearest neighbors algorithm

[k-Nearest neighbors algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) 
* k-NN is a method which takes a vector as input and finds the other vectors in the dataset that are closest to it. 
* The 'k' is the number of "nearest neighbors" to find (e.g. k=2 finds the closest two neighbors).

### Searching for the translation embedding
Since we're approximating the translation function from English to French embeddings by a linear transformation matrix $\mathbf{R}$, most of the time we won't get the exact embedding of a French word when we transform embedding $\mathbf{e}$ of some particular English word into the French embedding space. 
* This is where $k$-NN becomes really useful! By using $1$-NN with $\mathbf{eR}$ as input, we can search for an embedding $\mathbf{f}$ (as a row) in the matrix $\mathbf{Y}$ which is the closest to the transformed vector $\mathbf{eR}$

### Cosine similarity
Cosine similarity between vectors $u$ and $v$ calculated as the cosine of the angle between them.
The formula is 

$$\cos(u,v)=\frac{u\cdot v}{\left\|u\right\|\left\|v\right\|}$$

* $\cos(u,v)$ = $1$ when $u$ and $v$ lie on the same line and have the same direction.
* $\cos(u,v)$ is $-1$ when they have exactly opposite directions.
* $\cos(u,v)$ is $0$ when the vectors are orthogonal (perpendicular) to each other.

#### Note: Distance and similarity are pretty much opposite things.
* We can obtain distance metric from cosine similarity, but the cosine similarity can't be used directly as the distance metric. 
* When the cosine similarity increases (towards $1$), the "distance" between the two vectors decreases (towards $0$). 
* We can define the cosine distance between $u$ and $v$ as
$$d_{\text{cos}}(u,v)=1-\cos(u,v)$$

Inputs:
* Vector `v`,
* A set of possible nearest neighbors `candidates`
* `k` nearest neighbors to find.
* The distance metric should be based on cosine similarity.
* `cosine_similarity` function is already implemented and imported for you. It's arguments are two vectors and it returns the cosine of the angle between them.
* Iterate over rows in `candidates`, and save the result of similarities between current row and vector `v` in a python list. Take care that similarities are in the same order as row vectors of `candidates`.
* Now you can use [numpy argsort]( https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html#numpy.argsort) to sort the indices for the rows of `candidates`.

In [27]:
def nearest_neighbor(v, candidates, k=1):
    """
    Input:
      - v, the vector you are going find the nearest neighbor for
      - candidates: a set of vectors where we will find the neighbors
      - k: top k nearest neighbors to find
    Output:
      - k_idx: the indices of the top k closest vectors in sorted form
    """
    sims = []
    for row in candidates:
        cos_similarity = cosine_similarity(v, row)
        sims.append(cos_similarity)
    sorted_ids = np.argsort(sims)
    k_idx = sorted_ids[-k:]
    return k_idx

In [28]:
v = np.array([1, 0, 1])
candidates = np.array(
    [[1, 0, 5], [-2, 5, 3], [2, 0, 1], [6, -9, 5], [9, 9, 9]])
print(candidates[nearest_neighbor(v, candidates, 3)])

[[9 9 9]
 [1 0 5]
 [2 0 1]]


In [46]:
def test_vocabulary(X, Y, R):
    '''
    Input:
        X: a matrix where the columns are the English embeddings.
        Y: a matrix where the columns correspong to the French embeddings.
        R: the transform matrix which translates word embeddings from
        English to French word vector space.
    Output:
        accuracy: for the English to French capitals
    '''
    pred = X @ R
    n_correct = 0
    for i, vec in enumerate(pred):
        pred_idx = nearest_neighbor(vec, Y)
        if pred_idx == i:
            n_correct += 1
    accuracy = n_correct / len(pred)
    return accuracy

In [47]:
X_val, Y_val = get_matrices(
    en_fr_test, fr_embeddings_subset, en_embeddings_subset)

In [48]:
X = X_val
Y = Y_val
R = R_train

pred = X @ R
n_correct = 0
nearest_neighbor(pred[0, :], Y, 1)

array([0])

In [49]:
acc = test_vocabulary(X_val, Y_val, R_train)
print(f"accuracy on test set is {acc:.3f}")

accuracy on test set is 0.565


# 3. Locality-Sensitive Hashing and document search

In this part of the assignment, you will implement a more efficient version
of k-nearest neighbors using locality sensitive hashing.
You will then apply this to document search.

* Process the tweets and represent each tweet as a vector (represent a
document with a vector embedding).
* Use locality sensitive hashing and k nearest neighbors to find tweets
that are similar to a given tweet.

In [50]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = all_positive_tweets + all_negative_tweets

### 3.1 Getting the document embeddings

#### Bag-of-words (BOW) document models
Text documents are sequences of words.
* The ordering of words makes a difference. For example, sentences "Apple pie is
better than pepperoni pizza." and "Pepperoni pizza is better than apple pie"
have opposite meanings due to the word ordering.
* However, for some applications, ignoring the order of words can allow
us to train an efficient and still effective model.
* This approach is called Bag-of-words document model.

#### Document embeddings
* Document embedding is created by summing up the embeddings of all words
in the document.
* If we don't know the embedding of some word, we can ignore that word.

In [57]:
len(en_embeddings_subset[list(en_embeddings_subset.keys())[0]])

300

In [66]:
def get_document_embedding(tweet, en_embeddings): 
    '''
    Input:
        - tweet: a string
        - en_embeddings: a dictionary of word embeddings
    Output:
        - doc_embedding: sum of all word embeddings in the tweet
    '''
    embedding_dim = len(
        en_embeddings[list(en_embeddings.keys())[0]])
    doc_embedding = np.zeros(embedding_dim)
    processed_doc = process_tweet(tweet)
    for word in processed_doc:
        doc_embedding += en_embeddings.get(word, np.zeros(embedding_dim))
    return doc_embedding

In [69]:
# testing your function
custom_tweet = ('RT @Twitter @chapagain Hello There! Have a great day. '
                ':) #good #morning http://chapagain.com.np')
tweet_embedding = get_document_embedding(
    custom_tweet, en_embeddings_subset)
tweet_embedding[-5:]

array([-0.00268555, -0.15378189, -0.55761719, -0.07216644, -0.32263184])