## Set 5
## 3. Word2Vec \*\*Principles**

#### Preparation

In [1]:
import requests
url_dict = {
    'dr_seuss.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/sets/set5/data/dr_seuss.txt',
    'P3CHelpers.py': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/sets/set5/code/P3CHelpers.py'
}
def download_file(file_path):
    url = url_dict[file_path]
    print('Start downloading...')
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024 * 1024):
                f.write(chunk)
    print('Complete')

In [2]:
# Download files
download_file('dr_seuss.txt')
download_file('P3CHelpers.py')

Start downloading...
Complete
Start downloading...
Complete


In [3]:
import numpy as np
from P3CHelpers import *
import torch
import torch.nn as nn
import torch.optim as optim

In [4]:
import numpy as np

##########################
# Helper functions/classes
##########################

class WordPair:
    """
    Class representing a pair of words in our vocabulary, along with the cosine similarity
    of the two words.
    """
    def __init__(self, firstWord, secondWord, similarity):
        """
        Initializes the WordPair given two words (strings) and their similarity (float).
        """
        # Ensure that our pair consists of two distinct words
        assert(firstWord != secondWord)
        self.firstWord = firstWord
        self.secondWord = secondWord
        self.similarity = similarity

    def __repr__(self):
        """
        Define the string representation of a WordPair so that a WordPair instance x
        can be displayed using print(x).
        """
        return "Pair(%s, %s), Similarity: %s"%(self.firstWord, self.secondWord, self.similarity)


def sort_by_similarity(word_pairs):
    """
    Given a list of word pair instances, returns a list of the instances sorted
    in decreasing order of similarity.
    """
    return sorted(word_pairs, key=lambda pair: pair.similarity, reverse=True)

def get_similarity(v1, v2):
    """ Returns the cosine of the angle between vectors v1 and v2. """
    v1_unit = v1 / np.linalg.norm(v1)
    v2_unit = v2 / np.linalg.norm(v2)
    return np.dot(v1_unit, v2_unit)


def load_word_list(path):
    """
    Loads a list of the words from the file at path <path>, removing all
    non-alpha-numeric characters from the file.
    """
    with open(path) as handle:
        # Load a list of whitespace-delimited words from the specified file
        raw_text = handle.read().strip().split()
        # Strip non-alphanumeric characters from each word
        alphanumeric_words = map(lambda word: ''.join(char for char in word if char.isalnum()), raw_text)
        # Filter out words that are now empty (e.g. strings that only contained non-alphanumeric chars)
        alphanumeric_words = filter(lambda word: len(word) > 0, alphanumeric_words)
        # Convert each word to lowercase and return the result
        return list(map(lambda word: word.lower(), alphanumeric_words))

def generate_onehot_dict(word_list):
    """
    Takes a list of the words in a text file, returning a dictionary mapping
    words to their index in a one-hot-encoded representation of the words.
    """
    word_to_index = {}
    i = 0
    for word in word_list:
        if word not in word_to_index:
            word_to_index[word] = i
            i += 1
    return word_to_index

def most_similar_pairs(weight_matrix, word_to_index):
    """
    For each word a in our vocabulary, computes the most similar word b to a, along with the
    cosine similarity of a and b.

    Arguments:
        weight_matrix: The matrix of weights extracted from the hidden layer of a fitted
                       neural network.

        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

    Returns:
        A list of WordPair instances sorted in decreasing order of similarity,
        one representing each word <vocab_word> and its most similar word.
    """
    word_to_feature_repr = get_word_to_feature_repr(weight_matrix, word_to_index)
    result = []
    for word in word_to_feature_repr:
        result.append(most_similar_word(word_to_feature_repr, word))
    return sort_by_similarity(result)

def most_similar_word(word_to_feature_repr, input_word):
    """
    Given a dictionary mapping words to their feature representations (word_to_feature_repr),
    returns the a WordPair instance corresponding to the word
    whose feature vector is most similar to the feature representation of the
    passed-in word (input_word).
    """
    best_word = None
    best_similarity = 0
    input_vec = word_to_feature_repr[input_word]
    for word, feature_vec in word_to_feature_repr.items():
        similarity = get_similarity(input_vec, feature_vec)
        if similarity > best_similarity and np.linalg.norm(feature_vec - input_vec) != 0:
            best_similarity = similarity
            best_word = word
    return WordPair(input_word, best_word, best_similarity)

def get_word_to_feature_repr(weight_matrix, word_to_index):
    """
    Returns a dictionary mapping each word in our vocabulary to its one-hot-encoded
    feature representation.

    Arguments:
        weight_matrix: The matrix of weights extracted from the hidden layer of a fitted
                       neural network.

        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.
    """
    assert(weight_matrix is not None)
    word_to_feature_repr = {}
    for word, one_hot_idx in word_to_index.items():
        word_to_feature_repr[word] = weight_matrix[one_hot_idx]
    return word_to_feature_repr

#### Problem D:
Fill in the generate_traindata and find_most_similar_pairs functions.

In [5]:
def get_word_repr(word_to_index, word):
    """
    Returns one-hot-encoded feature representation of the specified word given
    a dictionary mapping words to their one-hot-encoded index.

    Arguments:
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        word:          Word whose feature representation we wish to compute.

    Returns:
        feature_representation:     Feature representation of the passed-in word.
    """
    unique_words = word_to_index.keys()
    # Return a vector that's zero everywhere besides the index corresponding to <word>
    feature_representation = np.zeros(len(unique_words))
    feature_representation[word_to_index[word]] = 1
    return feature_representation

def generate_traindata(word_list, word_to_index, window_size=4):
    """
    Generates training data for Skipgram model.

    Arguments:
        word_list:     Sequential list of words (strings).
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        window_size:   Size of Skipgram window. Defaults to 2
                       (use the default value when running your code).

    Returns:
        (trainX, trainY):     A pair of matrices (trainX, trainY) containing training
                              points (one-hot-encoded vectors) and their corresponding output_word
                              (also one-hot-encoded vectors)

    """
    trainX = []
    trainY = []
    for word in word_list:
      idx = word_to_index[word]
      for i in range(max(0, idx - window_size), min(idx + window_size + 1, len(word_to_index))):
        if i != idx:
          trainX.append(get_word_repr(word_to_index, word))
          trainY.append(get_word_repr(word_to_index, word_list[i]))
    return np.array(trainX), np.array(trainY)

In [6]:
def find_most_similar_pairs(filename, num_latent_factors):
    """
    Find the most similar pairs from the word embeddings computed from
    a body of text

    Arguments:
        filename:           Text file to read and train embeddings from
        num_latent_factors: The number of latent factors / the size of the embedding
    """
    # Load in a list of words from the specified file; remove non-alphanumeric characters
    # and make all chars lowercase.
    sample_text = load_word_list(filename)

    # Create word dictionary
    word_to_index = generate_onehot_dict(sample_text)
    print("Textfile contains %s unique words"%len(word_to_index))
    # Create training data
    trainX, trainY = generate_traindata(sample_text, word_to_index)

    # vocab_size = number of unique words in our text file. Will be useful
    # when adding layers to your neural network
    vocab_size = len(word_to_index)
    model = nn.Sequential(
        nn.Linear(vocab_size, num_latent_factors, bias=False),
        nn.Linear(num_latent_factors, vocab_size, bias=False),
    )
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    trainX = torch.tensor(trainX, dtype=torch.float32)
    trainY = torch.tensor(trainY, dtype=torch.float32)

    num_epochs = 10
    for epoch in range(num_epochs):
      total_loss = 0.0
      logits = model(trainX)
      loss = loss_function(logits, trainY)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      total_loss += loss.item()

      print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss:.4f}")

    # set weights variable below
    weights = model[0].weight.data.T
    print("hidden layer shape: ", model[0].weight.shape)
    print("output layer shape: ", model[1].weight.shape)
    # Find and print most similar pairs
    similar_pairs = most_similar_pairs(weights, word_to_index)
    for pair in similar_pairs[:30]:
        print(pair)

### Problem E-H:
Run your model on drseuss.txt and answer questions from E through H.

In [7]:
find_most_similar_pairs('dr_seuss.txt', 10)

Textfile contains 308 unique words
Epoch [1/10], Loss: 5.7302
Epoch [2/10], Loss: 5.7302
Epoch [3/10], Loss: 5.7302
Epoch [4/10], Loss: 5.7302
Epoch [5/10], Loss: 5.7302
Epoch [6/10], Loss: 5.7302
Epoch [7/10], Loss: 5.7302
Epoch [8/10], Loss: 5.7302
Epoch [9/10], Loss: 5.7302
Epoch [10/10], Loss: 5.7302
hidden layer shape:  torch.Size([10, 308])
output layer shape:  torch.Size([308, 10])
Pair(teeth, big), Similarity: 0.96105033
Pair(big, teeth), Similarity: 0.96105033
Pair(please, book), Similarity: 0.9367386
Pair(book, please), Similarity: 0.9367386
Pair(fingers, let), Similarity: 0.9252065
Pair(let, fingers), Similarity: 0.9252065
Pair(long, hills), Similarity: 0.92154247
Pair(hills, long), Similarity: 0.92154247
Pair(glad, at), Similarity: 0.91546226
Pair(at, glad), Similarity: 0.91546226
Pair(out, joe), Similarity: 0.9017944
Pair(joe, out), Similarity: 0.9017944
Pair(here, daniel), Similarity: 0.8991302
Pair(daniel, here), Similarity: 0.8991302
Pair(another, upon), Similarity: 0.8