## Work through all of the cells in this notebook and answer the questions

### Part I: creating a word embedding using singular value decomposition over the 'reuters' corpus

The next several cells load the necessary libraries and define the necessary functions for the example.

In [None]:
### 
# Example of SVD on a co-occurrence matrix from https://web.stanford.edu/class/cs224n/assignments/a1_preview/exploring_word_vectors.html
#
#
# All Import Statements Defined Here
# Note: Do not add to this list.
# All the dependencies you need, can be installed by running .
# ----------------

import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import nltk
nltk.download('reuters')
from nltk.corpus import reuters
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

# Marker tokens that read_corpus places at the beginning and end of every document.
START_TOKEN = '<START>'
END_TOKEN = '<END>'

# Seed both NumPy's and Python's global random number generators.
np.random.seed(0)
random.seed(0)
# ----------------

In [None]:
def read_corpus(category="crude"):
    """ Load every file of one Reuters category as a list of lowercased tokens.
        Params:
            category (string): category name
        Return:
            list of lists of strings: one token list per file in the category; each list
            starts with START_TOKEN and ends with END_TOKEN
    """
    documents = []
    for file_id in reuters.fileids(category):
        tokens = [token.lower() for token in reuters.words(file_id)]
        documents.append([START_TOKEN, *tokens, END_TOKEN])
    return documents

In [None]:
def distinct_words(corpus):
    """ Collect the sorted vocabulary of a corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): every distinct word in the corpus, ordered by python's 'sorted'
            num_corpus_words (integer): number of distinct words across the corpus
    """
    vocabulary = set()
    for document in corpus:
        vocabulary.update(document)

    corpus_words = sorted(vocabulary)
    return corpus_words, len(corpus_words)

In [None]:
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).
    
        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.
              
              For example, if we take the document "START All that glitters is not gold END" with window size of 4,
              "All" will co-occur with "START", "that", "glitters", "is", and "not".
    
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape (number of corpus words, number of corpus words)): 
                Co-occurrence matrix of word counts (float entries).
                The ordering of the words in the rows/columns is the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
            words (list of strings): the sorted vocabulary returned by distinct_words.
    """
    words, num_words = distinct_words(corpus)
    word2Ind = {word: index for index, word in enumerate(words)}

    # Allocate the zeros directly in NumPy; building a Python list of
    # num_words**2 floats first needs several times the memory of the matrix.
    counts = np.zeros((num_words, num_words), dtype=float)

    for document in corpus:
        indices = [word2Ind[token] for token in document]
        for position, center in enumerate(indices):
            # Only look to the right of the center word: the symmetric update
            # below also records the pair from the right-hand word's point of view.
            stop = min(len(indices), position + window_size + 1)
            for neighbor in range(position + 1, stop):
                context = indices[neighbor]
                counts[center, context] += 1.0
                counts[context, center] += 1.0

    # Callers receive an np.matrix, as before; asmatrix wraps without copying.
    M = np.asmatrix(counts)
    return M, word2Ind, words



In [None]:
def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurrence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
    
        Params:
            M (numpy matrix of shape (number of corpus words, number of corpus words)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy array of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10     # number of iterations passed to `TruncatedSVD`
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # Recent scikit-learn releases reject np.matrix input, so unwrap it to a
    # plain ndarray; any other input type is passed through untouched.
    if isinstance(M, np.matrix):
        M = np.asarray(M)

    # Honor the requested embedding size instead of a fixed 2 components.
    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    M_reduced = svd.fit_transform(M)
    print(M_reduced)

    print("Done.")
    return M_reduced



In [None]:
def plot_embeddings(M_reduced, word2Ind, words, text_offset = 0.1):
    """ Plot in a scatterplot the embeddings of the words specified in the list "words".
        NOTE: do not plot all the words listed in M_reduced / word2Ind.
        Include a label next to each point.

        If any requested word is missing from word2Ind, an error message is printed and
        nothing is plotted.
        
        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus , k)): matrix of k-dimensional word embeddings;
                only the first two columns are plotted
            word2Ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
            text_offset (float): distance added to both coordinates when placing each label
    """
    # Index as [row, column] on an ndarray view so np.matrix input works too
    # (chained M[row][1] fails on np.matrix because M[row] stays 2-D).
    coords = np.asarray(M_reduced)

    points = []
    for w in words:
        row = word2Ind.get(w)
        if row is None:
            print('ERROR: word ', w, 'not present in the dictionary of words!')
            return
        points.append((coords[row, 0], coords[row, 1], w))

    xvec = [x for x, _, _ in points]
    yvec = [y for _, y, _ in points]
    plt.scatter(xvec, yvec, marker='x', color='red')

    for x, y, label in points:
        plt.text(x + text_offset, y + text_offset, label, fontsize=9)

    plt.show()

Now, we load the corpus, create a co-occurrence matrix (initial and very simple word embeddings), reduce the dimensionality using SVD (the target word embeddings), and then plot a few example words.

In [None]:
# Load the default Reuters category and preview the first three tokenized documents.
reuters_corpus = read_corpus()
pprint.pprint(reuters_corpus[:3], compact=True, width=100)

In [None]:
# Sanity check on a two-document toy corpus: print the vocabulary, the co-occurrence
# matrix, and the word-to-index mapping.
test_corpus = ["START All that glitters isn't gold END".split(" "), "START All's well that ends well END".split(" ")]
m, d, words = compute_co_occurrence_matrix(test_corpus, window_size=4)
print(words)
print(m)
print(d)

In [None]:
# -----------------------------
# Run This Cell to Produce Your Plot
# ------------------------------
# Build the co-occurrence matrix for the Reuters corpus (default window size) and
# reduce it to 2 dimensions with truncated SVD.
reuters_corpus = read_corpus()
M_co_occurrence, word2Ind_co_occurrence,words = compute_co_occurrence_matrix(reuters_corpus)
print('The co-occurrence matrix for reuters is of shape: ' + str(M_co_occurrence.shape))
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)

# Rescale (normalize) the rows to make them each of unit-length
# NOTE(review): a row whose norm is 0 would be divided by zero here — confirm none occur.
M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
M_normalized = M_reduced_co_occurrence / M_lengths[:, np.newaxis] # broadcasting

In [None]:
# Plot the normalized co-occurrence embeddings for a handful of oil-related words.
words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
plot_embeddings(M_normalized, word2Ind_co_occurrence, words, text_offset = 0.01)

In [None]:
# Plot a second, more mixed word list with the same embeddings.
# plot_embeddings prints an error and draws nothing if any listed word is missing
# from word2Ind_co_occurrence.
words = ['profit', 'war', 'peace', 'energy', 'environment', 'oil', 'petroleum']
plot_embeddings(M_normalized, word2Ind_co_occurrence, words, text_offset = 0.01)

#### Question 1:  The context window of the co-occurrence matrix above is size 4. In a real-world use case, what benefits and potential limitations might there be to using a size of 2? What about a size of 6?

#### Answer: 

#### Question 2:  How could text preprocessing potentially benefit or harm the resulting word embeddings?

#### Answer: 

#### Question 3:  True/False - the SVD produces a 2-dimensional representation of each word in the term dictionary created from the reuters corpus?

#### Answer: 

#### Question 4:  How large of a dimension reduction did SVD provide in this example?

#### Answer: 

### Part II: using word2vec word embeddings

The following cells load word embeddings from word2vec and demonstrate some of their features.

In [None]:
from zipfile import ZipFile
# Unpack the zip archive that sits one directory above the notebook.
# NOTE(review): extractall() writes into the current working directory, while the next
# cell opens '../w2v_words.pickle' in the parent directory — confirm the paths line up.
with ZipFile('../w2v_words_pickle.zip', 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall()

In [None]:
## We access word2vec as a service because the data structure takes up more than 6 GB.
## To do that, we start by loading a cached file of all 3,000,000 'words' (tokens).
import pickle
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
# The context manager closes the file even if pickle.load raises.
with open('../w2v_words.pickle', 'rb') as f:
    words = pickle.load(f)

In [None]:
## Now, 30,000 words are randomly sampled:
## the list is shuffled in place and its first 30,000 entries are kept.
import random
print("Shuffling words ...")
random.shuffle(words)
words = words[:30000]

In [None]:
## And then, the word embedding vector for each word is retrieved from the word2vec service (can take up to 10 minutes)

required_words=['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 
                'oil', 'output', 'petroleum', 'venezuela',
                'profit', 'war', 'peace', 'environment'
               ]
# Make sure the words plotted later are part of the sample.
words = set(words).union(set(required_words))
print("Retreiving vectors for %i words..." % len(words))

import json
import requests

word2Ind = {}   # word -> row index into M
M = []          # one embedding vector per successfully retrieved word
curInd = 0
# A single Session reuses the HTTP connection instead of reconnecting for every word.
with requests.Session() as session:
    # Iterate in sorted order so row indices are the same on every run
    # (set iteration order for strings can differ between interpreter sessions).
    for w in sorted(words):
        # Pass the token via `params` so it is URL-encoded; tokens containing
        # characters such as '&', '#', '+' or spaces would otherwise corrupt the query.
        r = session.get('http://localhost:8000/word2vec', params={'word': w})
        if r.status_code != 200:
            continue
        try:
            vector = json.loads(r.text)["data"]
        except KeyError:
            # Response carried no "data" field: skip this word.
            continue
        M.append(vector)
        word2Ind[w] = curInd
        curInd += 1
print("Vectors for the words are now in word2Ind and list M.")

In [None]:
## Convert the list of vectors to a float64 array (one row per retrieved word).
## np.asarray already yields the final array, so no extra np.stack copy is needed.
M = np.asarray(M, np.float64)
print("Take note of M.shape: " + str(M.shape))

In [None]:
## Only for the purpose of visualization - reduce the dimensionality using SVD
## (M holds the vectors retrieved from the word2vec service above).
M_reduced = reduce_to_k_dim(M, k=2)

In [None]:
# Plot the reduced word2vec embeddings for the oil-related word list.
words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
plot_embeddings(M_reduced, word2Ind, words,text_offset = 0.01)

In [None]:
# Plot the reduced word2vec embeddings for the mixed word list.
# plot_embeddings prints an error and draws nothing if any listed word is missing from word2Ind.
words = ['profit', 'war', 'peace', 'energy', 'environment', 'oil', 'petroleum']
plot_embeddings(M_reduced, word2Ind, words,text_offset = 0.01)

#### Question 4: Based on the plots, what is your impression of the results of the word2vec embedding vs. the simple co-occurrence approach?

#### Answer: 

#### Question 5: True/False:  The full 300 dimension word2vec embedding for each word can be used as input to IR and machine learning NLP algorithms.

#### Answer: 

#### Question 6: Consider the use case of classifying clinical notes e.g., for mentions of adverse drug events. What potential advantage would the word2vec embeddings have over simple preprocessed terms? Embeddings based on simple co-occurrence? Any disadvantages?

#### Answer: 