## Example 1:
Counting words in a document
- Word co-occurence implementation with Alice in Wonderland
- Word similarity with cosine similarity

Some example plots:
- Ch15, Fig. 15.3 for small corpus


In [36]:
import numpy as np
np.random.seed(13)
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Lambda
from IPython.display import SVG
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot
from keras.preprocessing import sequence
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from itertools import islice
from matplotlib import pylab
from __future__ import division

In [41]:
# DO NOT Modify the lines in this cell
path = 'alice.txt'
# JG> Read and store the first 700 words (text representation)
# For testing, the example provided
#corpus = ["The dog chased the cat away from the garden."]
corpus = open(path).readlines()[0:700] 
print("Original number of lines in corpus:", len(corpus))

# JG> Retrieving the sentences having at least 3 words (at least 2 white spaces), resulting in 560 lines
# JG> Note that this removes line breaks also. 
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]
print("Number of lines in corpus after first filter:", len(corpus))


# JG> List of characters to filter out (ex: punctiation, etc. ) I guess it is the default plus the single quote.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")

# JG> Train (why???)
tokenizer.fit_on_texts(corpus)  #Thanos: #list of texts to train on

# Encode all document into numbers (how? Apparently it is not important. Furthermore,
# documentation is not describing anything)
# A sequence is "a list of word indexes, where the word of rank i in the dataset (starting at 1) has index i"
corpus = tokenizer.texts_to_sequences(corpus)

# JG> Number of words in the corpus.
# JG> Each s is a list, where each elements encodes a word into a number. Then, computing the length of s
# is the same than counting the words in the line.
nb_samples = sum(len(s) for s in corpus) 
# JG> Get the length of the word_index, where each element in the document is represented
V = len(tokenizer.word_index) + 1 # JG> Why plus 1?

print("Number of words is:", nb_samples)
print("Length of the word index is (unique words):", V)


# Is this something they need to change?
dim = 100
window_size = 2
window_size_corpus = 4

Original number of lines in corpus: 700
Number of lines in corpus after first filter: 560
Number of words is: 6563
Length of the word index is (unique words): 1183


###### Word co-occurrence matrix for _The dog chased the cat away from the garden_
---------------------------------------------------------
|     | The | dog | chased | cat | away | from | garden |
|-----|-----|-----|--------|-----|------|------|--------|
| The | 0 | 2 | 2 | 3 | 2 | 2 | 1 |
|dog | 2 | 0 | 1 | 1 | 1 | 0 | 0 |
|chased | 2 | 1 | 0 | 1 | 1 | 1 | 0 |
|cat | 3 | 1 | 1 | 0 | 1 | 1 | 1 |
|away | 2 | 1 | 1 | 1 | 0 | 1 | 1 |
|from | 2 | 0 | 1 | 1 | 1 | 0 | 1 |
|garden | 1 | 0 | 0 | 1 | 1 | 1 | 0 |

This is an example of a word co-occurence matrix of only one sentence. Create a word co-occurrence matrix for Alice in Wonderland

In [42]:
################################################################################
################ Computing word-word co-ocurrence matrix #######################
################################################################################

# Not changing V in the cell they do not want us to modify, but one never uses a 
# size variable to handle an iteration issue.
#V = len(tokenizer.word_index)

# Create my co-ocurrence matrix, initially 0 (VxV size)
# How to access this matrix:
#   Each column is an (index - 1) (column 0 is word in index 1; column 1 is word in index 2; ...)
wcoMatrix = np.zeros([V, V]) 

# Using my window_size_corpus to define my context scope
scope = window_size_corpus
# TODO: Do not make that many iterations
# Greedy approach first (to be able to compare the optimization)

# For each line in the corpus. Note that they preserve the order, even when they are indexes now.
for s in corpus:
    # For each word
    for current_index in range(0, len(s)):
        current_value = s[current_index] # Get the 'word'
        # From left to right
        for neighbor_index in range(current_index - scope, current_index + scope + 1):
            # Never out of boundaries and never the same index
            if ( neighbor_index >= 0 and neighbor_index < len(s) ) and ( neighbor_index != current_index ):
                # Get my neighbor 'word'
                neighbor = s[neighbor_index] 
                # Never myself and myself
                if current_value != neighbor:
                    # Update the ocurrence
                    wcoMatrix[current_value, neighbor] += 1


# Keep diagonal as zero (not needed now, but keepin the line as a comment)
#np.fill_diagonal(wcoMatrix, 0)
wcoMatrix



array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., 71., ...,  1.,  0.,  0.],
       [ 0., 71.,  0., ...,  1.,  1.,  1.],
       ...,
       [ 0.,  1.,  1., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.]])

In [60]:
# Compute similarity between the words Alice, Rabbit and Dinah
full_index = tokenizer.word_index
#print(full_index) words+indices
full_index

words_to_compare = ["Alice", "Rabbit", "Dinah"]

#words_to_compare = ["cat", "dog"]
for w1 in words_to_compare:
    w1 = w1.lower()
    x = full_index[w1] 
    for w2 in words_to_compare:
        w2 = w2.lower()
        if w1 != w2:
            y = full_index[w2] - 1
            X = wcoMatrix[x, :].reshape((1, V))   #reshape cause of stupid type error 
            Y = wcoMatrix[y, :].reshape((1, V))
            print("cosine_similarity(%s, %s)=%s" % (w1, w2, cosine_similarity(X, Y)))

alice
cosine_similarity(alice, rabbit)=[[0.52599973]]
alice
cosine_similarity(alice, dinah)=[[0.5331712]]
rabbit
cosine_similarity(rabbit, alice)=[[0.69058505]]
rabbit
cosine_similarity(rabbit, dinah)=[[0.53097865]]
dinah
cosine_similarity(dinah, alice)=[[0.41641293]]
dinah
cosine_similarity(dinah, rabbit)=[[0.59928328]]


In [44]:
# Retrieve the five most similar words to Alice with nearest neighbors


## Example 2:
Word embedding (dense) comparisons
- Load the pre-trained word embeddings of word2vec
- See whether the differences between the following word pairs are similar:
    - _A king is to a queen as a man is to a woman_
    - _A cat is to a kitten as a dog is to a puppy_
    - _Cats are to a cat as dogs are to a dog_
- Compare the following synonyms and antonyms:
    - Unhappy and happy
    - Happy and cheerful
    - Unhappy and cheerful
    - Synonym and equivalence
    - Synonym and antonym
    

Download word2vec here: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit


In [5]:
#load word2vec
word2vec = KeyedVectors.load_word2vec_format("your path to word2vec")


In [1]:
#perform gensim tasks