In [23]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import numpy.matlib
import numpy.linalg as la
from numpy import inf
import functools
%matplotlib

Using matplotlib backend: MacOSX


In [3]:
file = open('function_words.csv','r')
tmp = file.read()
fwords = tmp.split('\n')
fwords = fwords[:-1]
# Create a helper dictionary
fwords_dict = {}
for idx, word in enumerate(fwords):
    fwords_dict[word] = idx+1

In [4]:
def functionWordWAN(sentence, fwords_dict, window_size, alpha):
    wan = np.zeros((len(fwords_dict), len(fwords_dict)))
    for idx, pivot in enumerate(sentence[:-1]): # don't include last word as pivot
        if (not fwords_dict.get(pivot)):
            continue
        else:
            sentence_slice = sentence[idx:(idx+window_size)]
            for it, word in enumerate(sentence_slice[1:]): # don't include pivot word
                if (fwords_dict.get(word)):
                    if (fwords_dict.get(word) != fwords_dict.get(pivot)): # no self loops
                        r = fwords_dict.get(pivot)
                        c = fwords_dict.get(word)
                        wan[r-1,c-1] += pow(alpha,it+1)
                           
    return wan

In [5]:
def normalizeWAN(raw_wan):
    sums = raw_wan.sum(axis=1)
    sums = np.matlib.repmat(sums,len(fwords),1)
    sums = sums.T
    norm_wan = raw_wan / sums
    norm_wan = np.nan_to_num(norm_wan) # Make sure nans from zero division are zeros
    return norm_wan

In [6]:
wan = functionWordWAN(fwords, fwords_dict, 10, 0.75)

In [7]:
# normalize WAN rows to represent as markov chain
norm_wan = normalizeWAN(wan)
# Optional visualize as result of test
# plt.imshow(norm_out)
# plt.show()

# Optional sum across columns (will show all 1's except 0 in last position)
# norm_wan.sum(axis=1)

## A simple Test

In [13]:
alpha = 0.75
sentence = ['the','cat','in','the','hat','bought','a','baseball','bat','i','am','happy']
window_size = 6
sentence_1 = ['i','like','the','cat','in','the','hat','who','bought','a','baseball','bat']

In [14]:
wan2 = functionWordWAN(sentence, fwords_dict, 10, alpha)
wan3 = functionWordWAN(sentence_1, fwords_dict, 10, alpha)

In [15]:
norm_wan2 = normalizeWAN(wan2)
norm_wan3 = normalizeWAN(wan3)

## Relative Entropy

In [12]:
def relativeEntropy(wan1, wan2):
    # Return is a list containing 1 w.r.t 2, then 2 w.r.t. 1
    entropies = [0, 0]
    
    limiting1 = la.matrix_power(wan1, 25) # 25 selected as reasonable convergence value
    limiting2 = la.matrix_power(wan2, 25)
    
    # 1 w.r.t 2
    imd = np.nan_to_num(np.divide(wan1,wan2)) # Set all nan's to zero (0/0)
    imd[(imd == inf) | (imd == -inf)] = 0 # All infinities to 0 (scalar/0)
    log_imd = np.nan_to_num(np.log(imd))
    log_imd[(log_imd == inf) | (log_imd == -inf)] = 0    
    weights = functools.reduce(np.multiply, [limiting1, wan1, log_imd])
    entropies[0] = sum(sum(weights))
    
    # 2 w.r.t 1
    imd = np.nan_to_num(np.divide(wan2,wan1)) # Set all nan's to zero (0/0)
    imd[(imd == inf) | (imd == -inf)] = 0 # All infinities to 0 (scalar/0)
    log_imd = np.nan_to_num(np.log(imd))
    log_imd[(log_imd == inf) | (log_imd == -inf)] = 0    
    weights = functools.reduce(np.multiply, [limiting2, wan2, log_imd])
    entropies[1] = sum(sum(weights))
    
    return entropies

In [26]:
relativeEntropy(norm_wan2, norm_wan3)

[9.3523620479563405e-06, 4.2630042948802758e-07]

In [25]:
norm_wan3.sum(axis=1)

array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])