In [41]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import numpy.matlib
import numpy.linalg as la
from numpy import inf
import functools
%matplotlib

Using matplotlib backend: MacOSX


In [42]:
file = open('function_words.csv','r')
tmp = file.read()
fwords = tmp.split('\n')
fwords = fwords[:-1]
# Create a helper dictionary
fwords_dict = {}
for idx, word in enumerate(fwords):
    fwords_dict[word] = idx+1

In [43]:
# Edit: modified to allow self-loops

def functionWordWAN(sentence, fwords_dict, window_size, alpha):
    wan = np.zeros((len(fwords_dict), len(fwords_dict)))
    for idx, pivot in enumerate(sentence[:-1]): # don't include last word as pivot
        if (not fwords_dict.get(pivot)):
            continue
        else:
            sentence_slice = sentence[idx:(idx+window_size)]
            for it, word in enumerate(sentence_slice[1:]): # don't include pivot word
                if (fwords_dict.get(word)):
#                     if (fwords_dict.get(word) != fwords_dict.get(pivot)): # no self loops
                    r = fwords_dict.get(pivot)
                    c = fwords_dict.get(word)
                    wan[r-1,c-1] += pow(alpha,it+1)
                           
    return wan

In [44]:
def normalizeWAN(raw_wan):
    sums = raw_wan.sum(axis=1)
    sums = np.matlib.repmat(sums,len(fwords),1)
    sums = sums.T
    norm_wan = raw_wan / sums
    norm_wan = np.nan_to_num(norm_wan) # Make sure nans from zero division are zeros
    return norm_wan

In [None]:
def generateWANPair(snt1, snt2, window_size, alpha):
    ret_data = {}
    snt1_set = set(snt1)
    snt2_set = set(snt2)
    comp_set = snt1_set.union(snt2_set)
    comp_dict = {}
    # Generate the composite sentence dictionary
    for idx, v in enumerate(comp_set):
        comp_dict[v] = idx+1
    
    ret_data['snt1_wan'] = functionWordWAN(snt1, comp_dict, window_size, alpha)
    ret_data['snt2_wan'] = functionWordWAN(snt2, comp_dict, window_size, alpha)
    
    return ret_data
        

## A simple Test

In [47]:
alpha = 0.75
window_size = 10
sentence1= ['the','cat','in','the','hat','bought','a','baseball','bat','i','am','happy']
sentence2 = ['i','like','the','cat','in','the','hat','who','bought','a','baseball','bat']

In [48]:
wan1 = functionWordWAN(sentence1, fwords_dict, window_size, alpha)
wan2 = functionWordWAN(sentence2, fwords_dict, window_size, alpha)

In [49]:
norm_wan1 = normalizeWAN(wan1)
norm_wan2 = normalizeWAN(wan2)

## Relative Entropy

In [50]:
def relativeEntropy(wan1, wan2):
    # Return is a list containing 1 w.r.t 2, then 2 w.r.t. 1
    data = {}
    data['entropies'] = [0.0, 0.0]
    
    limiting1 = la.matrix_power(wan1, 50) # 25 selected as reasonable convergence value
    limiting2 = la.matrix_power(wan2, 50)
    
    data['l1'] = limiting1
    data['l2'] = limiting2
    
    # 1 w.r.t 2
    imd = np.nan_to_num(np.divide(wan1,wan2)) # Set all nan's to zero (0/0)
    imd[(imd == inf) | (imd == -inf)] = 0 # All infinities to 0 (sc alar/0)
    log_imd = np.nan_to_num(np.log(imd))
    log_imd[(log_imd == inf) | (log_imd == -inf)] = 0    
    weights = functools.reduce(np.multiply, [limiting1, wan1, log_imd])
    data['entropies'][0] = weights.sum()
    
    # 2 w.r.t 1
    imd = np.nan_to_num(np.divide(wan2,wan1)) # Set all nan's to zero (0/0)
    imd[(imd == inf) | (imd == -inf)] = 0 # All infinities to 0 (scalar/0)
    log_imd = np.nan_to_num(np.log(imd))
    log_imd[(log_imd == inf) | (log_imd == -inf)] = 0    
    weights = functools.reduce(np.multiply, [limiting2, wan2, log_imd])
    data['entropies'][1] = weights.sum()
    
    return data

In [51]:
data = relativeEntropy(norm_wan1, norm_wan2)

In [52]:
plt.imshow(data['l2'],interpolation='nearest')

<matplotlib.image.AxesImage at 0x1133a4ba8>

In [53]:
data

{'entropies': [1.0080744990356447e-10, 9.9526899026165093e-14],
 'l1': array([[  3.01533594e-13,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        ..., 
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00]]),
 'l2': array([[  3.26822495e-17,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0