### Import Libraries

In [1]:
from itertools import chain 
import numpy as np
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from string import punctuation

from __future__ import division

%matplotlib inline

### Install Watermark - tool to help with reproducibility:

In [None]:
%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark/watermark.py

In [2]:
%load_ext watermark
%watermark -n -t -z -u -m -v -p nltk,numpy,scipy

last updated: Tue Jul 12 2016 18:56:11 CDT

CPython 2.7.11
IPython 4.0.3

nltk 3.0.3
numpy 1.10.1
scipy 0.17.0

compiler   : GCC 4.2.1 (Apple Inc. build 5577)
system     : Darwin
release    : 15.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit


### Read the data

In [3]:
%%time
file_name = '/Users/elisa/Documents/CompLing/compSemantics/HW3/wikicorpus.txt'
sents = []
with open(file_name, 'rb') as f:
    for line in f:
        if line.startswith("<c> "):
            line = line.decode('cp1252') #convert from Windows Latin-1 encoding to avoid unicode issues
            tagged_words = line.split(" ")[1:] #skip the <c>
            sents.append([tagged_word.split('|') for tagged_word in tagged_words])
print sents[:3]

[[[u'Anarchism', u'Anarchism', u'NNP', u'I-NP', u'O', u'N'], [u'.', u'.', u'.', u'O', u'O', u'.\n']], [[u'Anarchism', u'Anarchism', u'NNP', u'I-NP', u'O', u'N'], [u'is', u'be', u'VBZ', u'I-VP', u'O', u'(S[dcl]\\NP)/NP'], [u'a', u'a', u'DT', u'I-NP', u'O', u'NP[nb]/N'], [u'political', u'political', u'JJ', u'I-NP', u'O', u'N/N'], [u'philosophy', u'philosophy', u'NN', u'I-NP', u'O', u'N'], [u'encompassing', u'encompass', u'VBG', u'I-VP', u'O', u'(S[ng]\\NP)/NP'], [u'theories', u'theory', u'NNS', u'I-NP', u'O', u'N'], [u'and', u'and', u'CC', u'I-NP', u'O', u'conj'], [u'attitudes', u'attitude', u'NNS', u'I-NP', u'O', u'N'], [u'which', u'which', u'WDT', u'B-NP', u'O', u'(NP\\NP)/(S[dcl]\\NP)'], [u'consider', u'consider', u'VBP', u'I-VP', u'O', u'((S[dcl]\\NP)/(S[to]\\NP))/NP'], [u'the', u'the', u'DT', u'I-NP', u'O', u'NP[nb]/N'], [u'state', u'state', u'NN', u'I-NP', u'O', u'N'], [u'to', u'to', u'TO', u'I-VP', u'O', u'(S[to]\\NP)/(S[b]\\NP)'], [u'be', u'be', u'VB', u'I-VP', u'O', u'(S[b]\\NP)

### Clean up the data
#### Remove stop words, 1 char words, punctuation, then lowercase and strip punctuation

In [4]:
%%time
stopwords_set = set(stopwords.words('english'))
punctuation_set = set(punctuation)
tag_index = 1 #lemma
cleaned_sents = []
for sent in sents:
    cleaned_sent = []
    for word in sent:
        if word[0].lower() not in stopwords_set and word[0] not in punctuation_set and len(word[0].strip())>1 :
            word[tag_index] = word[tag_index].lower().strip(punctuation)
            cleaned_sent.append(word)
    cleaned_sents.append(cleaned_sent)
print cleaned_sents[:3]

[[[u'Anarchism', u'anarchism', u'NNP', u'I-NP', u'O', u'N']], [[u'Anarchism', u'anarchism', u'NNP', u'I-NP', u'O', u'N'], [u'political', u'political', u'JJ', u'I-NP', u'O', u'N/N'], [u'philosophy', u'philosophy', u'NN', u'I-NP', u'O', u'N'], [u'encompassing', u'encompass', u'VBG', u'I-VP', u'O', u'(S[ng]\\NP)/NP'], [u'theories', u'theory', u'NNS', u'I-NP', u'O', u'N'], [u'attitudes', u'attitude', u'NNS', u'I-NP', u'O', u'N'], [u'consider', u'consider', u'VBP', u'I-VP', u'O', u'((S[dcl]\\NP)/(S[to]\\NP))/NP'], [u'state', u'state', u'NN', u'I-NP', u'O', u'N'], [u'unnecessary', u'unnecessary', u'JJ', u'I-ADJP', u'O', u'S[adj]\\NP'], [u'harmful', u'harmful', u'JJ', u'I-ADJP', u'O', u'S[adj]\\NP'], [u'and/', u'and', u'JJ', u'I-ADJP', u'O', u'S[adj]\\NP'], [u'undesirable', u'undesirable', u'JJ', u'I-ADJP', u'O', u'S[adj]\\NP']], [[u'Specific', u'specific', u'NNP', u'I-NP', u'O', u'N/N'], [u'anarchists', u'anarchist', u'NNS', u'I-NP', u'O', u'N'], [u'may', u'may', u'MD', u'I-VP', u'O', u'(S[d

### Look at lemmas of just nouns for our targets

In [5]:
target_sents = [[word[tag_index] for word in sent
                  if word[2].startswith('N')] for sent in cleaned_sents] 
print target_sents[:3]

[[u'anarchism'], [u'anarchism', u'philosophy', u'theory', u'attitude', u'state'], [u'specific', u'anarchist', u'criterion', u'anarchism', u'criterion']]


### Get noun lemmas that occur in at least 50 sentences (`min_df` counts sentences, not total occurrences)

In [6]:
# One space-joined string per sentence, so each sentence can be fed to the
# vectorizer as a single document.
target_text = []
for noun_lemmas in target_sents:
    target_text.append(" ".join(noun_lemmas))

In [7]:
%%time
def my_tokenizer(s):
    """Split `s` on runs of whitespace and return the list of tokens.

    Nothing else is treated as a separator, so hyphenated words stay in one piece.
    """
    tokens = s.split()
    return tokens
v_target = CountVectorizer(ngram_range=(1,1), token_pattern='(?u)\b\S.*\b', tokenizer=my_tokenizer, min_df=50)
unigram_matrix = v_target.fit_transform(target_text)
#sort
features_count = unigram_matrix.sum(axis=0).tolist()[0]
features_names = v_target.get_feature_names()
sorted_counts = sorted(zip(features_names, features_count), key=lambda count: count[1], reverse=True)
print len(v_targetget.vocabulary_.keys())
print sorted_counts[:10]

[(u'time', 14036L), (u'year', 13207L), (u'system', 9913L), (u'city', 9647L), (u'number', 9520L), (u'world', 9081L), (u'state', 8448L), (u'part', 7911L), (u'example', 7226L), (u'century', 7143L)]
CPU times: user 6.16 s, sys: 1.47 s, total: 7.63 s
Wall time: 8.79 s


#### How big is our vocabulary?

In [8]:
corpus_sents = [[word[tag_index] for word in sent] for sent in cleaned_sents] 
corpus_words = set([corpus_sent for corpus_sent in chain.from_iterable(corpus_sents)])
print "Original vocabulary length:", len(corpus_words)

Original vocabulary length: 206121


#### Get rid of infrequent words

In [9]:
%%time
min_frequency_target_words = 20
corpus_text = [" ".join(corpus_sent) for corpus_sent in corpus_sents]
v_corpus = CountVectorizer(ngram_range=(1,1), tokenizer=my_tokenizer, #use my own tokenizer so it doesn't split on -
                           token_pattern='(?u)\b\S.*\b',              #use my own token pattern to accept hyphenated words
                           min_df=min_frequency_target_words)         #ignore words that don't occur at least x times
corpus_matrix = v_corpus.fit_transform(corpus_text)
corpus_count = corpus_matrix.sum(axis=0).tolist()[0]
filtered_vocab_length =  len(v_corpus.vocabulary_)
print "Filtered vocabulary length:", filtered_vocab_length

Filtered vocabulary length: 18008
CPU times: user 10.1 s, sys: 8.14 s, total: 18.2 s
Wall time: 21.6 s


In [10]:
%%time
words_to_filter = corpus_words.difference(v_corpus.vocabulary_.keys())
filtered_corpus_sents = [[word for word in sent if word not in words_to_filter] for sent in corpus_sents]
print filtered_corpus_sents[:3]

[[u'anarchism'], [u'anarchism', u'political', u'philosophy', u'encompass', u'theory', u'attitude', u'consider', u'state', u'unnecessary', u'harmful', u'and', u'undesirable'], [u'specific', u'anarchist', u'may', u'additional', u'criterion', u'constitute', u'anarchism', u'often', u'disagree', u'criterion']]
CPU times: user 4.1 s, sys: 8.13 s, total: 12.2 s
Wall time: 16.2 s


### Get collocations
#### stop at sentence boundary
* T x C matrix, where T=target words (rows) and C=context words (columns)  
* use lil_matrix because it is efficient for building sparse matrices  
* Note: We technically don't need to build a full co-occurrence matrix because our target words are only the nouns that pass the frequency cut-off above, but for flexibility we will build a C x C matrix so that we can easily switch to a different set of target words later.

In [11]:
%%time
window_size = 2
rows = filtered_vocab_length
cols = filtered_vocab_length
target_words = set(v_target.vocabulary_.keys())
target_context_matrix = lil_matrix((rows, cols), dtype = np.int)
for context_sent in filtered_corpus_sents:
    for target_index, target_word in enumerate(context_sent):
        if(target_word in target_words):
            #AFTER context
            for i in xrange(1,window_size+1):
                if(target_index+i) >= len(context_sent):
                    break #reached end of sentence!
                else:
                    context_word = context_sent[target_index+i]
                    target_context_matrix[v_corpus.vocabulary_.get(target_word),
                                          v_corpus.vocabulary_.get(context_word)] += 1
            #BEFORE context
            for i in xrange(1,window_size+1):
                if(target_index-i) < 0:
                    break #reached end of sentence!
                else:
                    context_word = context_sent[target_index-i]
                    target_context_matrix[v_corpus.vocabulary_.get(target_word),
                                          v_corpus.vocabulary_.get(context_word)] += 1
print target_context_matrix.shape

(18008, 18008)
CPU times: user 3min 47s, sys: 21.3 s, total: 4min 8s
Wall time: 4min 52s


Sanity check: print co-occurrence of time with year, 
should be 213

In [14]:
print target_context_matrix[(v_corpus.vocabulary_.get('time'),v_corpus.vocabulary_.get('year'))]

213


### Remove rows and columns with zero values (to avoid problems with division later)

In [15]:
# remove the rows        
target_row_sums = target_context_matrix.sum(axis = 1) #sum across the rows
target_word_index_no_zeros = []
target_vocab_no_zeros = []
original_word_index = v_corpus.get_feature_names()
for word_index in range(0,len(original_word_index)):
    if target_row_sums[ word_index ] != 0:
        target_word_index_no_zeros.append(word_index)
        target_vocab_no_zeros.append(original_word_index[word_index])
target_context_matrix_no_zeros = target_context_matrix[target_word_index_no_zeros, :]
print target_context_matrix_no_zeros.shape

# remove the columns
context_row_sums = target_context_matrix.sum(axis = 0) #sum down the columns
context_word_index_no_zeros = []
context_vocab_no_zeros = []
for word_index in range(0,len(original_word_index)):
    if context_row_sums[ 0,word_index ] != 0:
        context_word_index_no_zeros.append(word_index)
        context_vocab_no_zeros.append(original_word_index[word_index])
target_context_matrix_no_zeros = target_context_matrix_no_zeros[:, context_word_index_no_zeros]
print target_context_matrix_no_zeros.shape

(6704, 18008)
(6704, 18008)


### Calculate association measures
#### <font color='red'>Important:</font> need to convert our co-occurrence lil matrix to a csr matrix before we do any mathematical operations!
1. $ PMI(target,context) = log\frac{P(target,context)}{P(target) P(context)} $  
  * $P(target,context) = \frac{count(target,context)}{count(\_\_,\_\_)}$  
  * $P(target) = \frac{count(target,\_\_)}{count(\_\_,\_\_)}$  
  * $P(context) = \frac{count(\_\_,context)}{count(\_\_,\_\_)}$  

In [16]:
%%time
# PMI(target, context) = log( P(target, context) / (P(target) * P(context)) )
# lil_matrix is only meant for construction, so switch to csr before doing arithmetic.
target_context_matrix = target_context_matrix_no_zeros.tocsr()
count_all = target_context_matrix.sum()
count_target = target_context_matrix.sum(axis = 1) #rows
count_context = target_context_matrix.sum(axis = 0) #columns

prob_target = count_target / count_all
prob_context = count_context / count_all

#prob target and context
pmi_target_context_matrix = target_context_matrix / count_all

#divide by prob target
pmi_target_context_matrix = pmi_target_context_matrix / prob_target

#divide by prob context
pmi_target_context_matrix = pmi_target_context_matrix / prob_context

# Pairs that never co-occur have probability 0, so log() yields -inf for them. That is
# expected here, so the divide-by-zero RuntimeWarning is silenced for this one call only.
with np.errstate(divide='ignore'):
    pmi_target_context_matrix = np.log(pmi_target_context_matrix)

CPU times: user 5.38 s, sys: 4.97 s, total: 10.3 s
Wall time: 12.6 s




In [17]:
# Pairs that never co-occur have PMI log(0) = -inf, which is what fills the
# visible corners of the printed matrix.
print pmi_target_context_matrix

[[-inf -inf -inf ..., -inf -inf -inf]
 [-inf -inf -inf ..., -inf -inf -inf]
 [-inf -inf -inf ..., -inf -inf -inf]
 ..., 
 [-inf -inf -inf ..., -inf -inf -inf]
 [-inf -inf -inf ..., -inf -inf -inf]
 [-inf -inf -inf ..., -inf -inf -inf]]


Print PMI for time and year. Should be 0.6495

In [31]:
print target_vocab_no_zeros[6185]
print context_vocab_no_zeros[17914]
print pmi_target_context_matrix[(6185,17914)]

time
year
0.64944562357


### Positive PMI: convert all negative numbers to zero

In [32]:
# Element-wise maximum with 0: every negative PMI, including the -inf entries
# produced by log(0), becomes 0; positive values are kept unchanged.
ppmi_target_context_matrix = np.maximum(pmi_target_context_matrix, 0)
print ppmi_target_context_matrix

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


## Compute similarity

### Weighted jaccard:
$j (word_1, word_2) = \frac{\sum_i{\min(word_1[dim_i], word_2[dim_i])}}{\sum_i{\max(word_1[dim_i], word_2[dim_i])}}$

In [33]:
def get_similarities_for_word(target_context_matrix, word_of_interest, top_k, target_vocab=None):
    """Return the `top_k` target words most similar to `word_of_interest`.

    Similarity is the weighted Jaccard coefficient between two rows of
    `target_context_matrix`: sum of element-wise minima over sum of element-wise maxima.

    Parameters
    ----------
    target_context_matrix : 2-D array or matrix with one row per target word
        (in this notebook, the PPMI matrix).
    word_of_interest : the target word to compare every other target word against.
    top_k : maximum number of (word, similarity) pairs to return.
    target_vocab : optional sequence giving the word behind each row; defaults to the
        notebook-level `target_vocab_no_zeros`.

    Returns
    -------
    List of (word, similarity) tuples sorted by decreasing similarity.

    Raises
    ------
    ValueError if `word_of_interest` is not in the target vocabulary.
    """
    if target_vocab is None:
        target_vocab = target_vocab_no_zeros
    target_word_to_index = dict(zip(target_vocab, range(len(target_vocab))))

    if word_of_interest not in target_word_to_index:
        # Previously the lookup returned None, which numpy treats as "add an axis"
        # and silently produced meaningless similarities.
        raise ValueError("%r is not a target word" % (word_of_interest,))

    # Use the matrix that was passed in (the old body ignored this argument and read the
    # global PPMI matrix), and fetch the reference row only once.
    row_of_interest = target_context_matrix[target_word_to_index[word_of_interest]]

    j_sims_list = []
    for target_word in target_vocab:
        if target_word == word_of_interest:
            continue
        other_row = target_context_matrix[target_word_to_index[target_word]]
        numerator = np.minimum(row_of_interest, other_row).sum()
        denominator = np.maximum(row_of_interest, other_row).sum()
        # Two all-zero rows share nothing: report 0 instead of a NaN that would also
        # make the sort order unpredictable.
        if denominator != 0:
            similarity = float(numerator) / denominator
        else:
            similarity = 0.0
        j_sims_list.append((target_word, similarity))
    sorted_sims = sorted(j_sims_list, key=lambda sim: sim[1], reverse=True)
    return sorted_sims[:top_k]

In [36]:
%%time
# Probe words, and for each of them its 20 nearest neighbours in the PPMI space.
words_of_interest = ['car', 'bus', 'hospital', 'hotel', 'gun', 'bomb', 'horse', 'fox', 'table', 'bowl', 'guitar', 'piano']
similarities_dict = {
    word_of_interest: get_similarities_for_word(ppmi_target_context_matrix, word_of_interest, 20)
    for word_of_interest in words_of_interest
}


CPU times: user 14 s, sys: 684 ms, total: 14.7 s
Wall time: 15 s


## Evaluation

Read in BLESS data set (tab-separated)

In [37]:
file_name = '/Users/elisa/Documents/CompLing/compSemantics/HW3/BLESS_part.txt'
with open(file_name, 'rb') as f:
    bless_file = f.readlines()
# Each row: concept, class, relation, relatum.
bless_data = []
for line in bless_file:
    fields = line.rstrip('\r\n').split('\t')
    if len(fields) >= 4: #skip blank or malformed lines instead of failing on data[2]/data[3]
        bless_data.append(fields)
# Words carry a '-<pos>' suffix; cut at the LAST hyphen so hyphenated words stay whole.
positive_pairs = [(data[0].rsplit('-', 1)[0], data[3].rsplit('-', 1)[0])
                  for data in bless_data if data[2] in ("coord", "hyper")]
negative_pairs = [(data[0].rsplit('-', 1)[0], data[3].rsplit('-', 1)[0])
                  for data in bless_data if data[2] in ("mero", "random-n")]

### Accuracy @1 and @5

In [66]:
# For every probe word:
#   accuracy_1 - 1 if its nearest neighbour forms a positive pair with it, else 0
#   accuracy_5 - share of its first `accuracy_level` neighbours that form a positive pair
scores_dict = {}
accuracy_level = 5
positive_pair_set = set(positive_pairs) #hashed copy: constant-time membership tests
for word_of_interest in words_of_interest:
    # Slicing copes with words that have fewer than `accuracy_level` neighbours.
    top_neighbours = similarities_dict[word_of_interest][:accuracy_level]
    hits = [(word_of_interest, neighbour) in positive_pair_set
            for neighbour, similarity in top_neighbours]
    accuracy_1 = 1 if hits and hits[0] else 0
    # Explicit float division, so the result does not hinge on `from __future__ import division`.
    accuracy_5 = sum(hits) / float(accuracy_level)
    scores_dict[word_of_interest] = (accuracy_1, accuracy_5)

In [67]:
scores_array = np.asarray(scores_dict.values())
print "Average scores: ", np.mean(scores_array,axis=0)

Average scores:  [ 0.25        0.28333333]
