### Import Libraries

In [1]:
from itertools import chain 
import numpy as np
from nltk.corpus import stopwords
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from string import punctuation

from __future__ import division

%matplotlib inline

### Install Watermark - tool to help with reproducibility:

In [None]:
# One-time setup: download the watermark IPython extension from GitHub.
# NOTE(review): %install_ext is believed to be deprecated in newer IPython releases;
# if the magic is unavailable, installing the `watermark` package with pip is the
# likely replacement -- confirm against the IPython version in use.
%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark/watermark.py

In [2]:
# Load the watermark extension and print an environment report for reproducibility.
# The recorded output below shows a timestamp, the CPython/IPython versions, the
# versions of the packages listed after -p, and machine details.
# NOTE(review): scikit-learn is imported by this notebook but is not in the -p
# list -- consider adding it so its version is recorded too.
%load_ext watermark
%watermark -n -t -z -u -m -v -p nltk,numpy,scipy

last updated: Wed Jul 13 2016 23:32:26 CDT

CPython 2.7.11
IPython 4.0.3

nltk 3.0.3
numpy 1.10.1
scipy 0.17.0

compiler   : GCC 4.2.1 (Apple Inc. build 5577)
system     : Darwin
release    : 15.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit


### Read the data

In [3]:
def read_data(file_name):
    """Read the tagged corpus and return its sentences as lists of tagged words.

    Only lines starting with ``<c> `` are treated as sentences; every other
    line is skipped. After the ``<c>`` marker, a sentence line is a
    space-separated list of tokens, and each token is a ``|``-separated list
    of fields.

    Parameters
    ----------
    file_name : str
        Path of the corpus file, encoded as Windows cp1252.

    Returns
    -------
    list of list of list
        One entry per sentence; each sentence is a list of words and each word
        is the list of its ``|``-separated fields (decoded text).

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If a sentence line contains bytes that are not valid cp1252.
    """
    sents = []
    with open(file_name, 'rb') as f:
        for raw_line in f:
            # The file is read as bytes, so test against a bytes literal; this
            # works on both Python 2 and Python 3.
            if not raw_line.startswith(b"<c> "):
                continue
            # Decode from Windows cp1252 to avoid unicode issues, and drop the
            # line ending so the last field of the last word carries no "\n".
            line = raw_line.decode('cp1252').rstrip("\r\n")
            tagged_words = line.split(" ")[1:]  # skip the <c>
            # Consecutive spaces yield empty tokens; skip them so that every
            # word really has fields.
            sents.append([tagged_word.split('|')
                          for tagged_word in tagged_words if tagged_word])
    return sents

### Clean up the data
#### Remove stop words, punctuation, empty strings, then lowercase and strip punctuation

In [4]:
def clean_sents(sents, tag_index):
    """Drop uninformative words and normalise the field selected by ``tag_index``.

    A word is dropped when its field 1 (presumably the lemma -- confirm against
    the corpus format) is an English stop word, a single punctuation character,
    or blank. For every kept word, field ``tag_index`` is lowercased and
    stripped of surrounding whitespace and punctuation; words whose normalised
    field ends up empty (e.g. "--") are dropped as well.

    The input is not modified: each kept word is copied before its field is
    rewritten.

    Parameters
    ----------
    sents : list of list of list of str
        Sentences as produced by ``read_data``.
    tag_index : int
        Index of the field to normalise in each word.

    Returns
    -------
    list of list of list of str
        One (possibly empty) list of cleaned words per input sentence.
    """
    stopwords_set = set(stopwords.words('english'))
    punctuation_set = set(punctuation)
    cleaned_sents = []
    for sent in sents:
        cleaned_sent = []
        for word in sent:
            filter_field = word[1]
            if (filter_field.lower() in stopwords_set
                    or filter_field in punctuation_set
                    or not filter_field.strip()):
                continue
            normalised = word[tag_index].lower().strip().strip(punctuation)
            if not normalised:
                # Multi-character punctuation passes the single-character check
                # above but is reduced to an empty string here.
                continue
            cleaned_word = list(word)  # copy so the caller's data stays intact
            cleaned_word[tag_index] = normalised
            cleaned_sent.append(cleaned_word)
        cleaned_sents.append(cleaned_sent)
    return cleaned_sents

### Load and clean the data

In [5]:
def load_and_clean_corpus(tag_index, file_name='wikicorpus.txt'):
    """Read the corpus in ``file_name`` and return its cleaned, tagged sentences.

    ``tag_index`` is passed on to ``clean_sents`` to select the field of each
    word that gets normalised.
    """
    return clean_sents(read_data(file_name), tag_index)

### Look at lemmas of just nouns for our targets

In [6]:
def my_tokenizer(s):
    """Tokenise ``s`` on runs of whitespace only, so hyphenated words stay whole."""
    tokens = s.split()
    return tokens

In [17]:
def get_word_count_index(sents, min_frequency):
    """Fit a unigram ``CountVectorizer`` on ``sents`` and return it.

    Each sentence (a list of word strings) is joined with single spaces and
    treated as one document. The fitted vectorizer's ``vocabulary_`` maps every
    retained word to its column index.

    Parameters
    ----------
    sents : list of list of str
        Tokenised sentences.
    min_frequency : int or float
        Passed to ``min_df``: an int is the minimum number of sentences a word
        must appear in, a float is a proportion of sentences.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer.
    """
    text = [" ".join(sent) for sent in sents]
    vectorizer = CountVectorizer(
        ngram_range=(1, 1),
        tokenizer=my_tokenizer,  # whitespace-only split, so hyphenated words are not broken up
        token_pattern=None,      # a token pattern is never used when a tokenizer is supplied
        min_df=min_frequency,    # ignore words found in fewer than min_frequency sentences
    )
    # Only the fitted vocabulary is needed; the document-term matrix is not.
    vectorizer.fit(text)
    return vectorizer

def get_target_words_index(cleaned_tagged_sents, tag_index, min_frequency_target_words=50):
    """Build the vocabulary index of target words.

    Keeps, per sentence, field ``tag_index`` of every word whose field 2
    starts with ``'N'``, then fits a word-count index on those reduced
    sentences with ``min_frequency_target_words`` as the frequency threshold.
    """
    noun_sents = []
    for sent in cleaned_tagged_sents:
        nouns = [word[tag_index] for word in sent if word[2].startswith('N')]
        noun_sents.append(nouns)
    return get_word_count_index(noun_sents, min_frequency_target_words)

def remove_tags_and_infrequent_words(cleaned_tagged_sents, tag_index, min_frequency_context_words=20):
    """Reduce tagged sentences to plain words and drop the infrequent ones.

    Parameters
    ----------
    cleaned_tagged_sents : list of list of list of str
        Sentences whose words are lists of fields.
    tag_index : int
        Index of the field to keep for each word.
    min_frequency_context_words : int or float
        Frequency threshold handed to ``get_word_count_index``.

    Returns
    -------
    (list of list of str, CountVectorizer)
        The sentences restricted to words in the fitted vocabulary, and the
        fitted vectorizer itself.
    """
    corpus_sents = [[word[tag_index] for word in sent] for sent in cleaned_tagged_sents]
    vectorizer_corpus = get_word_count_index(corpus_sents, min_frequency_context_words)
    # A word survives exactly when the vectorizer kept it, so test membership in
    # the vocabulary directly instead of materialising the set of all corpus
    # words and subtracting the vocabulary from it.
    vocabulary = vectorizer_corpus.vocabulary_
    filtered_corpus_sents = [[word for word in sent if word in vocabulary]
                             for sent in corpus_sents]
    return filtered_corpus_sents, vectorizer_corpus

def create_collocation_matrix_with_window(filtered_corpus_sents, vectorizer_corpus, vectorizer_target, window_size=2):
    """Count co-occurrences of target words with their neighbours in a window.

    For every occurrence of a target word, each word at most ``window_size``
    positions before or after it in the same sentence adds one to the cell
    ``[index(target), index(neighbour)]``. Windows never cross sentence
    boundaries.

    Parameters
    ----------
    filtered_corpus_sents : list of list of str
        Sentences containing only words of the corpus vocabulary.
    vectorizer_corpus : object with a ``vocabulary_`` dict
        Maps every corpus word to its row/column index.
    vectorizer_target : object with a ``vocabulary_`` dict
        Its keys are the target words.
    window_size : int
        Number of neighbours considered on each side of a target word.

    Returns
    -------
    scipy.sparse.lil_matrix
        Square integer matrix with one row and one column per corpus word;
        rows of non-target words stay empty.

    Raises
    ------
    KeyError
        If a sentence contains a word that is missing from the corpus
        vocabulary.
    """
    vocabulary = vectorizer_corpus.vocabulary_
    vocab_size = len(vocabulary)
    target_words = set(vectorizer_target.vocabulary_)
    # The builtin int is what the removed np.int alias used to refer to.
    target_context_matrix = lil_matrix((vocab_size, vocab_size), dtype=int)
    for context_sent in filtered_corpus_sents:
        for target_index, target_word in enumerate(context_sent):
            if target_word not in target_words:
                continue
            row = vocabulary[target_word]
            # Slicing clips at the sentence end; max() clips at its start.
            before = context_sent[max(0, target_index - window_size):target_index]
            after = context_sent[target_index + 1:target_index + 1 + window_size]
            for context_word in before + after:
                target_context_matrix[row, vocabulary[context_word]] += 1
    return target_context_matrix

def remove_empty_rows_columns(target_context_matrix, vectorizer_corpus):
    """Drop the rows and columns of the co-occurrence matrix that sum to zero.

    Prints the original shape of the matrix as a side effect.

    Parameters
    ----------
    target_context_matrix : scipy sparse matrix
        Square matrix with one row and one column per vocabulary word.
    vectorizer_corpus : object with a ``vocabulary_`` dict
        Maps each word to its row/column index in the matrix.

    Returns
    -------
    (matrix, list, list, list)
        The reduced matrix, the words of the kept rows (in row order), the
        words whose rows were dropped, and the words whose columns were
        dropped.
    """
    # Formatted so the output is identical under Python 2 and Python 3.
    print("original shape:  %s" % (target_context_matrix.shape,))

    # Words ordered by matrix index. Derived from vocabulary_ so that it does
    # not depend on which feature-name accessor the installed scikit-learn has.
    vocabulary = vectorizer_corpus.vocabulary_
    original_word_index = sorted(vocabulary, key=vocabulary.get)

    row_sums = np.asarray(target_context_matrix.sum(axis=1)).ravel()     # per target word
    column_sums = np.asarray(target_context_matrix.sum(axis=0)).ravel()  # per context word

    target_word_index_no_zeros = []
    target_vocab_no_zeros = []
    dropped_target_words = []
    context_word_index_no_zeros = []
    dropped_context_words = []
    for word_index, word in enumerate(original_word_index):
        if row_sums[word_index] != 0:
            target_word_index_no_zeros.append(word_index)
            target_vocab_no_zeros.append(word)
        else:
            dropped_target_words.append(word)
        if column_sums[word_index] != 0:
            context_word_index_no_zeros.append(word_index)
        else:
            dropped_context_words.append(word)

    # Select the rows first, then the columns.
    target_context_matrix_no_zeros = target_context_matrix[target_word_index_no_zeros, :]
    target_context_matrix_no_zeros = target_context_matrix_no_zeros[:, context_word_index_no_zeros]

    return target_context_matrix_no_zeros, target_vocab_no_zeros, dropped_target_words, dropped_context_words

In [18]:
def create_collocation_matrix(cleaned_sents, tag_index):
    """Build the target-by-context co-occurrence matrix for the cleaned corpus.

    Steps: index the target words, reduce the corpus to frequent plain words,
    count windowed co-occurrences, then drop all-zero rows and columns. When
    anything was dropped, the number of dropped target and context words is
    printed.

    Parameters
    ----------
    cleaned_sents : list of list of list of str
        Cleaned, tagged sentences.
    tag_index : int
        Index of the word field used as the word's identity.

    Returns
    -------
    (matrix, list)
        The reduced co-occurrence matrix and the words labelling its rows.
    """
    vectorizer_target = get_target_words_index(cleaned_sents, tag_index)
    filtered_corpus_sents, vectorizer_corpus = remove_tags_and_infrequent_words(cleaned_sents, tag_index)
    full_matrix = create_collocation_matrix_with_window(filtered_corpus_sents, vectorizer_corpus, vectorizer_target)
    (target_context_matrix, target_word_index,
     dropped_target_words, dropped_context_words) = remove_empty_rows_columns(full_matrix, vectorizer_corpus)
    if dropped_target_words or dropped_context_words:
        # Formatted so the output is identical under Python 2 and Python 3.
        print("Dropped  %d  target words" % len(dropped_target_words))
        print("Dropped  %d  context words" % len(dropped_context_words))
    return target_context_matrix, target_word_index

### Get collocations
#### stop at sentence boundary
* T x C matrix, where T=target words (rows) and C=context words (columns)  
* use lil_matrix because it is efficient for building sparse matrices  
* Note: We technically don't need to build a full co-occurrence matrix because our target words are just the nouns that occur in at least 50 sentences, but for flexibility we will build a C x C matrix so that we can easily switch to a different set of target words later.

### Remove rows and columns with zero values (to avoid problems with division later)

### Calculate association measures
#### <font color='red'>Important:</font> need to convert our co-occurrence lil matrix to a csr matrix before we do any mathematical operations!
1. $ PMI(target,context) = \log\frac{P(target,context)}{P(target) P(context)} $  
  * $P(target,context) = \frac{count(target,context)}{count(\_\_,\_\_)}$  
  * $P(target) = \frac{count(target,\_\_)}{count(\_\_,\_\_)}$  
  * $P(context) = \frac{count(\_\_,context)}{count(\_\_,\_\_)}$  

In [19]:
def calculate_ppmi(target_context_matrix):
    """Convert a co-occurrence count matrix into positive PMI weights.

    ``PMI(t, c) = log(P(t, c) / (P(t) * P(c)))``, where the probabilities are
    the cell count, the row sum and the column sum, each divided by the total
    count. Negative PMI values are replaced by 0, and so are cells whose count
    is 0 (their PMI would be minus infinity).

    The logarithm is only evaluated on non-zero cells, so no divide-by-zero
    warning is raised and empty rows or columns give 0 rather than NaN.

    Parameters
    ----------
    target_context_matrix : scipy sparse matrix
        Non-negative co-occurrence counts (targets x contexts).

    Returns
    -------
    numpy.matrix
        Dense matrix of PPMI values with the same shape as the input.
    """
    # Work on a dense float array: the result is dense anyway, and floats keep
    # every division below a true division.
    counts = np.asarray(target_context_matrix.toarray(), dtype=float)
    count_all = counts.sum()
    count_target = counts.sum(axis=1, keepdims=True)   # row sums, shape (T, 1)
    count_context = counts.sum(axis=0, keepdims=True)  # column sums, shape (1, C)

    # P(t,c) / (P(t) P(c)) simplifies to count(t,c) * N / (count(t) * count(c)).
    expected = count_target * count_context

    ppmi_target_context_matrix = np.zeros_like(counts)
    # A positive cell implies positive row and column sums, so the ratio is
    # well defined wherever this mask is True.
    observed = counts > 0
    ppmi_target_context_matrix[observed] = np.log(
        counts[observed] * count_all / expected[observed])

    # Positive PMI: clip the negative associations to zero.
    np.maximum(ppmi_target_context_matrix, 0, out=ppmi_target_context_matrix)
    return np.asmatrix(ppmi_target_context_matrix)

### Positive PMI: convert all negative numbers to zero

## Compute similarity

### Weighted jaccard:
$j (word_1, word_2) = \frac{\sum_i \min(word_1[dim_i], word_2[dim_i])}{\sum_i \max(word_1[dim_i], word_2[dim_i])}$

In [20]:
def get_similarities_for_word(target_context_matrix, target_word_index, word_of_interest, sim_measure='weighted_jaccard', top_k=20):
    """Rank the other target words by weighted Jaccard similarity to one word.

    The similarity of two rows is the sum of their element-wise minima divided
    by the sum of their element-wise maxima. When the denominator is 0 (both
    rows are all zeros) the similarity is reported as 0.0.

    Parameters
    ----------
    target_context_matrix : 2-D array or matrix
        One row of context weights per target word.
    target_word_index : list of str
        The word labelling each row, in row order.
    word_of_interest : str
        The word to compare every other target word against.
    sim_measure : str
        Currently not read; weighted Jaccard is always used.
    top_k : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of (str, float)
        Up to ``top_k`` ``(word, similarity)`` pairs, most similar first.

    Raises
    ------
    KeyError
        If ``word_of_interest`` is not in ``target_word_index``.
    """
    target_word_to_index = dict(zip(target_word_index, range(len(target_word_index))))
    if word_of_interest not in target_word_to_index:
        # Indexing the matrix with a missing (None) index would not fail; it
        # would silently produce meaningless similarities.
        raise KeyError("%r is not one of the target words" % (word_of_interest,))

    row_of_interest = target_context_matrix[target_word_to_index[word_of_interest]]
    j_sims_list = []
    for target_word in target_word_index:
        if target_word == word_of_interest:
            continue
        other_row = target_context_matrix[target_word_to_index[target_word]]
        numerator = np.minimum(row_of_interest, other_row).sum()
        denominator = np.maximum(row_of_interest, other_row).sum()
        similarity = numerator / float(denominator) if denominator else 0.0
        j_sims_list.append((target_word, similarity))
    j_sims_list.sort(key=lambda sim: sim[1], reverse=True)
    return j_sims_list[:top_k]

In [21]:
def get_similarities_for_words(target_context_matrix, words_of_interest, top_k=20, target_word_index=None):
    """Collect the most similar target words for several words at once.

    Parameters
    ----------
    target_context_matrix : 2-D array or matrix
        One row of context weights per target word.
    words_of_interest : iterable of str
        The words to look up.
    top_k : int
        Maximum number of neighbours returned per word.
    target_word_index : list of str
        The word labelling each matrix row. It is required; it is a keyword
        with a ``None`` default only so the original positional parameters
        keep their places.

    Returns
    -------
    dict
        Maps each word of interest to its list of ``(word, similarity)`` pairs.

    Raises
    ------
    ValueError
        If ``target_word_index`` is not supplied.
    """
    if target_word_index is None:
        raise ValueError("target_word_index is required to map words to matrix rows")
    word_to_similarities = {}
    for word_of_interest in words_of_interest:
        word_to_similarities[word_of_interest] = get_similarities_for_word(
            target_context_matrix, target_word_index, word_of_interest, top_k=top_k)
    return word_to_similarities


## Evaluation

Read in BLESS data set (tab-separated)

In [22]:
def get_eval_data(file_name='BLESS_part.txt'):
    """Read BLESS pairs and split them into positive and negative examples.

    Each usable line has at least four tab-separated fields: concept, class,
    relation, relatum. Only the text before the first ``-`` of the concept and
    of the relatum is kept. Pairs with relation ``coord`` or ``hyper`` are
    positive, pairs with relation ``mero`` or ``random-n`` are negative, and
    every other relation is ignored. Lines with fewer than four fields (for
    example blank lines) are skipped.

    Parameters
    ----------
    file_name : str
        Path of the tab-separated BLESS file.

    Returns
    -------
    (list of (str, str), list of (str, str))
        The positive pairs and the negative pairs, in file order.
    """
    positive_relations = ("coord", "hyper")
    negative_relations = ("mero", "random-n")
    positive_pairs = []
    negative_pairs = []
    # Text mode keeps the fields as native strings on both Python 2 and 3.
    with open(file_name, 'r') as f:
        for line in f:
            # Drop the line ending so it cannot stick to the relatum.
            data = line.rstrip('\r\n').split('\t')  # concept, class, relation, relatum
            if len(data) < 4:
                continue
            pair = (data[0].split('-')[0], data[3].split('-')[0])
            if data[2] in positive_relations:
                positive_pairs.append(pair)
            elif data[2] in negative_relations:
                negative_pairs.append(pair)
    return positive_pairs, negative_pairs

### Accuracy @1 and @5

In [29]:
def get_scores(words_of_interest, target_context_matrix, target_word_index):
    """Score the nearest neighbours of each word against the BLESS positive pairs.

    For every word of interest the most similar target words are retrieved.
    Accuracy@1 is 1 when the top neighbour forms a positive pair with the word
    (0 otherwise); accuracy@5 is the fraction of the top five neighbours that
    do. A word with fewer than five neighbours is still scored out of five.

    Prints the average scores and the full similarity dictionary.

    Parameters
    ----------
    words_of_interest : iterable of str
        Words to evaluate.
    target_context_matrix : 2-D array or matrix
        One row of context weights per target word.
    target_word_index : list of str
        The word labelling each matrix row.

    Returns
    -------
    (numpy.ndarray, dict)
        The mean ``[accuracy@1, accuracy@5]`` over all words, and a dict
        mapping each word to its ``(neighbour, similarity)`` list.
    """
    accuracy_level = 5
    words_to_similarities = {}
    words_to_scores = {}

    positive_pairs, _ = get_eval_data()
    positive_pairs = set(positive_pairs)  # constant-time membership tests

    for word_of_interest in words_of_interest:
        similarities = get_similarities_for_word(target_context_matrix, target_word_index, word_of_interest)
        words_to_similarities[word_of_interest] = similarities
        # Slicing copes with fewer than accuracy_level neighbours.
        hits = [(word_of_interest, neighbour[0]) in positive_pairs
                for neighbour in similarities[:accuracy_level]]
        accuracy_1 = 1 if hits and hits[0] else 0
        # float() keeps this a true division even without the __future__ import.
        accuracy_5 = sum(hits) / float(accuracy_level)
        words_to_scores[word_of_interest] = (accuracy_1, accuracy_5)

    # list() is needed on Python 3, where values() is a view object.
    scores_array = np.asarray(list(words_to_scores.values()))
    mean_scores = np.mean(scores_array, axis=0)
    # Formatted so the output is identical under Python 2 and Python 3.
    print("Average scores:  %s" % (mean_scores,))
    print(words_to_similarities)
    return mean_scores, words_to_similarities

### Let's put it all together now

In [24]:
def create_dist_space(assoc_measure='ppmi', tag_index=1):
    """Build the PPMI-weighted distributional space from the corpus.

    ``tag_index`` selects the word field used as the word's identity; the
    default, 1, is the lemma. ``assoc_measure`` is not read by this function:
    PPMI weighting is always applied.

    Returns the PPMI matrix and the list of words labelling its rows.
    """
    cooccurrence_counts, target_word_index = create_collocation_matrix(
        load_and_clean_corpus(tag_index), tag_index)
    return calculate_ppmi(cooccurrence_counts), target_word_index
    
def evaluate_similarities(words_of_interest, target_context_matrix, target_word_index):
    """Evaluate the distributional space on the given words.

    Delegates to ``get_scores``, which also prints the results, and hands its
    result back to the caller instead of discarding it.

    Returns
    -------
    (mean_scores, words_to_similarities)
        The mean accuracy scores and the per-word similarity lists, exactly as
        returned by ``get_scores``.
    """
    mean_scores, words_to_similarities = get_scores(words_of_interest, target_context_matrix, target_word_index)
    return mean_scores, words_to_similarities
    


### Here we go!

In [30]:
%%time
# End-to-end run: build the distributional space from the corpus, then evaluate
# the nearest neighbours of a hand-picked list of words. The evaluation prints
# the average scores followed by the full neighbour lists (see the output below).
words_of_interest = ['car', 'bus', 'hospital', 'hotel', 'gun', 'bomb', 'horse', 'fox', 'table', 'bowl', 'guitar', 'piano']
# Uses the defaults of create_dist_space (PPMI weighting, tag_index=1).
target_context_matrix, target_word_index = create_dist_space()
evaluate_similarities(words_of_interest, target_context_matrix, target_word_index)

Average scores:  [ 0.41666667  0.28333333]
{'horse': [(u'cavalry', 0.089168351009227265), (u'breed', 0.088934040702232681), (u'ride', 0.085833037620169389), (u'dog', 0.084762836465707564), (u'infantry', 0.078155757924123673), (u'cat', 0.075974609160722642), (u'cattle', 0.073298121335489547), (u'guard', 0.071783858820123683), (u'hunting', 0.071090585340690668), (u'camel', 0.070678610212304094), (u'animal', 0.070479364506001813), (u'foot', 0.069525471459616614), (u'soldier', 0.067807764193132961), (u'arm', 0.067557975650351418), (u'mount', 0.065356432752122412), (u'car', 0.064230313858709601), (u'kill', 0.064194697134183867), (u'shoot', 0.064169795500626334), (u'gun', 0.064138470248621834), (u'goat', 0.064000095875275012)], 'bomb': [(u'bomber', 0.11652294813541221), (u'bombing', 0.1152279231539908), (u'missile', 0.096065285876594936), (u'weapon', 0.094589898534941305), (u'aircraft', 0.09456271585578041), (u'rocket', 0.092226166852020411), (u'fighter', 0.081785994036849066), (u'combat', 0