### Import Libraries

In [1]:
from itertools import chain 
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
from nltk.collocations import *
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from string import punctuation

%matplotlib inline

### Install Watermark - tool to help with reproducibility:

In [None]:
%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark/watermark.py

In [2]:
%load_ext watermark
%watermark -n -t -z -u -m -v -p matplotlib,numpy,conda,pandas,nltk

last updated: Mon Jun 27 2016 16:08:30 CDT

CPython 2.7.11
IPython 4.0.3

matplotlib 1.5.1
numpy 1.10.1
conda 4.0.8
pandas 0.17.1
nltk 3.0.3

compiler   : GCC 4.2.1 (Apple Inc. build 5577)
system     : Darwin
release    : 15.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit


### Read the data

In [3]:
file_name = '/Users/elisa/Documents/CompLing/compSemantics/HW3/wikicorpus.txt'

with open(file_name, 'rb') as f:
    lines = f.readlines()

In [4]:
%%time
sents = []
for line in lines:
    match = re.search(r'^<c> ', line)
    if match:
        line = line.decode('cp1252').encode('utf-8') #convert from unicode to utf8
        tagged_words = [word_info for word_info in line[match.end(0):].split(" ")]
        sents.append([tagged_word.split('|') for tagged_word in tagged_words])
print sents[:3]

[[['Anarchism', 'Anarchism', 'NNP', 'I-NP', 'O', 'N'], ['.', '.', '.', 'O', 'O', '.\n']], [['Anarchism', 'Anarchism', 'NNP', 'I-NP', 'O', 'N'], ['is', 'be', 'VBZ', 'I-VP', 'O', '(S[dcl]\\NP)/NP'], ['a', 'a', 'DT', 'I-NP', 'O', 'NP[nb]/N'], ['political', 'political', 'JJ', 'I-NP', 'O', 'N/N'], ['philosophy', 'philosophy', 'NN', 'I-NP', 'O', 'N'], ['encompassing', 'encompass', 'VBG', 'I-VP', 'O', '(S[ng]\\NP)/NP'], ['theories', 'theory', 'NNS', 'I-NP', 'O', 'N'], ['and', 'and', 'CC', 'I-NP', 'O', 'conj'], ['attitudes', 'attitude', 'NNS', 'I-NP', 'O', 'N'], ['which', 'which', 'WDT', 'B-NP', 'O', '(NP\\NP)/(S[dcl]\\NP)'], ['consider', 'consider', 'VBP', 'I-VP', 'O', '((S[dcl]\\NP)/(S[to]\\NP))/NP'], ['the', 'the', 'DT', 'I-NP', 'O', 'NP[nb]/N'], ['state', 'state', 'NN', 'I-NP', 'O', 'N'], ['to', 'to', 'TO', 'I-VP', 'O', '(S[to]\\NP)/(S[b]\\NP)'], ['be', 'be', 'VB', 'I-VP', 'O', '(S[b]\\NP)/(S[adj]\\NP)'], ['unnecessary', 'unnecessary', 'JJ', 'I-ADJP', 'O', 'S[adj]\\NP'], [',', ',', ',', 'I

### Clean up the data
#### Lower case, remove stop words, punctuation

In [5]:
%%time
cleaned_sents = [[word for word in sent 
                  if word[0].lower() not in stopwords.words('english')
                  and word[0] not in punctuation
                 ] for sent in sents] 
cleaned_sents[:3]

CPU times: user 15min 17s, sys: 2min 24s, total: 17min 42s
Wall time: 17min 47s




### Look at lemmas of just nouns for our targets

In [6]:
tag_index = 1 #lemma
target_sents = [[word[tag_index].lower() for word in sent 
                  if word[2].startswith('N')
                 ] for sent in cleaned_sents] 
target_sents[:3]

[['anarchism'],
 ['anarchism', 'philosophy', 'theory', 'attitude', 'state'],
 ['specific', 'anarchist', 'criterion', 'anarchism', 'criterion']]

### Get the top 50 most frequent noun lemmas

In [7]:
target_text = [" ".join(target_sent) for target_sent in target_sents]

In [8]:
%%time
v_target = CountVectorizer(ngram_range=(1,1))
unigram_matrix = v_target.fit_transform(target_text)
#sort
features_count = unigram_matrix.sum(axis=0).tolist()[0]
features_names = v_target.get_feature_names()
sorted_counts = sorted(zip(features_names, features_count), key=lambda count: count[1], reverse=True)
top_50 = sorted_counts[:50]
print top_50

[(u'time', 14192L), (u'year', 13251L), (u'system', 9949L), (u'city', 9743L), (u'number', 9571L), (u'world', 9144L), (u'state', 8573L), (u'part', 7927L), (u'example', 7233L), (u'century', 7180L), (u'war', 7070L), (u'name', 6858L), (u'people', 6851L), (u'group', 6796L), (u'country', 6772L), (u'area', 6441L), (u'language', 6406L), (u'work', 6292L), (u'united', 6174L), (u'use', 6128L), (u'government', 6094L), (u'game', 6077L), (u'term', 5822L), (u'form', 5736L), (u'book', 5511L), (u'life', 5452L), (u'church', 5433L), (u'day', 5106L), (u'member', 4991L), (u'case', 4882L), (u'new', 4816L), (u'film', 4755L), (u'word', 4750L), (u'states', 4693L), (u'law', 4690L), (u'force', 4689L), (u'history', 4623L), (u'power', 4615L), (u'man', 4503L), (u'order', 4459L), (u'point', 4458L), (u'school', 4425L), (u'way', 4360L), (u'series', 4154L), (u'death', 4130L), (u'line', 4097L), (u'population', 4097L), (u'team', 4018L), (u'text', 3964L), (u'end', 3944L)]
CPU times: user 6.48 s, sys: 270 ms, total: 6.75 s


#### How big is our  vocabulary?

In [9]:
target_words = set([target_sent for target_sent in chain.from_iterable(target_sents)])
print len(target_words)

151372


In [10]:
context_sents = [[word[tag_index].lower() for word in sent] for sent in cleaned_sents] 
context_words = set([context_sent for context_sent in chain.from_iterable(context_sents)])
print len(context_words)

217953


### Get collocations
#### stop at sentence boundary

In [36]:
%%time
window_size = 5
bigramFreqDist = FreqDist()
for target_word in top_50:
    #target_word = 'anarchism'
    #print "target word ", target_word[0]
    for context_sent in context_sents:
        #print "context sent ", context_sent
        context_indices = [context_index for context_index, context_word in enumerate(context_sent) if context_word == target_word[0]]
        #print "context indices ", context_indices
        for context_index in context_indices:
            #AFTER
            for i in xrange(1,window_size+1):
                if(context_index+i) >= len(context_sent):
                    #print "end of sentence!"
                    break
                else:
                    context_word = context_sent[context_index+i]
                    #print "context_word", context_word
                    bigramFreqDist[(target_word[0],context_word)] += 1
            #BEFORE
            for i in xrange(1,window_size+1):
                if(context_index-i) < 0:
                    #print "before end of sentence!"
                    break
                else:
                    context_word = context_sent[context_index-i]
                    #print "before context_word", context_word
                    bigramFreqDist[(target_word[0],context_word)] += 1
print bigramFreqDist

<FreqDist with 558210 samples and 2670887 outcomes>
CPU times: user 1min 47s, sys: 1.35 s, total: 1min 48s
Wall time: 1min 51s




### Calculate association measures
1. PMI

In [39]:
%%time
context_text = [" ".join(context_sent) for context_sent in context_sents]
v_context = CountVectorizer(ngram_range=(1,1))
unigram_matrix = v_context.fit_transform(context_text)
#sort
features_count = unigram_matrix.sum(axis=0).tolist()[0]
features_names = v_context.get_feature_names()
sorted_counts = sorted(zip(features_names, features_count), key=lambda count: count[1], reverse=True)
top_50 = sorted_counts[:50]
print top_50

[(u'use', 28535L), (u'also', 24074L), (u'one', 22231L), (u'first', 17037L), (u'time', 14991L), (u'include', 14559L), (u'year', 14148L), (u'two', 14126L), (u'many', 13470L), (u'make', 13016L), (u'new', 12436L), (u'may', 11720L), (u'would', 10833L), (u'become', 10704L), (u'state', 10522L), (u'system', 9967L), (u'number', 9860L), (u'city', 9804L), (u'however', 9794L), (u'name', 9638L), (u'form', 9623L), (u'know', 9535L), (u'work', 9431L), (u'call', 9392L), (u'world', 9312L), (u'take', 8610L), (u'see', 8079L), (u'part', 8061L), (u'give', 7768L), (u'century', 7482L), (u'war', 7327L), (u'well', 7320L), (u'example', 7240L), (u'later', 7204L), (u'often', 7130L), (u'group', 6975L), (u'people', 6869L), (u'country', 6833L), (u'early', 6732L), (u'three', 6726L), (u'since', 6704L), (u'language', 6600L), (u'term', 6534L), (u'write', 6496L), (u'follow', 6483L), (u'find', 6473L), (u'area', 6467L), (u'united', 6254L), (u'game', 6210L), (u'government', 6200L)]
CPU times: user 10.1 s, sys: 601 ms, total:

In [50]:
for(bigram in bigramFreqDist.keys()):
    target_word = bigram[0]
    context_word = bigram[1]
    joint_prob = bigramFreqDist.get(bigram) / features_count[v_context.vocabulary_.get(target_word)]

2


In [60]:
v_context.vocabulary_.get('time')

143632

In [62]:
143632]

14991L