### Import Libraries

In [1]:
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

### Install Watermark - tool to help with reproducibility:

In [None]:
# Fetch the watermark IPython extension from the raw GitHub URL below.
# NOTE(review): `%install_ext` is, to my knowledge, deprecated as of IPython 4.0 and
# removed in IPython 5.0 -- TODO confirm, and consider installing the `watermark`
# package with pip instead so this cell keeps working on newer kernels.
%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark/watermark.py

In [2]:
# Activate the watermark extension so that the `%watermark` magic is available.
%load_ext watermark
# Record the run environment for reproducibility: last-updated timestamp with time zone,
# interpreter versions, machine details, and the versions of matplotlib, numpy and conda.
%watermark -n -t -z -u -m -v -p matplotlib,numpy,conda

last updated: Sat Jun 25 2016 15:43:56 CDT

CPython 2.7.11
IPython 4.0.3

matplotlib 1.5.1
numpy 1.10.1
conda 4.0.8

compiler   : GCC 4.2.1 (Apple Inc. build 5577)
system     : Darwin
release    : 15.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit


### Read the data

In [3]:
# Location of the parsed Wikipedia corpus on disk.
file_name = '/Users/elisa/Documents/CompLing/compSemantics/HW3/wikicorpus.txt'

# Load the whole corpus as raw bytes, one list entry per line of the file.
# The file is opened in binary mode; the lines are decoded in a later cell.
with open(file_name, 'rb') as corpus_file:
    lines = list(corpus_file)

In [4]:
%%time
words = []
for line in lines:
    match = re.search(r'^<c> ', line)
    if match:
        line = line.decode('cp1252').encode('utf-8') #convert from unicode to utf8
        words.extend([word_info for word_info in line[match.end(0):].split(" ")])
                      #if word_info.split("|")[0] not in stopwords.words('english') 
                      #and word_info.split("|")[0] != "png" #this word was not parsed correctly by CandC
                      #and word_info.split("|"[0] != ""
                      #and word_info.split("|"[1] != ""]) #if word was actual "|"

CPU times: user 44.7 s, sys: 4.98 s, total: 49.7 s
Wall time: 56.3 s


### Create a data frame

In [5]:
%%time
original_df = pd.DataFrame(words,columns=['word_info'])
df = pd.DataFrame(list(original_df.word_info.str.split("|")))

CPU times: user 1min 13s, sys: 3min 54s, total: 5min 8s
Wall time: 6min 30s


### Clean up the data
#### Drop spurious columns and rename the ones we want to keep

In [6]:
# Names for the first six "|"-separated fields, in positional order.
field_names = ['word', 'lemma', 'pos', 'chunk', 'entity', 'ccg']

# Any column beyond those six is spurious and is dropped.
# Both operations modify `df` in place.
surplus_columns = df.columns[len(field_names):]
df.drop(surplus_columns, axis=1, inplace=True)

# Map the integer column labels 0..5 to the field names above.
df.rename(columns=dict(enumerate(field_names)), inplace=True)
df.head()

Unnamed: 0,word,lemma,pos,chunk,entity,ccg
0,Anarchism,Anarchism,NNP,I-NP,O,N
1,.,.,.,O,O,.\n
2,Anarchism,Anarchism,NNP,I-NP,O,N
3,is,be,VBZ,I-VP,O,(S[dcl]\NP)/NP
4,a,a,DT,I-NP,O,NP[nb]/N


#### Get rid of function words

In [7]:
# Drop every row whose surface form appears in NLTK's English stop-word list.
# NOTE(review): `isin` is an exact, case-sensitive comparison, so capitalised forms
# (e.g. sentence-initial words) survive if the stop-word list is all lowercase --
# confirm that this is intended.
english_stopwords = stopwords.words('english')
df = df[~df.word.isin(english_stopwords)]

# Summarise the filtered frame. The former bare `df.head()` before this call was
# removed: it was not the last expression in the cell, so its result was never shown.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6719445 entries, 0 to 10000012
Data columns (total 6 columns):
word      object
lemma     object
pos       object
chunk     object
entity    object
ccg       object
dtypes: object(6)
memory usage: 358.9+ MB


### Look at just nouns for our targets

In [8]:
# Keep only rows whose part-of-speech tag begins with "N" (e.g. the NNP tag seen above).
is_noun = df['pos'].str.startswith('N')
df_targets = df.loc[is_noun]
df_targets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2938351 entries, 0 to 10000011
Data columns (total 6 columns):
word      object
lemma     object
pos       object
chunk     object
entity    object
ccg       object
dtypes: object(6)
memory usage: 156.9+ MB


### Get the top 50 most frequent noun lemmas
#### How big is our vocabulary?

In [10]:
# count / unique / top / freq summary of the noun lemmas: `unique` is the vocabulary size.
df_targets['lemma'].describe()

count     2938351
unique     171287
top          time
freq        13609
Name: lemma, dtype: object

In [None]:
%%time
v = CountVectorizer(ngram_range=(1,1))
count_features = v.fit_transform(df_targets.lemma[:500000]).toarray()
dist = np.sum(count_features, axis=0)
vocab = v.get_feature_names()
sorted_counts = sorted(zip(vocab, dist), key=lambda count: count[1], reverse=True)
top_50 = sorted_counts[:50]
print top_50