# Cache Non-Terminal Counts

In [41]:
import pandas as pd
from collections import defaultdict

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'Rule'])

nonterminal_count = defaultdict(int)
for count, _, nonterminal in df[df['Rule Type'] == 'NONTERMINAL'].values:
    nonterminal_count[nonterminal] = count

# $q_\text{MLE}(X \rightarrow Y_1Y_2 | X) = \frac{\text{count}(X \rightarrow Y_1Y_2)}{\text{count}(X)}$

In [42]:
import pandas as pd

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'LHS', 'One', 'Two'])

binary_rules = df[df['Rule Type'] == 'BINARYRULE']

In [43]:
fn = lambda row: row['Count'] / float(nonterminal_count[row['LHS']])
binary_rules['MLE'] = binary_rules.apply(fn, axis=1)
del binary_rules['Rule Type']

binary_rules.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


Unnamed: 0,Count,LHS,One,Two,MLE
519,1,VP,VERB,SBAR+VP,0.001245
520,4,VP,VERB,NP+PRON,0.004981
521,1,SBAR+S+VP,PRT,VP+VERB,0.1
522,63,SQ,VERB,VP,0.030274
523,1,SQ,ADVP+ADV,VP+VERB,0.000481


# Cache

In [21]:
from collections import defaultdict

mle = defaultdict(lambda: defaultdict(int))

for count, lhs, one, two, maximum_likelihood_estimate in binary_rules.values:
    mle[lhs][(one, two)] = maximum_likelihood_estimate

In [44]:
assert mle['VP'][('VERB', 'SBAR+VP')] == binary_rules.MLE.values[0]
assert mle['VP'][('VERB', 'NP+PRON')] == binary_rules.MLE.values[1]
assert mle['SBAR+S+VP'][('PRT', 'VP+VERB')] == binary_rules.MLE.values[2]
assert mle['SQ'][('VERB', 'VP')] == binary_rules.MLE.values[3]
assert mle['SQ'][('ADVP+ADV', 'VP+VERB')] == binary_rules.MLE.values[4]

In [37]:
binary_rules.MLE.values[0]

0.0012453300124533001

# $q_\text{MLE}(X \rightarrow w) = \frac{\text{count}(X \rightarrow w)}{\text{count}(X)}$

In [24]:
import pandas as pd

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'Constituent', 'Token'])

unary_rules = df[df['Rule Type'] == 'UNARYRULE']

In [25]:
fn = lambda row: row['Count'] / float(nonterminal_count[row['Constituent']])
unary_rules['MLE'] = unary_rules.apply(fn, axis=1)
del unary_rules['Rule Type']
unary_rules

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


Unnamed: 0,Count,Constituent,Token,MLE
62,7,.,_RARE_,0.003445
63,1,PRT,_RARE_,0.003571
64,653,WHNP+PRON,What,0.799266
65,3,NOUN,British,0.000779
66,25,ADP,as,0.024752
67,6,NOUN,horse,0.001558
68,1,NP+NOUN,Washington,0.001524
69,7,NOUN,war,0.001817
70,3,ADJ,Great,0.003275
71,19,VERB,get,0.008089


# Cache Unary Rules

In [26]:
for count, constituent, token, maximum_likelihood_estimate in unary_rules.values:
    mle[constituent][token] = maximum_likelihood_estimate

In [11]:
mle

defaultdict(<function <lambda> at 0xb31b144>, {'NP+ADVP': defaultdict(<type 'int'>, {('.', 'ADVP'): 1.0}), 'VP+VERB': defaultdict(<type 'int'>, {'be': 0.003937007874015748, 'play': 0.007874015748031496, 'founded': 0.003937007874015748, 'born': 0.09055118110236221, 'cost': 0.007874015748031496, 'buried': 0.011811023622047244, 'have': 0.027559055118110236, 'fear': 0.003937007874015748, 'open': 0.011811023622047244, 'find': 0.003937007874015748, 'use': 0.007874015748031496, 'built': 0.027559055118110236, 'located': 0.01968503937007874, 'make': 0.003937007874015748, 'write': 0.003937007874015748, 'start': 0.003937007874015748, 'live': 0.011811023622047244, 'do': 0.015748031496062992, 'begin': 0.015748031496062992, 'used': 0.003937007874015748, 'run': 0.007874015748031496, 'form': 0.007874015748031496, 'get': 0.003937007874015748, 'painted': 0.003937007874015748, '_RARE_': 0.41732283464566927, 'invented': 0.031496062992125984, 'killed': 0.003937007874015748, 'died': 0.011811023622047244, 'm

In [None]:
del unary_rules['Rule Type']

In [19]:
unary_rules

Unnamed: 0,Count,Constituent,Token,MLE
62,7,.,_RARE_,0.003445
63,1,PRT,_RARE_,0.003571
64,653,WHNP+PRON,What,0.799266
65,3,NOUN,British,0.000779
66,25,ADP,as,0.024752
67,6,NOUN,horse,0.001558
68,1,NP+NOUN,Washington,0.001524
69,7,NOUN,war,0.001817
70,3,ADJ,Great,0.003275
71,19,VERB,get,0.008089


# CKY

In [10]:
token = 'the'

# Compute $\pi(R, i, i) = $ For Every Word $w$

In [21]:
for count, constituent, token, mle_estimate in unary_rules.values:
    print count, constituent, token, mle_estimate

7 . _RARE_ 0.00344488188976
1 PRT _RARE_ 0.00357142857143
653 WHNP+PRON What 0.799265605875
3 NOUN British 0.000778816199377
25 ADP as 0.0247524752475
6 NOUN horse 0.00155763239875
1 NP+NOUN Washington 0.0015243902439
7 NOUN war 0.00181723779855
3 ADJ Great 0.00327510917031
19 VERB get 0.00808854831843
2 ADV about 0.00819672131148
3 PRON what 0.041095890411
1 NP+VERB called 0.5
1 DET that 0.000553097345133
14 ADJ much 0.0152838427948
18 WHNP+DET that 0.782608695652
5 NOUN house 0.00129802699896
4 ADV how 0.016393442623
6 ADJ different 0.00655021834061
42 NOUN year 0.0109034267913
4 NOUN animal 0.00103842159917
13 VERB have 0.00553426990209
5 ADJ last 0.00545851528384
6 NOUN language 0.00155763239875
1 NP+NOUN school 0.0015243902439
1 NP+NOUN color 0.0015243902439
3 . < 0.00147637795276
12 WHADVP+ADV when 0.0398671096346
31 ADVP+ADV _RARE_ 0.815789473684
5 NP+NOUN Earth 0.00762195121951
1 VERB mean 0.000425713069391
7 NOUN countries 0.00181723779855
6 ADJ high 0.00655021834061
5 NOUN ye