# Cache Non-Terminal Counts

In [270]:
import pandas as pd
from collections import defaultdict

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'Rule'])

nonterminal_count = defaultdict(int)
for count, _, nonterminal in df[df['Rule Type'] == 'NONTERMINAL'].values:
    nonterminal_count[nonterminal] = count

# $q_\text{MLE}(X \rightarrow Y_1Y_2 | X) = \frac{\text{count}(X \rightarrow Y_1Y_2)}{\text{count}(X)}$

In [271]:
import pandas as pd

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'LHS', 'One', 'Two'])

binary_rules = df[df['Rule Type'] == 'BINARYRULE']

In [272]:
fn = lambda row: row['Count'] / float(nonterminal_count[row['LHS']])
binary_rules['MLE'] = binary_rules.apply(fn, axis=1)
del binary_rules['Rule Type']

binary_rules.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


Unnamed: 0,Count,LHS,One,Two,MLE
519,1,VP,VERB,SBAR+VP,0.001245
520,4,VP,VERB,NP+PRON,0.004981
521,1,SBAR+S+VP,PRT,VP+VERB,0.1
522,63,SQ,VERB,VP,0.030274
523,1,SQ,ADVP+ADV,VP+VERB,0.000481


# Cache

In [273]:
from collections import defaultdict

mle = defaultdict(float)

In [274]:
for count, lhs, one, two, maximum_likelihood_estimate in binary_rules.values:
    mle[(lhs, (one, two))] = maximum_likelihood_estimate

In [275]:
assert mle[('VP', ('VERB', 'SBAR+VP'))] == binary_rules.MLE.values[0]
assert mle[('VP', ('VERB', 'NP+PRON'))] == binary_rules.MLE.values[1]
assert mle[('SBAR+S+VP', ('PRT', 'VP+VERB'))] == binary_rules.MLE.values[2]
assert mle[('SQ', ('VERB', 'VP'))] == binary_rules.MLE.values[3]
assert mle[('SQ', ('ADVP+ADV', 'VP+VERB'))] == binary_rules.MLE.values[4]

# $q_\text{MLE}(X \rightarrow w) = \frac{\text{count}(X \rightarrow w)}{\text{count}(X)}$

In [276]:
import pandas as pd

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'Constituent', 'Token'])

unary_rules = df[df['Rule Type'] == 'UNARYRULE']

In [277]:
fn = lambda row: row['Count'] / float(nonterminal_count[row['Constituent']])
unary_rules['MLE'] = unary_rules.apply(fn, axis=1)
del unary_rules['Rule Type']
unary_rules.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


Unnamed: 0,Count,Constituent,Token,MLE
62,7,.,_RARE_,0.003445
63,1,PRT,_RARE_,0.003571
64,653,WHNP+PRON,What,0.799266
65,3,NOUN,British,0.000779
66,25,ADP,as,0.024752


# Cache Unary Rules

In [278]:
for count, constituent, token, maximum_likelihood_estimate in unary_rules.values:
    mle[(constituent, token)] = maximum_likelihood_estimate

In [279]:
assert mle[('.', '_RARE_')] == unary_rules.MLE.values[0]
assert mle[('PRT', '_RARE_')] == unary_rules.MLE.values[1]
assert mle[('WHNP+PRON', 'What')] == unary_rules.MLE.values[2]
assert mle[('NOUN', 'British')] == unary_rules.MLE.values[3]
assert mle[('ADP', 'as')] == unary_rules.MLE.values[4]

# Compute Rare Words and All Words

In [280]:
import pandas as pd

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'Rule', 'Token'])

unaries = df[df['Rule Type'] == 'UNARYRULE']

from collections import defaultdict

counts = defaultdict(int)

for count, token in unaries[['Count', 'Token']].values:
    counts[token] += int(count)
    
rares = set(token for token, count in counts.items() if count < 5)
all_words = set(ant for noterminal, ant in mle if not isinstance(ant, tuple))

# Get All Nonterminals and Binary Rules

In [281]:
nonterminals = nonterminal_count.keys()
binaries = [(lhs, (one, two)) for count, lhs, one, two, maximum_likelihood_estimate in binary_rules.values ]

# CKY Sketch

In [285]:
sentence = 'bijeezus ?'

# Split the sentence and map rare words to _RARE_
sentence = ['_RARE_' if token not in all_words or token in rares else token for token in sentence.split()]

from collections import defaultdict

highest_prob, backpointer = defaultdict(float), defaultdict(lambda: None)

# Base case
print 'Base case initialization...'
print
for i, token in enumerate(sentence):
    for constituent in nonterminals:
        if mle[(constituent, token)]:
            print 'pi(i={}, i={}, X={}) = mle(X={} -> w={}) = {}'.format(i, i, constituent, constituent, token, mle[(constituent, token)])
        highest_prob[(i, i, constituent)] = mle[(constituent, token)]

print
print
print 'Recursive steps...'
print
for gap in range(1, len(sentence)):
    print 'gap = {}'.format(gap)
    for i, token in enumerate(sentence[:-gap]):
        print '    i = {}'.format(i)
        j = i + gap
        print '        j = {}'.format(j)
        for split_point in range(i, j):
            print '            split point = {}'.format(split_point)
            for X, (Y1, Y2) in binaries:
                m = mle[(X, (Y1, Y2))] * highest_prob[(i, split_point, Y1)] * highest_prob[(split_point+1, j, Y2)]

                # Non-zero probability?
                if m:          
                    print
                    print '                Computing pi("{} | {}", {} -> {} {})...'.format(' '.join(sentence[i:split_point+1]), ' '.join(sentence[split_point+1:j+1]), X, Y1, Y2)
                    print '                ============================================================'
                    print '                pi(i={}, j={}, X={}) = mle({} -> {} {}) *'.format(i, j, X, X, Y1, Y2)
                    print '                                   pi(i={}, s={}, Y1={}) *'.format(i, split_point, Y1)
                    print '                                   pi(s+1={}, j={}, Y2={})...'.format(split_point+1, j ,Y2)
                    print
                    print '                mle({} -> {} {}) = {}'.format(X, Y1, Y2, mle[(X, (Y1, Y2))])
                    print '                pi(i={}, s={}, Y_1={}) = {}'.format(i, split_point, Y1, highest_prob[(i, split_point, Y1)])
                    print '                pi(s+1={}, j={}, Y_2={}) = {}'.format(split_point+1, j, Y2, highest_prob[(split_point+1, j, Y2)])
                    print
                    print '                Total = {}'.format(m)
                    print
                    print '                Previously pi(i={}, j={}, X={}) = {}'.format(i, j, X, highest_prob[(i, j, X)])
                    print '                Previously bp(i={}, j={}, X={}) = {}'.format(i, j, X, backpointer[(i, j, X)])
                    print
                    
                    if m > highest_prob[(i, j, X)]:
                        highest_prob[(i, j, X)] = m
                        backpointer[(i, j, X)] = ((Y1, Y2), split_point)
                    
                    print '                Now        pi(i={}, j={}, X={}) = {}'.format(i, j, X, highest_prob[(i, j, X)])
                    print '                Now        bp(i={}, j={}, X={}) = {}'.format(i, j, X, backpointer[(i, j, X)])
                    print '                ============================================================'
                    print


def reconstruct(i, j, constituent):
    """"Follow backpointers to reconstruct parse tree"""
    if i == j:
        return [constituent, sentence[i]]
    
    else:
        (y, z), split_point = backpointer[(i, j, constituent)]
        return [constituent, reconstruct(i, split_point, y), reconstruct(split_point+1, j, z)]

reconstruct(0, len(sentence)-1, 'SBARQ')

Base case initialization...

pi(i=0, i=0, X=VP+VERB) = mle(X=VP+VERB -> w=_RARE_) = 0.417322834646
pi(i=0, i=0, X=DET) = mle(X=DET -> w=_RARE_) = 0.00940265486726
pi(i=0, i=0, X=S+VP+VERB) = mle(X=S+VP+VERB -> w=_RARE_) = 1.0
pi(i=0, i=0, X=NP+X) = mle(X=NP+X -> w=_RARE_) = 1.0
pi(i=0, i=0, X=CONJ) = mle(X=CONJ -> w=_RARE_) = 0.0294117647059
pi(i=0, i=0, X=WHADVP+ADV) = mle(X=WHADVP+ADV -> w=_RARE_) = 0.00664451827243
pi(i=0, i=0, X=NOUN) = mle(X=NOUN -> w=_RARE_) = 0.701453790239
pi(i=0, i=0, X=NP+ADJ) = mle(X=NP+ADJ -> w=_RARE_) = 0.75
pi(i=0, i=0, X=PRT+PRT) = mle(X=PRT+PRT -> w=_RARE_) = 0.259259259259
pi(i=0, i=0, X=PRT) = mle(X=PRT -> w=_RARE_) = 0.00357142857143
pi(i=0, i=0, X=.) = mle(X=. -> w=_RARE_) = 0.00344488188976
pi(i=0, i=0, X=PP+NOUN) = mle(X=PP+NOUN -> w=_RARE_) = 1.0
pi(i=0, i=0, X=NUM) = mle(X=NUM -> w=_RARE_) = 0.73786407767
pi(i=0, i=0, X=NP+NUM) = mle(X=NP+NUM -> w=_RARE_) = 0.94
pi(i=0, i=0, X=NP+DET) = mle(X=NP+DET -> w=_RARE_) = 0.04
pi(i=0, i=0, X=ADV) = mle(

['SBARQ', ['.', '_RARE_'], ['.', '?']]

# CKY

In [283]:
def cky(sentence, mle, rares, all_words):
    """"CKY algorithm for parsing a sentence"""
    
    # Split the sentence and map rare words to _RARE_
    sentence = ['_RARE_' if token not in all_words or token in rares else token for token in sentence.split()]
    
    from collections import defaultdict
    highest_prob, backpointer = defaultdict(float), defaultdict(lambda: None)

    # Base case
    for i, token in enumerate(sentence):
        for constituent in nonterminals:
            highest_prob[(i, i, constituent)] = mle[(constituent, token)]
    
    # Recursion
    for gap in range(1, len(sentence)):
        for i, token in enumerate(sentence[:-gap]):
            j = i + gap
            for split_point in range(i, j):
                for X, (Y1, Y2) in binaries:
                    m = mle[(X, (Y1, Y2))] * highest_prob[(i, split_point, Y1)] * highest_prob[(split_point+1, j, Y2)]
                    # Non-zero probability?
                    if m:          
                        if m > highest_prob[(i, j, X)]:
                            highest_prob[(i, j, X)] = m
                            backpointer[(i, j, X)] = ((Y1, Y2), split_point)

                        
    def reconstruct(i, j, constituent):
        """"Follow backpointers to reconstruct parse tree"""
        if i == j:
            return [constituent, sentence[i]]

        else:
            (y, z), split_point = backpointer[(i, j, constituent)]
            return [constituent, reconstruct(i, split_point, y), reconstruct(split_point+1, j, z)]
        

    return reconstruct(0, len(sentence)-1, 'SBARQ')

In [284]:
# cky('the dog bijeezus ?', mle, rares, all_words)