# Cache Non-Terminal Counts

In [1]:
import pandas as pd
from collections import defaultdict

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'Rule'])

nonterminal_count = defaultdict(int)
for count, _, nonterminal in df[df['Rule Type'] == 'NONTERMINAL'].values:
    nonterminal_count[nonterminal] = count

# $q_\text{MLE}(X \rightarrow Y_1Y_2 | X) = \frac{\text{count}(X \rightarrow Y_1Y_2)}{\text{count}(X)}$

In [2]:
import pandas as pd

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'LHS', 'One', 'Two'])

binary_rules = df[df['Rule Type'] == 'BINARYRULE']

In [3]:
fn = lambda row: row['Count'] / float(nonterminal_count[row['LHS']])
binary_rules['MLE'] = binary_rules.apply(fn, axis=1)
del binary_rules['Rule Type']

binary_rules.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


Unnamed: 0,Count,LHS,One,Two,MLE
519,1,VP,VERB,SBAR+VP,0.001245
520,4,VP,VERB,NP+PRON,0.004981
521,1,SBAR+S+VP,PRT,VP+VERB,0.1
522,63,SQ,VERB,VP,0.030274
523,1,SQ,ADVP+ADV,VP+VERB,0.000481


# Cache

In [4]:
from collections import defaultdict

mle = defaultdict(float)

In [5]:
for count, lhs, one, two, maximum_likelihood_estimate in binary_rules.values:
    mle[(lhs, (one, two))] = maximum_likelihood_estimate

In [6]:
assert mle[('VP', ('VERB', 'SBAR+VP'))] == binary_rules.MLE.values[0]
assert mle[('VP', ('VERB', 'NP+PRON'))] == binary_rules.MLE.values[1]
assert mle[('SBAR+S+VP', ('PRT', 'VP+VERB'))] == binary_rules.MLE.values[2]
assert mle[('SQ', ('VERB', 'VP'))] == binary_rules.MLE.values[3]
assert mle[('SQ', ('ADVP+ADV', 'VP+VERB'))] == binary_rules.MLE.values[4]

# $q_\text{MLE}(X \rightarrow w) = \frac{\text{count}(X \rightarrow w)}{\text{count}(X)}$

In [7]:
import pandas as pd

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'Constituent', 'Token'])

unary_rules = df[df['Rule Type'] == 'UNARYRULE']

In [8]:
fn = lambda row: row['Count'] / float(nonterminal_count[row['Constituent']])
unary_rules['MLE'] = unary_rules.apply(fn, axis=1)
del unary_rules['Rule Type']
unary_rules.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


Unnamed: 0,Count,Constituent,Token,MLE
62,7,.,_RARE_,0.003445
63,1,PRT,_RARE_,0.003571
64,653,WHNP+PRON,What,0.799266
65,3,NOUN,British,0.000779
66,25,ADP,as,0.024752


# Cache Unary Rules

In [9]:
for count, constituent, token, maximum_likelihood_estimate in unary_rules.values:
    mle[(constituent, token)] = maximum_likelihood_estimate

In [10]:
assert mle[('.', '_RARE_')] == unary_rules.MLE.values[0]
assert mle[('PRT', '_RARE_')] == unary_rules.MLE.values[1]
assert mle[('WHNP+PRON', 'What')] == unary_rules.MLE.values[2]
assert mle[('NOUN', 'British')] == unary_rules.MLE.values[3]
assert mle[('ADP', 'as')] == unary_rules.MLE.values[4]

# Compute Rare Words and All Words

In [11]:
import pandas as pd

df = pd.read_csv('cfg.counts', sep=' ', names=['Count', 'Rule Type', 'Rule', 'Token'])

unaries = df[df['Rule Type'] == 'UNARYRULE']

from collections import defaultdict

counts = defaultdict(int)

for count, token in unaries[['Count', 'Token']].values:
    counts[token] += int(count)
    
rares = set(token for token, count in counts.items() if count < 5)
all_words = set(ant for noterminal, ant in mle if not isinstance(ant, tuple))

# Get All Nonterminals and Binary Rules

In [12]:
nonterminals = nonterminal_count.keys()
binaries = [(lhs, (one, two)) for count, lhs, one, two, maximum_likelihood_estimate in binary_rules.values ]

# CKY Sketch

In [25]:
sentence = 'called ?'

# Split the sentence and map rare words to _RARE_
sentence = ['_RARE_' if token not in all_words or token in rares else token for token in sentence.split()]

from collections import defaultdict

highest_prob, backpointer = defaultdict(float), defaultdict(lambda: None)

# Base case
print 'Base case initialization...'
print
for i, token in enumerate(sentence):
    for constituent in nonterminals:
        if mle[(constituent, token)]:
            print 'pi(i={}, i={}, X={}) = mle(X={} -> w={}) = {}'.format(i, i, constituent, constituent, token, mle[(constituent, token)])
        highest_prob[(i, i, constituent)] = mle[(constituent, token)]

print
print
print 'Recursive steps...'
print
for gap in range(1, len(sentence)):
    print 'gap = {}'.format(gap)
    for i, token in enumerate(sentence[:-gap]):
        print '    i = {}'.format(i)
        j = i + gap
        print '        j = {}'.format(j)
        for split_point in range(i, j):
            print '            split point = {}'.format(split_point)
            for X, (Y1, Y2) in binaries:
                m = mle[(X, (Y1, Y2))] * highest_prob[(i, split_point, Y1)] * highest_prob[(split_point+1, j, Y2)]

                # Non-zero probability?
                if m:          
                    print
                    print '                Computing pi("{} | {}", {} -> {} {})...'.format(' '.join(sentence[i:split_point+1]), ' '.join(sentence[split_point+1:j+1]), X, Y1, Y2)
                    print '                ============================================================'
                    print '                pi(i={}, j={}, X={}) = mle({} -> {} {}) *'.format(i, j, X, X, Y1, Y2)
                    print '                                   pi(i={}, s={}, Y1={}) *'.format(i, split_point, Y1)
                    print '                                   pi(s+1={}, j={}, Y2={})...'.format(split_point+1, j ,Y2)
                    print
                    print '                mle({} -> {} {}) = {}'.format(X, Y1, Y2, mle[(X, (Y1, Y2))])
                    print '                pi(i={}, s={}, Y_1={}) = {}'.format(i, split_point, Y1, highest_prob[(i, split_point, Y1)])
                    print '                pi(s+1={}, j={}, Y_2={}) = {}'.format(split_point+1, j, Y2, highest_prob[(split_point+1, j, Y2)])
                    print
                    print '                Total = {}'.format(m)
                    print
                    print '                Previously pi(i={}, j={}, X={}) = {}'.format(i, j, X, highest_prob[(i, j, X)])
                    print '                Previously bp(i={}, j={}, X={}) = {}'.format(i, j, X, backpointer[(i, j, X)])
                    print
                    
                    if m > highest_prob[(i, j, X)]:
                        highest_prob[(i, j, X)] = m
                        backpointer[(i, j, X)] = ((Y1, Y2), split_point)
                    
                    print '                Now        pi(i={}, j={}, X={}) = {}'.format(i, j, X, highest_prob[(i, j, X)])
                    print '                Now        bp(i={}, j={}, X={}) = {}'.format(i, j, X, backpointer[(i, j, X)])
                    print '                ============================================================'
                    print


def reconstruct(i, j, constituent):
    """"Follow backpointers to reconstruct parse tree"""
    if i == j:
        return [constituent, sentence[i]]
    
    else:
        (y, z), split_point = backpointer[(i, j, constituent)]
        return [constituent, reconstruct(i, split_point, y), reconstruct(split_point+1, j, z)]

reconstruct(0, len(sentence)-1, 'SBARQ')

Base case initialization...

pi(i=0, i=0, X=VP+VERB) = mle(X=VP+VERB -> w=called) = 0.0708661417323
pi(i=0, i=0, X=VERB) = mle(X=VERB -> w=called) = 0.00595998297148
pi(i=0, i=0, X=NP+VERB) = mle(X=NP+VERB -> w=called) = 0.5
pi(i=1, i=1, X=.) = mle(X=. -> w=?) = 0.834153543307


Recursive steps...

gap = 1
    i = 0
        j = 1
            split point = 0

                Computing pi("called | ?", VP -> VERB .)...
                pi(i=0, j=1, X=VP) = mle(VP -> VERB .) *
                                   pi(i=0, s=0, Y1=VERB) *
                                   pi(s+1=1, j=1, Y2=.)...

                mle(VP -> VERB .) = 0.00124533001245
                pi(i=0, s=0, Y_1=VERB) = 0.00595998297148
                pi(s+1=1, j=1, Y_2=.) = 0.834153543307

                Total = 6.19120910798e-06

                Previously pi(i=0, j=1, X=VP) = 0.0
                Previously bp(i=0, j=1, X=VP) = None

                Now        pi(i=0, j=1, X=VP) = 6.19120910798e-06
                Now  

TypeError: 'NoneType' object is not iterable

# CKY

In [21]:
def cky(sentence, mle, rares, all_words):
    """"CKY algorithm for parsing a sentence"""
    
    # Split the sentence and map rare words to _RARE_
    sentence = ['_RARE_' if token not in all_words or token in rares else token for token in sentence.split()]
    
    from collections import defaultdict
    highest_prob, backpointer = defaultdict(float), defaultdict(lambda: None)

    # Base case
    for i, token in enumerate(sentence):
        for constituent in nonterminals:
            highest_prob[(i, i, constituent)] = mle[(constituent, token)]
    
    # Recursion
    for gap in range(1, len(sentence)):
        for i, token in enumerate(sentence[:-gap]):
            j = i + gap
            for split_point in range(i, j):
                for X, (Y1, Y2) in binaries:
                    m = mle[(X, (Y1, Y2))] * highest_prob[(i, split_point, Y1)] * highest_prob[(split_point+1, j, Y2)]
                    # Non-zero probability?
                    if m:          
                        if m > highest_prob[(i, j, X)]:
                            highest_prob[(i, j, X)] = m
                            backpointer[(i, j, X)] = ((Y1, Y2), split_point)

                        
    def reconstruct(i, j, constituent):
        """"Follow backpointers to reconstruct parse tree"""
        if i == j:
            return [constituent, sentence[i]]

        else:
            (y, z), split_point = backpointer[(i, j, constituent)]
            return [constituent, reconstruct(i, split_point, y), reconstruct(split_point+1, j, z)]
        

    return reconstruct(0, len(sentence)-1, 'SBARQ')

# Evaluate on Development Set

In [23]:
with open('parse_dev.dat', 'r') as f:
    questions = [line.strip() for line in f.readlines()]

parses = [cky(question, mle, rares, all_words) for question in questions]

In [None]:
import json

with open('parse_dev.out', 'w') as f:
    for parse in parses:
        json.dump(parse, f)
        f.write('\n')

# Check Yourself

In [None]:
!python eval_parser.py parse_dev.key parse_dev.out