In [93]:
import pandas as pd
import json
import nltk
from nltk.grammar import CFG, Nonterminal, Production
from nltk.parse import EarleyChartParser
# nltk.download('averaged_perceptron_tagger')  
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/glora/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

# Derivative Decomposition of Complex Prepositional Phrases

## Step 1: initialize master lexicon

In [94]:
master_pp_lexicon = {}
# Populate from atomic_p.json: Iterate through every entry in atomic_p.json. For each preposition (e.g., "behind"), add it as a key to master_lexicon and its full feature object as the value.
with open('pp_lexicon/atomic_p.json', 'r') as f:
    atomic_p = json.load(f)
    for preposition, features in atomic_p.items():
        master_pp_lexicon[preposition] = features

In [95]:
# Populate from p_lexicon.json: Iterate through every entry in p_lexicon.json.
# for each key, value pair if the key already exists in master_lexicon, overwrite the value with the new value from p_lexicon.json.
with open('pp_lexicon/p_lexicon.json', 'r') as f:
    p_lexicon = json.load(f)
    for preposition, features in p_lexicon.items():
        if preposition in master_pp_lexicon:
            master_pp_lexicon[preposition] = features
        else:
            master_pp_lexicon[preposition] = features

In [96]:
len(master_pp_lexicon)

79

In [97]:
master_pp_lexicon.keys()

dict_keys(['behind', 'above', 'below', 'beyond', 'in front of', 'inside', 'outside', 'left', 'right', 'against', 'among', 'beside', 'between', 'near', 'next to', 'upon', 'across', 'along', 'around', 'over', 'past', 'through', 'under', 'up', 'down', 'on', 'off', 'in', 'out', 'away', 'top', 'astride', 'corner', 'by', 'edge', 'after', 'but', 'end', 'for', 'base', 'higher', 'high', 'bottom', 'via', 'back', 'at', 'astern', 'following', 'center', 'prior', 'front', 'before', 'opposite', 'from', 'beneath', 'with', 'rear', 'apart', 'next', 'side', 'of', 'far', 'amid', 'skin', 'flank', 'part', 'to', 'board', 'surface', 'a', 'amidst', 'middle', 'rim', 'face', 'close', 'core', 'foot', 'subsequent', 'heart'])

## Step 2: add complex preposition list

In [98]:
df_pp_wordnet = pd.read_csv('dictionaries/pp_wordnet_dict_wiki_pop_fix.csv', sep=',')
pp_wordnet_wiki = list(df_pp_wordnet['preposition'])
# # drop nan in pp_wordnet_wiki
pp_wordnet_wiki = [x for x in pp_wordnet_wiki if x is not None]


In [99]:
complex_pp_list = []
# check pp_wordnet_wiki against master_pp_lexicon keys
for preposition in pp_wordnet_wiki:
    if preposition not in master_pp_lexicon:
        complex_pp_list.append(preposition)

# remove nan type() float in complex_pp_list
complex_pp_list = [x for x in complex_pp_list if type(x) is str]
complex_pp_list = list(set(complex_pp_list))  # remove duplicates
complex_pp_list.sort()
len(complex_pp_list)

110

In [100]:
# add empty p_lexicon.json entry to complex_pp_list
counter = 0
for key, entry in p_lexicon.items():
    if entry["isAtomicMorph"] == "" and entry["class"] == "" and entry["path_p_morphology"] == "" and entry["measure_allowed"] == "" and entry["spellOutHEAD"] == [""]:
        complex_pp_list.append(key)
        counter += 1

print(counter)

0


In [101]:
with open('pp_lexicon/complex_pp_repop.json', 'w') as f:
    json.dump(complex_pp_list, f, indent=4)
    
len(complex_pp_list)

110

In [102]:
if 'the' in complex_pp_list:
    complex_pp_list.remove('the')
complex_pp_list[-10:]

['until',
 'up to',
 'upside',
 'versus',
 'with a view to',
 'with regard to',
 'with respect to',
 'within',
 'without',
 'worth']

In [103]:
decomposed_result = {} # dictionary to store deomposition results

In [104]:
with open('pp_lexicon/axial_words.json', 'r') as f:
    axial_words = json.load(f)

for ax in axial_words:
    if ax not in master_pp_lexicon.keys():
        print(ax)

underside
cross


### get empty list lexicon in p_lexicon.json

In [105]:
# get empty or "" entries from p_lexicon.json
empty_entries = []
for key, entry in p_lexicon.items():
    if entry["isAtomicMorph"] == "" and entry["class"] == "" and entry["path_p_morphology"] == "" and entry["measure_allowed"] == "" and entry["spellOutHEAD"] == [""]:
        empty_entries.append(key)
    
print(len(empty_entries))

0


In [106]:
# copy p_lexicon.json to dictionary
filled_p_lexicon = {}

for key, entry in p_lexicon.items():
    if entry["isAtomicMorph"] == "" and entry["class"] == "" and entry["path_p_morphology"] == "" and entry["measure_allowed"] == "" and entry["spellOutHEAD"] == [""]:
        continue
    else:
        filled_p_lexicon[key] = entry
        
len(filled_p_lexicon.keys())

# with open('pp_lexicon/p_lexicon.json', 'w') as f:
#     json.dump(filled_p_lexicon, f, indent=4)


49

## Step 3: Decomposition Logic

In [107]:
from abstract_domain_classes import PPFactory

In [108]:
PATH={'to','from','into','onto','through','across','toward','past'}
PLACE={'in','on','at','under','beside','near','between','among'}
factory = PPFactory('pp_lexicon/atomic_p.json', 'pp_lexicon/p_lexicon.json', 'pp_lexicon/complex_pp_repop.json', PATH, PLACE)
factory.create_classes()
# Export the augmented lexicon
decomposed_result = factory.export_complex_pp()
decomposed_result


{'aboard': {'isAtomicMorph': False,
  'class': ['BOUNDED'],
  'spellOutHEAD': ['*K',
   {'AxPart': '[SURFACE]'},
   '*Deg',
   '*Proj',
   '*p',
   '*Path'],
  'path_p_morphology': ['LOC'],
  'measure_allowed': False},
 'about': {'isAtomicMorph': False,
  'class': ['NOT_SPATIAL'],
  'spellOutHEAD': [],
  'path_p_morphology': ['none'],
  'measure_allowed': False,
  'unlexicalized': ['bout']},
 'absent': {'isAtomicMorph': False,
  'class': ['NOT_SPATIAL'],
  'spellOutHEAD': [],
  'path_p_morphology': ['none'],
  'measure_allowed': False,
  'unlexicalized': ['bsent']},
 'according to': {'isAtomicMorph': False,
  'class': ['EXTENDED'],
  'spellOutHEAD': ['*K', '*AxPart', '*Deg', '*Proj', '*p', {'Path': 'to'}],
  'path_p_morphology': ['GOAL'],
  'measure_allowed': True,
  'unlexicalized': ['ccording']},
 'adjacent': {'isAtomicMorph': False,
  'class': ['NOT_SPATIAL'],
  'spellOutHEAD': [],
  'path_p_morphology': ['none'],
  'measure_allowed': False,
  'unlexicalized': ['djacent']},
 'adjace

In [109]:
unlexicalized_count = 0
for entry in decomposed_result.values():
    unlex = entry.get('unlexicalized')  
    if unlex:                           
        unlexicalized_count += 1

print(f"Number of unlexicalized entries: {unlexicalized_count}")

keys_with_unlex = [
    k
    for k, entry in decomposed_result.items()
    if entry.get("unlexicalized")
]

print(keys_with_unlex)

Number of unlexicalized entries: 65
['about', 'absent', 'according to', 'adjacent', 'adjacent to', 'ahead', 'ahead of', 'as opposed to', 'as per', 'as regards', 'as soon as', 'as well as', 'at least', 'at most', 'at the back of', 'at the behest of', 'at the rear of', 'because of', 'betwixt', 'by means of', 'by virtue of', 'circa', 'concerning', 'despite', 'due to', 'during', 'except', 'except for', 'for the sake of', 'in a higher place', 'in accordance with', 'in addition to', 'in case of', 'in lieu of', 'in place of', 'in point of', 'in spite of', 'including', 'instead of', 'less', 'like', 'minus', 'nearest', 'nigh', 'notwithstanding', 'on account of', 'on behalf of', 'per', 'plus', 'pursuant to', 'rather than', 'regardless of', 'save', 'since', 'such as', 'thanks to', 'to a higher place', 'toward', 'towards', 'until', 'versus', 'with a view to', 'with regard to', 'with respect to', 'worth']


## Step 4: Acquire Spatial Sentences Dataset

### Refclef dataset

In [110]:
df_refer = pd.read_csv('dataset/refer/referitdataset/refclef_unc_flattened.csv')
df_refer

Unnamed: 0,sent_ids_list,ann_id,ref_id,image_id,split,category_id,tokens,raw,sent_id,sent
0,0,19135_1,0,19135,train,60,sky,sky,0,sky
1,1,19135_2,1,19135,train,235,statue,statue,1,statue
2,2,23412_4,2,23412,train,258,"anywhere,except,the,people",anywhere except the people,2,anywhere except the people
3,3,23412_1,3,23412,train,160,"person,in,front",person in front,3,person in front
4,4,23412_2,4,23412,train,120,"person,all,the,way,in,back",person all the way in back,4,person all the way in back
...,...,...,...,...,...,...,...,...,...,...
130359,130358130359,7380_2,99291,7380,train,96,"two,birds,on,left",two birds on left,130359,two birds on left
130360,130360,7380_5,99292,7380,train,182,"palm,tree,to,the,right",palm tree to the right,130360,palm tree to the right
130361,130361,7380_4,99293,7380,train,182,"tree,on,left,side",tree on left side,130361,tree on left side
130362,130362,38047_4,99294,38047,train,34,"bush,bottom,left",bush bottom left,130362,bush bottom left


In [111]:
df_refer['sent']

0                                sky
1                             statue
2         anywhere except the people
3                    person in front
4         person all the way in back
                     ...            
130359             two birds on left
130360        palm tree to the right
130361             tree on left side
130362              bush bottom left
130363                           sky
Name: sent, Length: 130364, dtype: object

#### Inspect most frequent tokens

In [112]:
import nltk
from nltk.corpus import brown

# ensure Brown corpus and the universal tagset are available
nltk.download('brown', quiet=True)
nltk.download('universal_tagset', quiet=True)

# build a set of lower‐cased nouns from the Brown corpus
noun_set = {w.lower() for w, t in brown.tagged_words(tagset='universal') if t == 'NOUN'}

# explode your tokens, drop missing, lowercase them
tokens = df_refer['tokens'].str.split().explode().dropna().str.lower()

# keep only those tokens that are NOT in the Brown corpus noun set
filtered_tokens = tokens[~tokens.isin(noun_set)]

from collections import Counter
most_common_tokens = Counter(filtered_tokens).most_common(20)
print((most_common_tokens))

[('bottom,right,corner', 476), ('bottom,left,corner', 446), ('bottom,right', 413), ('bottom,left', 399), ('the,sky', 386), ('top,left,corner', 347), ('top,right,corner', 338), ('top,right', 291), ('top,left', 271), ('blue,sky', 255), ('bike', 233), ('any,person', 217), ('tree,on,left', 162), ('tree,on,right', 155), ('top,sky', 147), ('tree,left', 138), ('anywhere', 127), ('left,building', 127), ('tree,right', 119), ('sky,top,left', 118)]


In [113]:
most_common_tokens

[('bottom,right,corner', 476),
 ('bottom,left,corner', 446),
 ('bottom,right', 413),
 ('bottom,left', 399),
 ('the,sky', 386),
 ('top,left,corner', 347),
 ('top,right,corner', 338),
 ('top,right', 291),
 ('top,left', 271),
 ('blue,sky', 255),
 ('bike', 233),
 ('any,person', 217),
 ('tree,on,left', 162),
 ('tree,on,right', 155),
 ('top,sky', 147),
 ('tree,left', 138),
 ('anywhere', 127),
 ('left,building', 127),
 ('tree,right', 119),
 ('sky,top,left', 118)]

### Random Sampling for RefClef dataset

In [114]:
# filter df_refer to only sentences longer than 2 tokens
mask = df_refer['sent'].str.split().str.len() > 4
df_refer_filtered = df_refer.loc[mask]
print(f"Sentences longer than 2 tokens: {len(df_refer_filtered)}")

# now sample from the filtered dataframe
# replace n and random_state with your desired values
sampled_df_refer = df_refer_filtered.sample(n=873, random_state=42)
sampled_df_refer


Sentences longer than 2 tokens: 34533


Unnamed: 0,sent_ids_list,ann_id,ref_id,image_id,split,category_id,tokens,raw,sent_id,sent
111206,111206,37864_6,84758,37864,train,160,"person,behind,banner,on,the,right,near,guy,holding,banner,on,right",person behind banner on the right near guy holding banner on right,111206,person behind banner on the right near guy holding banner on right
12482,1248212483,12335_2,9234,12335,train,191,"plant,to,the,left,of,the,head",Plant to the left of the head,12482,plant to the left of the head
104404,104401104402104403104404104405,7020_5,79423,7020,train,88,"5th,man,from,bottom,,,green,cap,face,only","5th man from bottom, green cap face only",104404,"5th man from bottom , green cap face only"
9308,93089309,35840_3,7010,35840,train,31,"tallest,building,in,the,background",tallest building in the background,9308,tallest building in the background
27705,27705,24239_4,20297,24239,train,120,"the,shape,their,bodies,are,forming",the shape their bodies are forming,27705,the shape their bodies are forming
...,...,...,...,...,...,...,...,...,...,...
19324,19324,18518_4,14177,18518,train,119,"old,machine,left,-,big,wheel",old machine left - big wheel,19324,old machine left - big wheel
103721,103721,7003_6,78843,7003,train,204,"dark,gray,rocks,above,lower,green,and,white,stuff",dark gray rocks above lower green and white stuff,103721,dark gray rocks above lower green and white stuff
92840,92840,18280_4,70122,18280,train,110,"left,side,behind,girl,in,black",left side behind girl in black,92840,left side behind girl in black
69787,6978669787,1411_5,52419,1411,train,271,"green,glowing,corner,window,second,floor",green glowing corner window second floor,69787,green glowing corner window second floor


In [115]:
refer_sent = sampled_df_refer['sent'].tolist()
refer_sent[:10]


['person behind banner on the right near guy holding banner on right',
 'plant to the left of the head',
 '5th man from bottom , green cap face only',
 'tallest building in the background',
 'the shape their bodies are forming',
 'top left dark part of sky',
 'green beer can middle , left of tall brown bottle',
 'the candle at very bottom right',
 'the tree at the left',
 'front left girl in blue']

### Acquiring REAL_Corpus Dataset

In [116]:
df_real = pd.read_csv('dataset/REAL_Corpus/REAL_Corpus_ReferringExpressionsData_withValidationDetails.csv', delimiter=';')
df_real.head()

Unnamed: 0,userid,age,gender,photoid,x,y,annotation,status,phrase length,validator_userid,validator_age,validator_gender
0,2,4,male,img23,495,157,Pub called 'Deacon Brodie's Tavern. Black and white traditional building with sign hanging outside and 3 big arched windows,correct,123,26,6,male
1,2,4,male,img23,0,0,Pub called 'Deacon Brodie's Tavern. Black and white traditional building with sign hanging outside and 3 big arched windows,cantfind,123,41,5,female
2,2,4,male,img23,383,164,Pub called 'Deacon Brodie's Tavern. Black and white traditional building with sign hanging outside and 3 big arched windows,correct,123,42,4,female
3,2,4,male,img31,0,0,"Traditional early Victorian terrace on 3 floors + dormer. Third in from the left of the end terrace. Royal Blue Door, Black, spiked iron railings.",ambiguous,147,7,2,female
4,2,4,male,img31,210,421,"Traditional early Victorian terrace on 3 floors + dormer. Third in from the left of the end terrace. Royal Blue Door, Black, spiked iron railings.",correct,147,8,3,female


In [117]:
df_real[df_real['status'] == 'correct'].head()

Unnamed: 0,userid,age,gender,photoid,x,y,annotation,status,phrase length,validator_userid,validator_age,validator_gender
0,2,4,male,img23,495,157,Pub called 'Deacon Brodie's Tavern. Black and white traditional building with sign hanging outside and 3 big arched windows,correct,123,26,6,male
2,2,4,male,img23,383,164,Pub called 'Deacon Brodie's Tavern. Black and white traditional building with sign hanging outside and 3 big arched windows,correct,123,42,4,female
4,2,4,male,img31,210,421,"Traditional early Victorian terrace on 3 floors + dormer. Third in from the left of the end terrace. Royal Blue Door, Black, spiked iron railings.",correct,147,8,3,female
5,2,4,male,img31,82,371,"Traditional early Victorian terrace on 3 floors + dormer. Third in from the left of the end terrace. Royal Blue Door, Black, spiked iron railings.",correct,147,9,2,male
6,2,4,male,img21,222,305,"Large, modern glass fronted building, butted up against traditional victorian terrace, slightly set back from road, and with facing bowed frontage.",correct,147,6,2,male


In [118]:
# keep the first row for each unique annotation,
# but carry along every other column
df_real_unique = df_real.drop_duplicates(subset=['annotation'], keep='first') \
                        .reset_index(drop=True)
df_real_unique.iloc[0]

userid                                                                                                                                        2
age                                                                                                                                           4
gender                                                                                                                                     male
photoid                                                                                                                                   img23
x                                                                                                                                           495
y                                                                                                                                           157
annotation          Pub called 'Deacon Brodie's Tavern. Black and white traditional building with sign hanging outside and 3 big arched 

In [119]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

df_real_unique['annotation'].head(20)

0                             Pub called 'Deacon Brodie's Tavern. Black and white traditional building with sign hanging outside and 3 big arched windows
1     Traditional early Victorian terrace on 3 floors + dormer. Third in from the left of the end terrace.  Royal Blue Door, Black, spiked iron railings.
2     Large, modern glass fronted building, butted up against traditional victorian terrace, slightly set back from road, and with facing bowed frontage.
3                           Two story traditional stone building, on the left of the street as faced, just before the bed.  Two large white double doors.
4                  Church in the middle of a small square/island - no spire but with large stained glass window in gable end, slightly obscured by trees.
5                                                                      Building with columns behind the one in front, also in similar architecture style.
6                                                                   End cent

In [120]:
df_real_unique.shape

(878, 12)

There's multiple data containing multiple sentences. We should split those.

In [121]:
# split the annotation column into lists, explode to one row per sentence
df_real_split = (
    df_real_unique
      .assign(annotation=df_real_unique['annotation'].str.split(r'\.'))
      .explode('annotation')
      .reset_index(drop=True)
)

# strip whitespace and drop any empty strings
df_real_split['annotation'] = df_real_split['annotation'].str.strip()
df_real_split = df_real_split[df_real_split['annotation'] != '']

df_real_split.shape

(1245, 12)

Should I sample refclef with 1245 sentences as well?

In [122]:
df_real_split['annotation'].tail(20)

1690                                                                                                                   Large cream columned building
1691                                                                                                                                    Small Church
1692                                                                                                                  Festival Theatre made of glass
1693                                        A museum that looks like an ancient Greek temple with columns to two sides, except still in a good state
1694                                                                                                     There are some classic statues on top of it
1696                                                                            It's a restaurant and possibly pub called Deacon Brodie's Tavern, no
1697                                                                                                      

In [123]:
df_real_split.to_csv

<bound method NDFrame.to_csv of       userid  age  gender photoid    x    y                                                                                                                                    annotation     status  phrase length  validator_userid  validator_age validator_gender
0          2    4    male   img23  495  157                                                                                                            Pub called 'Deacon Brodie's Tavern    correct            123                26              6             male
1          2    4    male   img23  495  157                                                       Black and white traditional building with sign hanging outside and 3 big arched windows    correct            123                26              6             male
2          2    4    male   img31    0    0                                                                                      Traditional early Victorian terrace on 3 floors + dor

In [124]:
# try:
#     nltk.data.find('taggers/averaged_perceptron_tagger_eng')
#     print("‘averaged_perceptron_tagger_eng’ is installed ")
# except LookupError:
#     print("‘averaged_perceptron_tagger’ is NOT installed  (LookupError raised)")


In [125]:
try:
    nltk.pos_tag("This is a test".split())
    print("Tagger works!")
except LookupError as e:
    print("LookupError:", e)
    print("So the tagger isn’t installed yet.")


Tagger works!


In [126]:
# nltk.pos_tag() uses the Penn Treebank Tag Set.
unique_sents = df_real_split['annotation'].unique()

In [139]:
# generate cfg rule dump constituent tree as grammar:

S = Nonterminal('S')
prod_set = set()

for sent in unique_sents:
    tokens = sent.split()

    tagged = nltk.pos_tag(tokens)  
    # build the S → POS1 POS2 … production
    rhs_pos = [Nonterminal(pos) for (_tok, pos) in tagged]
    prod_set.add(Production(S, rhs_pos))

    # build each POS → "word" production
    for word, pos in tagged:
        prod_set.add(Production(Nonterminal(pos), [word]))


grammar = CFG(S, list(prod_set))
# print(grammar)
parser = EarleyChartParser(grammar)

# start_sym = trees[0].label()           # usually 'S'

# prods = list({ p for p in all_prods })  # set-dedupe, original producing 2000 cfg rules
# grammar = CFG(Nonterminal(start_sym), prods)


# test on the first sentence
test_sent = unique_sents_unpunct[1].split()
for tree in parser.parse(test_sent):
    print(tree)

(S
  (NNP Black)
  (CC and)
  (JJ white)
  (JJ traditional)
  (NN building)
  (IN with)
  (NN sign)
  (VBG hanging)
  (JJ outside)
  (CC and)
  (CD 3)
  (JJ big)
  (VBD arched)
  (NNS windows))


In [128]:
# S = Nonterminal('S')
# prods = []

# for sent in unique_sents:
#     tagged = nltk.pos_tag(sent.split())
#     prev_nt = S

#     # for each token except the last, build NT_i → POS_i NT_{i+1}
#     for i, (w, pos) in enumerate(tagged[:-1]):
#         curr_nt = Nonterminal(f"X{i}")
#         prods.append(Production(prev_nt, [Nonterminal(pos), curr_nt]))
#         prods.append(Production(Nonterminal(pos), [w]))
#         prev_nt = curr_nt

#     # final link: X_{n-1} → POS_n
#     last_word, last_pos = tagged[-1]
#     prods.append(Production(prev_nt, [Nonterminal(last_pos)]))
#     prods.append(Production(Nonterminal(last_pos), [last_word]))

# grammar = CFG(S, prods)
# parser = EarleyChartParser(grammar)

# print(grammar)


### spaCy+benepar PCFG Neural Parser

In [129]:
from nltk import Tree

In [130]:
# pip install spacy benepar torch
import spacy, benepar

nlp = spacy.load("en_core_web_sm")
# download (or wrap in try/except to skip if already present)
# benepar.download("benepar_en3")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})


  state_dict = torch.load(


<benepar.integrations.spacy_plugin.BeneparComponent at 0x1445a2df0>

In [138]:
punct_tags = {",", ":", "``", "''", ".", "(", ")", "--", "``", "+"}

unique_sents_unpunct = [
    sent for sent in unique_sents if not any(tag in sent for tag in punct_tags) # reduced 50% rules
]

In [140]:
# 2. collect all the parse trees
trees = []
for doc in nlp.pipe(unique_sents_unpunct):
    for sent in doc.sents:
        bracketed = sent._.parse_string     # "(S (NP …) (VP …))"
        tree = Tree.fromstring(bracketed)
        # print(tree)
        trees.append(tree)


# 3. extract productions
all_prods = []
for t in trees:
    all_prods += t.productions()

# 4. build a grammar (dedupe productions)
start_sym = trees[0].label()           # usually 'S'

prods = list({ p for p in all_prods })  # set-dedupe, original producing 2000 cfg rules
grammar = CFG(Nonterminal(start_sym), prods)

all_prods



[NP -> NP VP,
 NP -> NN,
 NN -> 'Pub',
 VP -> VBN S,
 VBN -> 'called',
 S -> `` NP,
 `` -> "'",
 NP -> NP NNP,
 NP -> NNP NNP POS,
 NNP -> 'Deacon',
 NNP -> 'Brodie',
 POS -> "'s",
 NNP -> 'Tavern',
 NP -> NP PP,
 NP -> ADJP JJ NN,
 ADJP -> JJ CC JJ,
 JJ -> 'Black',
 CC -> 'and',
 JJ -> 'white',
 JJ -> 'traditional',
 NN -> 'building',
 PP -> IN NP,
 IN -> 'with',
 NP -> NP CC NP,
 NP -> NP VP,
 NP -> NN,
 NN -> 'sign',
 VP -> VBG ADVP,
 VBG -> 'hanging',
 ADVP -> RB,
 RB -> 'outside',
 CC -> 'and',
 NP -> CD JJ JJ NNS,
 CD -> '3',
 JJ -> 'big',
 JJ -> 'arched',
 NNS -> 'windows',
 FRAG -> ADJP PP,
 ADJP -> JJ RB,
 JJ -> 'Third',
 RB -> 'in',
 PP -> IN NP,
 IN -> 'from',
 NP -> NP PP,
 NP -> DT NN,
 DT -> 'the',
 NN -> 'left',
 PP -> IN NP,
 IN -> 'of',
 NP -> DT NN NN,
 DT -> 'the',
 NN -> 'end',
 NN -> 'terrace',
 NP -> CD JJ JJ JJ NNS,
 CD -> 'Two',
 JJ -> 'large',
 JJ -> 'white',
 JJ -> 'double',
 NNS -> 'doors',
 NP -> NP PP,
 NP -> JJ JJ NN,
 JJ -> 'Big',
 JJ -> 'historical',
 NN

In [133]:
len(all_prods)

15429

In [137]:
# test on the first sentence
test_sent = trees[0].leaves()

for t in parser.parse(test_sent):
# #     # t.pretty_print()
    print(t)

ValueError: Grammar does not cover some of the input words: '"\'", \'Brodie\', "\'s"'.

In [None]:
# 4. build a grammar (dedupe productions)
start_sym = trees[0].label()           

prods = list({ p for p in all_prods })  # set-dedupe, original producing 2000 cfg rules
grammar = CFG(Nonterminal(start_sym), prods)

punct_tags = {",", ":", "``", "''", ".", "(", ")", "--"}
filtered_prods = [
  p for p in all_prods
  if not any(
    isinstance(sym, Nonterminal) and sym.symbol() in punct_tags
    for sym in p.rhs()
  )
]
# grammar = CFG(Nonterminal(start_sym), filtered_prods) # producing 26k++ rules
print(len(filtered_prods))

25698


In [None]:
# punct_tags = {",", ":", "``", "''", ".", "(", ")", "--"}
# filtered_prods = [
#   p for p in all_prods
#   if not any(
#     isinstance(sym, Nonterminal) and sym.symbol() in punct_tags
#     for sym in p.rhs()
#   )
# ]
# grammar = CFG(Nonterminal(start_sym), filtered_prods) # producing 25k+ rules
# # before you extract productions, do for each tree:
# for t in trees:
#     # this will break any rule X → A B C D … into a chain of binaries
#     Tree.chomsky_normal_form(t, horzMarkov=2)

# # extract production to keep RHS at length 2 or 1

# from collections import Counter

# counts = Counter(all_prods)
# keep = [p for p in all_prods if counts[p] > 1]   # e.g. freq > 1
# grammar = CFG(Nonterminal(start_sym), list({p for p in keep}))


In [None]:
print(grammar)

Grammar with 1254 productions (start state = NP)
    VP -> MD PP
    VBN -> 'named'
    NN -> 'blind'
    NNS -> 'pillars'
    NN -> 'archway'
    NN -> 'person'
    NN -> 'postbox'
    ADJP -> CD HYPH VBN
    JJ -> 'beautiful'
    NNS -> 'tables'
    NP -> NP , ADJP
    NN -> 'clock'
    NP -> DT NNP
    NN -> 'day'
    NN -> 'image'
    NP -> DT JJ , JJ NN
    IN -> 'in'
    NN -> 'style'
    VP -> VBZ ADVP VP
    RB -> 'directly'
    JJ -> 'opposite'
    VBN -> 'passed'
    JJ -> 'vertical'
    NN -> 'design'
    IN -> 'up'
    VP -> VBP VP
    VB -> 'look'
    NN -> 'outside'
    NN -> 'tavern'
    S -> NP
    NN -> 'colour'
    NNP -> 'Castro'
    NN -> 'pyramid'
    NP -> DT NML JJ NN
    JJ -> 'outside'
    IN -> 'from'
    S -> CC NP VP
    NNP -> 'Carlton'
    NP -> JJ PP
    NN -> 'part'
    JJ -> 'pretty'
    ADVP -> RBS RB
    NP -> NNS NN
    VBN -> 'nestled'
    RBS -> 'least'
    : -> '-'
    NP -> DT JJ , NML NN
    NNP -> 'Church'
    JJ -> 'twin'
    DT -> 'no'
    DT

In [None]:
test_tokens = trees[2].leaves()
print(test_tokens)
# for t in parser.parse(test_tokens):
# #     # t.pretty_print()
#     print(t)

['Traditional', 'early', 'Victorian', 'terrace', 'on', '3', 'floors', '+', 'dormer']
