In [1]:
from surface import grammar
from surface import converter
from collections import defaultdict

In [2]:
def make_default_structure(graph_data, word_id):
    if word_id not in graph_data:
        graph_data[word_id] = {
            "word": "",
            "deps": {},
        }

In [3]:
def extract_rules(dev):
    graph_data = {}
    noun_list = []
    id_to_rules = defaultdict(list)
    id_to_sentence = {}
    sentences = 0
    with open(dev, "r") as f:
        for i,line in enumerate(f):            
            if line == "\n":
                words = []
                for w in graph_data:
                    words.append(graph_data[w]["word"])
                    subgraphs = {"root": None, "graph": []}
                    rules = []
                    if "tree_pos" not in graph_data[w]:
                        continue
                    
                    subgraphs["root"] = graph_data[w]["tree_pos"]
                    
                    for dep in graph_data[w]["deps"]:                        
                        edge_dep = graph_data[w]["deps"][dep]
                        to_pos = graph_data[dep]["tree_pos"]
                        mor = graph_data[dep]["mor"]
                            
                        if "tree_pos" in graph_data[w]:
                            if "lin=+" in mor:
                                subgraphs["graph"].append({"to":to_pos, "edge":edge_dep, "dir":"S"})
                            elif "lin=-" in mor:
                                subgraphs["graph"].append({"to":to_pos, "edge":edge_dep, "dir":"B"})
                            else:
                                subgraphs["graph"].append({"to":to_pos, "edge":edge_dep, "dir":None})

                    id_to_rules[sentences].append(subgraphs)
                    id_to_sentence[sentences] = words
                graph_data = {}
                noun_list = []
                sentences += 1
                continue
            if line.startswith("#"):
                continue
            if line != "\n":
                fields = line.split("\t")
                word_id = fields[0]
                word = fields[1]
                tree_pos = fields[3]
                mor = fields[5]
                head = fields[6]
                ud_edge = fields[7]

                make_default_structure(graph_data, word_id)
                graph_data[word_id]["word"] = word
                graph_data[word_id]["tree_pos"] = tree_pos
                graph_data[word_id]["mor"] = mor

                make_default_structure(graph_data, head)
                graph_data[head]["deps"][word_id] = ud_edge
    return id_to_rules, id_to_sentence

In [4]:
GRAMMAR_FILE = "../count_en_ewt-ud-train.conllu"
TERMINAL_FILE = "../en_ewt-ud-dev.conllu"

In [5]:
rules, sens = extract_rules(TERMINAL_FILE)
sens[1183]

['Technically',
 'was',
 ',',
 'blackberry',
 '',
 'because',
 'first',
 'it',
 'was',
 'the',
 'with',
 'email',
 'real',
 'and',
 'games',
 'and',
 'stuff',
 '.']

In [9]:
#%%capture cap --no-stderr
grammar_fn = open('dep_grammar_spec.irtg', 'w') 
grammar.generate_grammar(GRAMMAR_FILE, rules[1183], grammar_fn)
grammar.generate_terminals(TERMINAL_FILE, grammar_fn)
grammar_fn.close()
#with open('dep_grammar_spec.irtg', 'w') as f:
#    f.write(cap.stdout)

In [None]:
converter.convert(TERMINAL_FILE)

In [None]:
!java -Xmx8G -cp alto-2.3.1-all.jar de.up.ling.irtg.script.ParsingEvaluator graphs_100 -g dep_grammar_spec.irtg -I ud -O string=toString -o surface_eval_ewt

Processing graphs_100 (100 instances) ...
01 [[deserved_0/deserved_0 -nsubj-> He_0/He_0; deserve] 509 ms
02 [[have_0/have_0 -nsubj-> Anyone_0/Anyone_0; recipe_] 129 ms
03 [[arabes_0/arabes_0 -case-> about_0/about_0; arabes] 123 ms
04 [[Yes_0/Yes_0 -punct-> PERIOD_0/PERIOD_0]          ] 41 ms
05 [[here_0/here_0 -advmod-> Here_0/Here_0; here_0 -ns] 88 ms
06 [[cup_0/cup_0 -nummod-> DIGIT_0/DIGIT_0; empanadas_] 62 ms
07 [[cup_0/cup_0 -nummod-> DIGIT_0/DIGIT_0; arabes_0/a] 54 ms
08 [[cup_0/cup_0 -nummod-> DIGIT_0/DIGIT_0; empanadas_] 65 ms
09 [[cup_0/cup_0 -nummod-> DIGIT_0/DIGIT_0; area_0/are] 81 ms
10 [[IT_0/IT_0 -det-> ALL_0/ALL_0; MIX_0/MIX_0 -obj-> ] 80 ms
11 [[POOP_0/POOP_0 -advmod-> THEN_0/THEN_0; IT_0/IT_0 ] 49 ms
12 [[End_0/End_0 -det-> The_0/The_0]                  ] 66 ms
13 [[SQs_0/SQs_0 -nsubj-> It_0/It_0; cooking_0/cooking] 92 ms
14 [[sent_0/sent_0 -nsubj-> I_0/I_0; phone_0/phone_0 -] 104 ms
15 [[sold_0/sold_0 -nsubj_pass-> I_0/I_0; sold_0 -aux_] 100 ms
16 [[phone_0/phone_0 -d