In [1]:
from surface import grammar;
from surface import converter
from collections import defaultdict
import ast

In [2]:
def make_default_structure(graph_data, word_id):
    """Make sure ``graph_data`` contains a node entry for ``word_id``.

    A missing id is initialised to a node with an empty ``"word"`` string and
    an empty ``"deps"`` dict. An id that is already present is left untouched.
    The mapping is modified in place; nothing is returned.
    """
    graph_data.setdefault(word_id, {"word": "", "deps": {}})

In [20]:
def get_parse(fn, conll):
    """Read the parse on the second line of ``fn`` and resolve it via ``conll``.

    The first line of the file is skipped. The second line is stripped of
    surrounding whitespace and ``[``/``]`` characters and split on commas.
    For every resulting item, the second underscore-separated piece is used
    as the key into ``conll``.

    Args:
        fn: Path of the file to read.
        conll: Mapping from that key to an indexable entry.

    Returns:
        A pair ``(text_parse, conll_parse)``: ``text_parse`` lists element
        ``[1]`` of each looked-up entry in file order, and ``conll_parse``
        maps the item's position (0-based) to the full entry.
    """
    with open(fn, "r") as handle:
        next(handle)  # the first line is not part of the parse
        raw_line = next(handle).strip()

    items = [piece.strip() for piece in raw_line.strip("[]").split(",")]

    text_parse = []
    conll_parse = {}
    for position, item in enumerate(items):
        entry = conll[item.split("_")[1]]
        conll_parse[position] = entry
        text_parse.append(entry[1])
    return text_parse, conll_parse

In [4]:
def set_parse(fn, graph):
    """Write a small IRTG corpus file containing ``graph`` to ``fn``.

    The file is created or truncated. It receives two header comment lines,
    then ``graph`` on a line of its own, then a fixed ``(dummy_0 / dummy_0)``
    line.

    Args:
        fn: Path of the file to write.
        graph: Graph string; a newline is appended to it.
    """
    preamble = (
        "# IRTG unannotated corpus file, v1.0\n",
        "# interpretation ud: de.up.ling.irtg.algebra.graph.GraphAlgebra\n",
    )
    with open(fn, "w+") as corpus:
        for header_line in preamble:
            corpus.write(header_line)
        corpus.write(graph + "\n")
        corpus.write("(dummy_0 / dummy_0)\n")

In [5]:
def _sentence_subgraphs(graph_data):
    """Turn one sentence's node table into a list of rule subgraphs.

    For every entry of ``graph_data`` that has a ``"tree_pos"`` (in insertion
    order) one dict ``{"root": <tree_pos>, "graph": [<edge>, ...]}`` is
    produced. Each edge describes one dependent of that entry:
    ``"to"`` is the dependent's ``"tree_pos"``, ``"edge"`` is the relation
    label with ``":"`` replaced by ``"_"``, and ``"dir"`` is ``"S"`` if the
    dependent's ``"mor"`` contains ``"lin=+"``, ``"B"`` if it contains
    ``"lin=-"``, otherwise ``None``.
    """
    subgraphs = []
    for node in graph_data.values():
        # Ids that only occurred in the head column never got a "tree_pos"
        # and therefore produce no subgraph of their own.
        if "tree_pos" not in node:
            continue
        edges = []
        for dep_id, edge_label in node["deps"].items():
            dependent = graph_data[dep_id]
            mor = dependent["mor"]
            if "lin=+" in mor:
                direction = "S"
            elif "lin=-" in mor:
                direction = "B"
            else:
                direction = None
            edges.append({
                "to": dependent["tree_pos"],
                "edge": edge_label.replace(":", "_"),
                "dir": direction,
            })
        subgraphs.append({"root": node["tree_pos"], "graph": edges})
    return subgraphs


def extract_rules(dev):
    """Collect per-sentence rule subgraphs from a tab-separated treebank file.

    Sentences are separated by blank lines and numbered from 0 in file order.
    Lines starting with ``#`` are skipped, except that a line starting with
    ``# text`` is recorded (stripped) for the current sentence. Every other
    line is split on tabs; columns 0, 1, 3, 5, 6 and 7 are used as token id,
    word, tree position tag, morphology string, head id and edge label.

    A final sentence that is not followed by a blank line is flushed at end
    of file instead of being dropped.

    Args:
        dev: Path of the file to read.

    Returns:
        A pair ``(id_to_rules, id_to_sentence)``: ``id_to_rules`` is a
        ``defaultdict(list)`` mapping the sentence index to its subgraph
        dicts, and ``id_to_sentence`` maps the sentence index to its
        ``# text`` line.
    """
    graph_data = {}
    id_to_rules = defaultdict(list)
    id_to_sentence = {}
    sentences = 0

    with open(dev, "r") as f:
        for line in f:
            if line == "\n":
                subgraphs = _sentence_subgraphs(graph_data)
                # Only touch the defaultdict when there is something to add,
                # so empty "sentences" do not create keys.
                if subgraphs:
                    id_to_rules[sentences].extend(subgraphs)
                graph_data = {}
                sentences += 1
                continue
            if line.startswith("# text"):
                id_to_sentence[sentences] = line.strip()
            if line.startswith("#"):
                continue

            fields = line.split("\t")
            word_id = fields[0]
            # NOTE(review): this column was previously bound to a local named
            # "lemma"; in standard CoNLL-U the second column is FORM and the
            # fourth is UPOS -- confirm the column layout of this file.
            form = fields[1]
            tree_pos = fields[3]
            mor = fields[5]
            head = fields[6]
            ud_edge = fields[7]

            make_default_structure(graph_data, word_id)
            graph_data[word_id]["word"] = form
            graph_data[word_id]["tree_pos"] = tree_pos
            graph_data[word_id]["mor"] = mor

            # The head may not have been seen yet; register it so the
            # dependency can be attached now and filled in later.
            make_default_structure(graph_data, head)
            graph_data[head]["deps"][word_id] = ud_edge

    # Flush a trailing sentence when the file does not end with a blank line.
    trailing = _sentence_subgraphs(graph_data)
    if trailing:
        id_to_rules[sentences].extend(trailing)

    return id_to_rules, id_to_sentence

In [6]:
# Input passed to grammar.generate_grammar when a grammar file is built.
GRAMMAR_FILE = "../count_en_ewt-ud-train.conllu"
# Input for converter.convert and grammar.get_conll_from_file.
TERMINAL_FILE_ORIG = "../en_ewt-ud-dev-orig.conllu"
# Input for extract_rules and grammar.generate_terminals.
TERMINAL_FILE_MIXED = "../en_ewt-ud-dev-mixed.conllu"

In [7]:
# Rule subgraphs come from the mixed file; the "# text" lines it also returns
# are not needed here.
rules, _ = extract_rules(TERMINAL_FILE_MIXED)
# Unpack all three results from a single call instead of converting the same
# file twice just to pick different tuple slots.
graphs, sentences, id_graphs = converter.convert(TERMINAL_FILE_ORIG)
conll = grammar.get_conll_from_file(TERMINAL_FILE_ORIG)

In [None]:
id_to_parse = {}
for i in range(len(rules)):    
    grammar_fn = open('dep_grammar_spec.irtg', 'w') 
    grammar.generate_grammar(GRAMMAR_FILE, rules[i], grammar_fn)
    grammar.generate_terminals(TERMINAL_FILE_MIXED, grammar_fn)
    grammar_fn.close()
    set_parse("ewt_ones", graphs[i])
    !java -Xmx32G -cp alto-2.3.6-SNAPSHOT-all.jar de.up.ling.irtg.script.ParsingEvaluator ewt_ones -g dep_grammar_spec.irtg -I ud -O string=toString -o surface_eval_ewt
    parse = get_parse("surface_eval_ewt")
    id_to_parse[i] = parse

In [13]:
# Dump one record per parsed sentence: its text line, its graph, the parse
# result, and a blank separator line.
with open("result_parse", "w+") as f:
    for i, parse_result in id_to_parse.items():
        f.write(sentences[i] + "\n")
        f.write(graphs[i] + "\n")
        f.write(str(parse_result) + "\n")
        f.write("\n")

In [24]:
sen_id = 1623
grammar_fn = open('dep_grammar_spec.irtg', 'w') 
grammar.generate_grammar(GRAMMAR_FILE, rules[sen_id], grammar_fn)
grammar.generate_terminal_ids(conll[sen_id], grammar_fn)
grammar_fn.close()
set_parse("ewt_ones", id_graphs[sen_id])
!java -Xmx32G -cp alto-2.3.6-SNAPSHOT-all.jar de.up.ling.irtg.script.ParsingEvaluator ewt_ones -g dep_grammar_spec.irtg -I ud -O string=toString -o surface_eval_ewt
text_parse, conll_parse = get_parse("surface_eval_ewt", conll[sen_id])

Processing ewt_ones (2 instances) ...
1 [[NOUN_4/NOUN_4 -punct-> PUNCT_12/PUNCT_12; NOUN_4 ] 296 ms
2 [[dummy_0/dummy_0]                                 ] 3 ms
Done, total time: 342 ms


In [23]:
# Look up the stored "# text" line for sentence index 1048.
sentences[1048]

"# text = does teacher's camp in baguio also accomodate even 1 person??"

In [None]:
# Show both values that were assigned from get_parse in the single-sentence run.
print(text_parse)
print(conll_parse)

In [None]:
# Display entry 278 of the graphs returned by converter.convert.
graphs[278]

In [None]:
# Display entry 1048 of id_graphs (the third value returned by converter.convert).
id_graphs[1048]

In [None]:
# Display entry 1048 of the data loaded by grammar.get_conll_from_file.
conll[1048]