In [1]:
from surface import grammar
from surface import converter
from collections import defaultdict
import ast

In [2]:
def make_default_structure(graph_data, word_id):
    """Ensure ``graph_data`` has an entry for ``word_id``.

    A missing id receives a fresh node with an empty ``"word"`` string and an
    empty ``"deps"`` dict; an id that is already present is left untouched.
    The mapping is modified in place and nothing is returned.
    """
    # setdefault only stores the freshly built node when the key is absent.
    graph_data.setdefault(word_id, {"word": "", "deps": {}})

In [3]:
def get_parse(fn):
    """Return the comma-separated items found on the second line of ``fn``.

    The first line of the file is skipped. The second line is stripped of
    surrounding whitespace and of leading/trailing ``[`` / ``]`` characters,
    split on commas, and each piece is stripped of whitespace.

    Raises ``StopIteration`` when the file has fewer than two lines.
    """
    with open(fn, "r") as handle:
        next(handle)  # discard the first line
        second_line = next(handle)
    bracketless = second_line.strip().strip("[]")
    return [piece.strip() for piece in bracketless.split(",")]

In [4]:
def set_parse(fn, graph):
    """Write a four-line corpus file to ``fn`` containing ``graph``.

    The file consists of two fixed ``#`` header lines, then ``graph`` on its
    own line, then a fixed ``(dummy_0 / dummy_0)`` line. Any existing file at
    ``fn`` is overwritten.
    """
    header_lines = (
        "# IRTG unannotated corpus file, v1.0\n",
        "# interpretation ud: de.up.ling.irtg.algebra.graph.GraphAlgebra\n",
    )
    footer_line = "(dummy_0 / dummy_0)\n"
    with open(fn, "w+") as out:
        out.writelines(header_lines)
        out.write(graph + "\n")
        out.write(footer_line)

In [5]:
def _edge_direction(mor):
    """Translate a dependent's ``mor`` column into the edge's ``"dir"`` value.

    Returns ``"S"`` when the column contains ``lin=+``, ``"B"`` when it
    contains ``lin=-`` and ``None`` otherwise (``lin=+`` wins if both occur).
    """
    if "lin=+" in mor:
        return "S"
    if "lin=-" in mor:
        return "B"
    return None


def _sentence_subgraphs(graph_data):
    """Build one ``{"root", "graph"}`` entry per token of a collected sentence."""
    subgraphs = []
    for node in graph_data.values():
        # Ids that were only ever seen in the head column have no token line
        # of their own and therefore no "tree_pos"; they yield no rule.
        if "tree_pos" not in node:
            continue
        edges = [
            {
                "to": graph_data[dep_id]["tree_pos"],
                "edge": ud_edge.replace(":", "_"),
                "dir": _edge_direction(graph_data[dep_id]["mor"]),
            }
            for dep_id, ud_edge in node["deps"].items()
        ]
        subgraphs.append({"root": node["tree_pos"], "graph": edges})
    return subgraphs


def extract_rules(dev):
    """Extract per-sentence head/dependent rules from a tab-separated corpus file.

    The file is read line by line (the ``.conllu`` inputs used in this
    notebook suggest the CoNLL-U layout):

    * a line that is exactly ``"\\n"`` ends the current sentence;
    * a line starting with ``# text`` is stored (stripped) as the text of the
      current sentence; every other ``#`` line is ignored;
    * any other line is split on tabs and columns 0, 1, 3, 5, 6 and 7 are read
      as id, word, tree position tag, morphology, head id and edge label.

    For every token of a sentence one dict is produced:
    ``{"root": <token's column 3>, "graph": [<edge>, ...]}`` where each edge
    describes one dependent of that token as
    ``{"to": <dependent's column 3>, "edge": <label with ":" replaced by "_">,
    "dir": "S" | "B" | None}``.

    Unlike the earlier version, a final sentence that is not followed by a
    blank line is no longer dropped.

    Parameters
    ----------
    dev : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(id_to_rules, id_to_sentence)``: a ``defaultdict(list)`` mapping the
        running sentence counter (starting at 0) to that sentence's list of
        rule dicts, and a dict mapping the same counter to the stripped
        ``# text`` line.

    Raises
    ------
    IndexError
        If a token line has fewer than eight tab-separated columns.
    """
    id_to_rules = defaultdict(list)
    id_to_sentence = {}
    graph_data = {}
    sentences = 0

    with open(dev, "r") as f:
        for line in f:
            if line == "\n":
                subgraphs = _sentence_subgraphs(graph_data)
                # Only touch the defaultdict when there is something to store,
                # so stray blank lines do not create empty entries.
                if subgraphs:
                    id_to_rules[sentences].extend(subgraphs)
                graph_data = {}
                sentences += 1
                continue

            if line.startswith("#"):
                if line.startswith("# text"):
                    id_to_sentence[sentences] = line.strip()
                continue

            fields = line.split("\t")
            word_id = fields[0]
            head = fields[6]

            make_default_structure(graph_data, word_id)
            graph_data[word_id]["word"] = fields[1]
            graph_data[word_id]["tree_pos"] = fields[3]
            graph_data[word_id]["mor"] = fields[5]

            # The head may not have been read yet; register a placeholder so
            # the dependency can be attached now and filled in later.
            make_default_structure(graph_data, head)
            graph_data[head]["deps"][word_id] = fields[7]

    # Flush a last sentence that was not terminated by a blank line.
    trailing = _sentence_subgraphs(graph_data)
    if trailing:
        id_to_rules[sentences].extend(trailing)

    return id_to_rules, id_to_sentence

In [6]:
# Input paths, all relative to the notebook's working directory.
# Corpus handed to grammar.generate_grammar when building each per-sentence grammar.
GRAMMAR_FILE = "../count_en_ewt-ud-train.conllu"
# Dev corpus whose converter.convert output supplies the sentences written to the result file.
TERMINAL_FILE_ORIG = "../en_ewt-ud-dev-orig.conllu"
# Dev corpus used for rule extraction, graph conversion and terminal generation.
TERMINAL_FILE_MIXED = "../en_ewt-ud-dev-mixed.conllu"

In [7]:
# Per-sentence rule lists read from the mixed dev file; the "# text" mapping is discarded.
# NOTE(review): assigning to `_` shadows IPython's "last output" variable in this kernel.
rules, _ = extract_rules(TERMINAL_FILE_MIXED)
# First value returned by converter.convert for the mixed file; indexed per sentence
# below and written out as the parser input. The second value is discarded.
graphs, _ = converter.convert(TERMINAL_FILE_MIXED)
# Only the second value returned for the original dev file is kept; it is indexed
# per sentence when the results are written.
_, sentences = converter.convert(TERMINAL_FILE_ORIG)

In [8]:
id_to_parse = {}
for i in range(len(rules)):    
    grammar_fn = open('dep_grammar_spec.irtg', 'w') 
    grammar.generate_grammar(GRAMMAR_FILE, rules[i], grammar_fn)
    grammar.generate_terminals(TERMINAL_FILE_MIXED, grammar_fn)
    grammar_fn.close()
    set_parse("ewt_ones", graphs[i])
    !java -Xmx32G -cp /home/kovacs/projects/alto/build/libs/alto-2.3.6-SNAPSHOT-all.jar de.up.ling.irtg.script.ParsingEvaluator ewt_ones -g dep_grammar_spec.irtg -I ud -O string=toString -o surface_eval_ewt
    parse = get_parse("surface_eval_ewt")
    id_to_parse[i] = parse

KeyError: 'edge'

In [9]:
# Write one record per parsed sentence: the sentence, its graph, the parse
# (as the string form of the list), followed by an empty separator line.
with open("result_parse", "w") as f:
    for i, parse in id_to_parse.items():
        record = (sentences[i], graphs[i], str(parse), "")
        f.write("\n".join(record) + "\n")

In [None]:
# Regenerate the grammar file for a single sentence (index 39) so it can be
# inspected or parsed by hand.
# TERMINAL_FILE is not defined anywhere in this notebook; the mixed dev file is
# used instead, matching the per-sentence parsing loop.
with open('dep_grammar_spec.irtg', 'w') as grammar_fn:
    grammar.generate_grammar(GRAMMAR_FILE, rules[39], grammar_fn)
    grammar.generate_terminals(TERMINAL_FILE_MIXED, grammar_fn)

In [9]:
# Display the rules extracted for sentence index 0: one {"root", "graph"} entry per token.
rules[0]

[{'graph': [{'dir': 'S', 'edge': 'punct', 'to': 'PUNCT'},
   {'dir': None, 'edge': 'nsubj', 'to': 'NOUN'},
   {'dir': None, 'edge': 'obl', 'to': 'PROPN'}],
  'root': 'VERB'},
 {'graph': [], 'root': 'DET'},
 {'graph': [], 'root': 'ADP'},
 {'graph': [{'dir': None, 'edge': 'case', 'to': 'ADP'},
   {'dir': None, 'edge': 'det', 'to': 'DET'}],
  'root': 'PROPN'},
 {'graph': [], 'root': 'DET'},
 {'graph': [], 'root': 'PUNCT'},
 {'graph': [{'dir': None, 'edge': 'det', 'to': 'DET'}], 'root': 'NOUN'}]