In [1]:
import sys
import os
import re

from decomp import UDSCorpus

# Put the miso_research checkout at the front of the module search path.
# NOTE(review): no import visible in this part of the notebook comes from that
# directory -- confirm it is still needed, and prefer a configurable path over
# a hard-coded home directory.
sys.path.insert(0, "/home/hltcoe/estengel/miso_research")


In [2]:
# Get all candidate PP-attachment ambiguity sentences (blocks containing an "obl:with" or "obl:in" relation).
def read_uds(old_path, new_path):
    """Collect sentences whose new-format block mentions ``obl:with`` or ``obl:in``.

    Both files are read whole and split on blank lines into blocks; the blocks
    are paired by position, so the two files must hold the same number of
    blocks in the same order.  A block is selected when the raw text of the
    *new* block contains the substring ``obl:with`` or ``obl:in`` (being a
    substring test, ``obl:in`` also matches longer labels such as
    ``obl:into``).  The sentence text is rebuilt from the *old* block by
    joining the second tab-separated column of each line with single spaces.

    Args:
        old_path: path to the file that supplies the sentence tokens.
        new_path: path to the file that is searched for the oblique relations.

    Returns:
        A list of sentence strings, one per selected block, in file order.
        The number of selected sentences is also printed.

    Raises:
        AssertionError: if the two files do not split into the same number
            of blocks.
    """
    # CoNLL-U files are UTF-8 by specification; do not depend on the locale.
    with open(old_path, encoding="utf-8") as f1, \
            open(new_path, encoding="utf-8") as f2:
        old_blocks = f1.read().split("\n\n")
        new_blocks = f2.read().split("\n\n")
    assert len(old_blocks) == len(new_blocks), (
        f"block count mismatch: {len(old_blocks)} vs {len(new_blocks)}"
    )

    examples = []
    for old_block, new_block in zip(old_blocks, new_blocks):
        if "obl:with" not in new_block and "obl:in" not in new_block:
            continue

        # Ignore blank lines and "#" comment lines so they cannot poison the
        # whole block.
        token_lines = [
            line for line in old_block.strip().split("\n")
            if line and not line.startswith("#")
        ]
        if not token_lines:
            continue

        try:
            text = " ".join(line.split("\t")[1] for line in token_lines)
        except IndexError:
            # A line without a second column: skip this block instead of
            # silently re-using the text of the previous sentence.
            continue

        examples.append(text)

    print(len(examples))
    return examples

# The new-format file is searched for the obliques; the old-format file
# supplies the correct sentence tokenization.
example_sents = read_uds(
    old_path="/home/hltcoe/estengel//Parser-v1/data/EWT/en-ud-dev.conllu",
    new_path="/home/hltcoe/estengel//Parser-v1/data/EWT/en_ewt-ud-dev.conllu",
)

254


In [3]:
# Subset the dev data to the sentences selected above.
c = UDSCorpus(split="dev")
# Hash the sentences once so each graph is checked with a set lookup instead
# of a scan over the whole list.
example_sent_set = set(example_sents)
i = 0
pp_graphs = {}
for gname, g in c.items():
    if g.sentence in example_sent_set:
        pp_graphs[gname] = g
        i += 1
# Make sure every selected sentence was found in the corpus.
assert i == len(example_sents), (
    f"matched {i} graphs but expected {len(example_sents)} sentences"
)

subcorpus = UDSCorpus()
# Manually set the subcorpus graphs.
# NOTE(review): this writes a private attribute of UDSCorpus -- check whether
# the constructor can be given the graphs directly instead.
subcorpus._graphs = pp_graphs
subcorpus.to_json("/home/hltcoe/estengel/miso_research/analysis/pp_graphs.json")


In [4]:
# NO SEMANTICS UAS: 93.3966249125261, LAS: 90.78295031582306, MLAS: 86.0005524913146, BLEX: 87.83526099148335
# WITH SEMANTICS: UAS: 94.34984112898016, LAS: 92.278784184067, MLAS: 88.48704077260655, BLEX: 89.59837183460822
# Is the with-semantics score significantly higher than the no-semantics one?

def get_with_root(block, do_print=False):
    """Return the head of the token that the word "with" is attached to.

    Each non-empty string in ``block`` is split on tabs.  Column 1 is compared
    with the literal ``"with"`` and column 6 is parsed as an integer head
    index (FORM and HEAD in the CoNLL-U layout -- assumed).  For a "with"
    token the function looks up the line at position ``head - 1`` and returns
    *that* line's head, i.e. the head of the head of "with".

    Line positions are used as token ids, so the block is assumed to hold
    exactly one line per token, in order -- TODO confirm for the log files.

    Args:
        block: list of tab-separated token lines; empty strings are ignored.
        do_print: when true, print the head line of each "with"; if that head
            is itself "with", print the block and exit via ``sys.exit()``.

    Returns:
        The integer head of the head of the last "with" in the block, or
        ``None`` when the block has no "with" or when the last "with" has
        head 0 (it has no head token to look up).
    """
    lines = [x for x in block if x != '']
    rows = [x.split("\t") for x in lines]
    double_head = None
    for row in rows:
        if row[1] != "with":
            continue

        with_head = int(row[6])
        if with_head == 0:
            # Head 0 means there is no head line; indexing with -1 would
            # silently read the last token of the sentence instead.
            double_head = None
            continue

        head_row = rows[with_head - 1]
        if do_print:
            print(head_row)
            if head_row[1] == "with":
                print("ERROR")
                print(lines)
                sys.exit()
        double_head = int(head_row[6])
    return double_head


def get_with_acc(pred_blocks, true_blocks, do_print=False):
    """Fraction of block pairs whose ``get_with_root`` results agree.

    Predicted and gold blocks are paired by position with ``zip``, so any
    surplus blocks in the longer list are ignored.

    NOTE(review): a pair in which neither block yields a value (both results
    are ``None``) compares equal and is counted as correct, so blocks without
    "with" raise the score -- confirm this is intended.

    Args:
        pred_blocks: list of predicted blocks (each a list of token lines).
        true_blocks: list of gold blocks, aligned with ``pred_blocks``.
        do_print: forwarded to ``get_with_root`` for the gold block only.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when there are no pairs
        to compare (instead of raising ``ZeroDivisionError``).
    """
    total = 0
    correct = 0
    for pb, tb in zip(pred_blocks, true_blocks):
        pred_head = get_with_root(pb)
        true_head = get_with_root(tb, do_print)
        total += 1
        if pred_head == true_head:
            correct += 1

    if total == 0:
        # Nothing to score: report 0.0 rather than dividing by zero.
        return 0.0
    return correct / total

def read_log(path):
    """Split a log file into predicted and gold blocks.

    The file is cut on blank lines and the last chunk is always discarded.
    Every remaining chunk is broken into lines, with lines starting with
    ``#`` removed.  Chunks at even positions (0, 2, ...) are returned as gold
    blocks and chunks at odd positions as predicted blocks.

    Args:
        path: path of the log file to read.

    Returns:
        A ``(pred_blocks, true_blocks)`` tuple of lists; each block is a list
        of line strings.
    """
    with open(path) as log_file:
        contents = log_file.read()

    # Everything after the final blank line is dropped.
    chunks = contents.split("\n\n")[:-1]

    # Break each chunk into lines and strip the comment lines.
    cleaned = [
        [row for row in chunk.split("\n") if not row.startswith("#")]
        for chunk in chunks
    ]

    # Chunks alternate: even positions are gold, odd positions are predicted.
    true_blocks = cleaned[0::2]
    pred_blocks = cleaned[1::2]
    return pred_blocks, true_blocks

# Score the log of the with-semantics run.
pred_blocks1, true_blocks1 = read_log(
    "/home/hltcoe/estengel/miso_research/analysis/with_sem.out"
)
with_sem_acc = get_with_acc(pred_blocks=pred_blocks1, true_blocks=true_blocks1)
print(f"with semantics: {with_sem_acc}")

# Score the log of the no-semantics run in the same way.
pred_blocks2, true_blocks2 = read_log(
    "/home/hltcoe/estengel/miso_research/analysis/no_sem.out"
)
no_sem_acc = get_with_acc(
    pred_blocks=pred_blocks2,
    true_blocks=true_blocks2,
    do_print=False,
)
print(f"no semantics: {no_sem_acc}")



with semantics: 0.8333333333333334
no semantics: 0.8333333333333334
