## Evaluate Refinement Relationships

This notebook describes the method for evaluating the random sample of refinement relations. First, we compute inter-rater reliability (Cohen's kappa) over two rounds of coding as we develop the heuristics for coding the relations. Next, we evaluate the sampled relations against the ground-truth dataset.

In [None]:
# Directory containing the CSV inputs that the cells below open via '%s/...' % data_path.
data_path = 'data1'

In [None]:
from sklearn.metrics import cohen_kappa_score
import csv

# Round 1: agreement between the codes stored in columns 2 and 3 of comps_round1.csv
# (presumably one column per rater -- confirm against the CSV layout).
# The file is read inside a `with` block so the handle is closed once parsing is done.
with open('%s/comps_round1.csv' % data_path, 'r') as round1_file:
    # Drop the first row (presumably the header) and keep the following 50 rows.
    rows1 = list(csv.reader(round1_file))[1:51]

# Collapse each code to a binary label: 'T' when it starts with 'REFINE', otherwise 'F'.
y1 = ['T' if r[2].startswith('REFINE') else 'F' for r in rows1]
y2 = ['T' if r[3].startswith('REFINE') else 'F' for r in rows1]
print(cohen_kappa_score(y1, y2))

# 0.28810720268006695

In [None]:
# Round 2: same agreement computation on comps_round2.csv.
# The file is read inside a `with` block so the handle is closed once parsing is done.
with open('%s/comps_round2.csv' % data_path, 'r') as round2_file:
    # Drop the first row (presumably the header).
    rows2 = list(csv.reader(round2_file))[1:]

# NOTE(review): rows2[51:100] selects 49 rows and skips rows2[50], whereas round 1
# scored 50 rows. If rows 51-100 of the sheet were intended this should be
# rows2[50:100]; the slice is left unchanged so the recorded kappa stays reproducible.
round2_rows = rows2[51:100]

# Collapse each code to a binary label: 'T' when it starts with 'REFINE', otherwise 'F'.
y1 = ['T' if r[2].startswith('REFINE') else 'F' for r in round2_rows]
y2 = ['T' if r[3].startswith('REFINE') else 'F' for r in round2_rows]
print(cohen_kappa_score(y1, y2))

# 0.7110362257792755

In [75]:
# Load the labelled and unlabelled comparison sheets. Each file is read inside a
# `with` block so the handle is closed as soon as the rows are materialised.
with open('%s/comps.csv' % data_path, 'r') as expected_file:
    expected = list(csv.reader(expected_file))
# NOTE(review): `predicted` is not used by any cell visible here -- confirm it is still needed.
with open('%s/comps_unlabeled.csv' % data_path, 'r') as predicted_file:
    predicted = list(csv.reader(predicted_file))

# Map 'col0:col1' pair keys to a binary ground-truth label taken from column 3:
# 'T' when the code starts with 'REFINE', otherwise 'F'. Rows with an empty
# column 3 are skipped.
responses = {}
for r in expected:
    if r[3] != '':
        k = '%s:%s' % (r[0], r[1])
        responses[k] = 'T' if r[3].startswith('REFINE') else 'F'

In [76]:
import networkx as nx

def get_cycles(edges):
    """Return a list of the simple cycles in the directed graph built from ``edges``.

    The list is empty when the graph contains no cycle.
    """
    digraph = nx.DiGraph(edges)
    return list(nx.simple_cycles(digraph))

def get_paths(edges):
    """Return all simple paths from in-degree-0 nodes to out-degree-0 nodes.

    A directed graph is built from ``edges``; every node with no incoming edge
    is used as a path source and the nodes with no outgoing edge are used
    together as the set of path targets. The paths found for each source are
    concatenated into one list.
    """
    digraph = nx.DiGraph(edges)
    sinks = [node for node, degree in digraph.out_degree() if degree == 0]
    sources = [node for node, degree in digraph.in_degree() if degree == 0]

    found = []
    for source in sources:
        found += nx.all_simple_paths(digraph, source, sinks)
    return found

def get_edge_set(paths):
    """Return the set of ordered pairs ``(earlier, later)`` taken from each path.

    For every path, each element is paired with every element that appears
    after it in the same path; pairs from all paths are merged into one set.
    """
    pairs = set()
    for path in paths:
        for position, head in enumerate(path):
            # Pair this element with everything that follows it on the path.
            pairs.update((head, tail) for tail in path[position + 1:])
    return pairs

def get_closure(e, edges):
    """Return every edge reachable by following edges forward from ``e[1]``.

    Args:
        e: The starting edge; only its second element (the head node) is used.
        edges: Iterable of hashable edges whose first two elements are the
            source and target node.

    Returns:
        A set of the edges from ``edges`` that start at ``e[1]`` or at any node
        reachable from it. ``e`` itself is included only if a cycle leads back
        to it.

    The traversal is iterative and tracks visited nodes, so it terminates on
    graphs with cycles or self-loops (the earlier recursive form recursed
    without bound there) and expands each node only once.
    """
    # Index edges by source node once so each expansion is a dictionary lookup.
    by_source = {}
    for edge in edges:
        by_source.setdefault(edge[0], []).append(edge)

    closure = set()
    visited = set()
    pending = [e[1]]
    while pending:
        node = pending.pop()
        if node in visited:
            continue
        visited.add(node)
        for edge in by_source.get(node, ()):
            closure.add(edge)
            pending.append(edge[1])

    return closure

def verify_graph(nodes, edges):
    """Return the edge endpoints that are not members of ``nodes``.

    Both elements of every edge are collected, and each distinct endpoint is
    checked with ``in`` against ``nodes``. An empty list means every endpoint
    was found.
    """
    sources = {edge[0] for edge in edges}
    targets = {edge[1] for edge in edges}
    referenced = sources | targets
    return [endpoint for endpoint in referenced if endpoint not in nodes]

In [77]:
import json

# NOTE(review): 'graphs3.json' is read from the working directory rather than
# from data_path -- confirm that is intended.
# The file is read inside a `with` block so the handle is closed after parsing.
with open('graphs3.json', 'r') as graphs_file:
    graphs = json.load(graphs_file)

# create a non-transitive set and implied set
# Each value in `graphs` is a list of (nodes, edges, implied) triples. Edges whose
# either endpoint is in `implied` go to imp_check; all others go to non_trans.
# Both sets store 'source:target' keys built from nodes[...] lookups.
imp_check = set()
non_trans = set()
for graph_list in graphs.values():
    for nodes, edges, implied in graph_list:
        # Keep only the first two elements of each edge, as hashable tuples.
        edges = [(e[0], e[1]) for e in edges]

        # validate graph: skip graphs with endpoints missing from `nodes` or with cycles
        missing = verify_graph(nodes, edges)
        if missing:
            continue
        cycles = get_cycles(edges)
        if cycles:
            continue

        for e in edges:
            pair_key = '%s:%s' % (nodes[e[0]], nodes[e[1]])
            if e[0] in implied or e[1] in implied:
                imp_check.add(pair_key)
            else:
                non_trans.add(pair_key)

print('Implied size: %i' % len(imp_check))
print('Non-transitive size: %i' % len(non_trans))

Implied size: 1139
Non-transitive size: 336


In [78]:
from collections import Counter
# NOTE(review): `keyed` and `similar` are not defined in the cells visible here --
# confirm they are created before this cell so Restart & Run All succeeds.

# Outcome label for each (r[0], ground-truth response) combination.
_OUTCOMES = {
    ('0', 'T'): 'tp',
    ('0', 'F'): 'fn',
    ('1', 'T'): 'fp',
    ('1', 'F'): 'tn',
    ('3', 'T'): 't3',
    ('3', 'F'): 'f3',
}
# Outcomes that also bump the 'fs' (false similar) tally when the pair is in `similar`.
_FALSE_SIMILAR_OUTCOMES = ('fn', 'fp', 'f3')
# Outcomes that are broken down further into the 'imp' / 'non' / 'trn' counters.
_BREAKDOWN_OUTCOMES = ('tp', 'fn', 'fp', 'tn')


def _accuracy(counts, correct, total):
    """Return sum of ``counts[correct]`` over sum of ``counts[total]``.

    Returns NaN instead of raising ZeroDivisionError when the denominator is 0,
    so an empty category does not abort the report.
    """
    denominator = sum(counts[name] for name in total)
    if denominator == 0:
        return float('nan')
    return sum(counts[name] for name in correct) / denominator


# 'all': every scored pair; 'imp': pairs in imp_check; 'non': pairs in non_trans;
# 'trn': pairs not in non_trans. 'non' and 'trn' partition the tp/fn/fp/tn pairs,
# while 'imp' is tallied independently of that split.
c = {'all': Counter(), 'imp': Counter(), 'non': Counter(), 'trn': Counter()}
seen = set()
for r in keyed:
    k = '%s:%s' % (r[1], r[2])
    # Score each pair only once, even if it appears in several rows.
    if k in seen:
        continue
    seen.add(k)
    if k not in responses:
        continue

    outcome = _OUTCOMES.get((r[0], responses[k]))
    if outcome is None:
        # Rows whose r[0] is not '0', '1' or '3' are not tallied.
        continue

    c['all'][outcome] += 1
    if outcome in _FALSE_SIMILAR_OUTCOMES and k in similar:
        c['all']['fs'] += 1 # false similar
    if outcome not in _BREAKDOWN_OUTCOMES:
        continue

    if k in imp_check:
        c['imp'][outcome] += 1
    c['non' if k in non_trans else 'trn'][outcome] += 1

print('Total pairs: %i' % len(seen))
print()
d = c['all']
print('Accuracy: %0.3f' % _accuracy(
    d, ('tp', 'tn', 't3'), ('tp', 'tn', 'fp', 'fn', 't3', 'f3')))
print('T3 / F3 = %i / %i' % (d['t3'], d['f3']))
print(d)
for report_label, counter_name in (('Implied', 'imp'), ('Non-Closure', 'non'), ('Closure', 'trn')):
    print()
    d = c[counter_name]
    print('%s Accuracy: %0.3f' % (
        report_label, _accuracy(d, ('tp', 'tn'), ('tp', 'tn', 'fp', 'fn'))))
    print(d)

Total pairs: 526

Accuracy: 0.694
T3 / F3 = 19 / 7
Counter({'tp': 178, 'tn': 168, 'fp': 82, 'fn': 72, 't3': 19, 'f3': 7})

Implied Accuracy: 0.727
Counter({'tp': 125, 'fn': 47})

Non-Closure Accuracy: 0.694
Counter({'tp': 31, 'fn': 11, 'fp': 4, 'tn': 3})

Closure Accuracy: 0.692
Counter({'tn': 165, 'tp': 147, 'fp': 78, 'fn': 61})
