In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os

In [5]:
import csv
from itertools import groupby
def read_conll(filename, columns, delimiter='\t'):
    """Read a CoNLL-style column file into per-column lists of sentences.

    Sentences are runs of non-blank rows separated by blank rows (rows whose
    fields are all empty or whitespace). Each sentence is transposed so that
    every column becomes one list of field values.

    Args:
        filename: Path of the file to read.
        columns: Mapping from output key to the zero-based column index to
            extract, e.g. ``{"Tokens": 0, "Preds": 1}``.
        delimiter: Single-character field separator passed to ``csv.reader``.

    Returns:
        A dict with the same keys as ``columns``. Each value is a list with
        one entry per sentence, and each entry is the list of that column's
        fields for the sentence. A file that contains no sentence at all
        yields an empty list for every key.

    Raises:
        IndexError: If a requested column index does not exist in the data.
    """
    def is_empty_line(line_pack):
        # csv.reader yields [] for a blank line; all([]) is True, so it counts as empty.
        return all(field.strip() == '' for field in line_pack)

    sentences = []
    # newline='' lets the csv module handle line endings itself (csv docs recommendation).
    with open(filename, newline='') as fp:
        reader = csv.reader(fp, delimiter=delimiter, quoting=csv.QUOTE_NONE)
        for is_empty, pack in groupby(reader, is_empty_line):
            if not is_empty:
                # Transpose the rows of one sentence into one list per column.
                sentences.append([list(field) for field in zip(*pack)])

    if not sentences:
        # Nothing to transpose: indexing the empty result would raise IndexError.
        return {colname: [] for colname in columns}

    # Transpose again: from per-sentence columns to per-column sentences.
    per_column = list(zip(*sentences))
    return {colname: list(per_column[index]) for colname, index in columns.items()}

In [6]:
# All four files are read with the same layout: space-separated fields,
# column 0 stored under "Tokens" and column 1 under "Preds".
columns = {"Tokens": 0, "Preds": 1}
delimiter = ' '

# Prediction files written by the BERT baseline runs.
og_pred_filename = "./baselines/bert_baseline/original_conll03/original_test_predictions.txt"
prop_pred_filename = "./baselines/bert_baseline/proposed_conll03/proposed_test_predictions.txt"

# Matching label files for the same two runs.
og_label_filename = "./baselines/bert_baseline/original_conll03/original_test_labels.txt"
prop_label_filename = "./baselines/bert_baseline/proposed_conll03/proposed_test_labels.txt"

# Load each file into a {"Tokens": [...], "Preds": [...]} dict of per-sentence lists.
og_pred_data = read_conll(og_pred_filename, columns=columns, delimiter=delimiter)
prop_pred_data = read_conll(prop_pred_filename, columns=columns, delimiter=delimiter)
og_label_data = read_conll(og_label_filename, columns=columns, delimiter=delimiter)
prop_label_data = read_conll(prop_label_filename, columns=columns, delimiter=delimiter)

In [7]:
# Wrap each loaded dict in a DataFrame with "Tokens" and "Preds" columns
# (one row per sentence, each cell holding that sentence's list of fields).
og_pred_df = pd.DataFrame(data=og_pred_data)
prop_pred_df = pd.DataFrame(data=prop_pred_data)
og_label_df = pd.DataFrame(data=og_label_data)
prop_label_df = pd.DataFrame(data=prop_label_data)

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

In [8]:
# Summary (count / unique / top / freq) of the per-sentence token and predicted-tag lists.
og_pred_df.describe()

Unnamed: 0,Tokens,Preds
count,3453,3453
unique,3184,1499
top,"[LONDON, 1996-12-06]","[B-ORG, O, O, O, O, O, O, O]"
freq,19,170


In [9]:
# Same summary for the label file; its tags are also stored under the "Preds" column name.
og_label_df.describe()

Unnamed: 0,Tokens,Preds
count,3453,3453
unique,3184,1469
top,"[LONDON, 1996-12-06]","[B-ORG, O, O, O, O, O, O, O]"
freq,19,167


In [13]:
# Collect every token-level disagreement between the label file and the
# prediction file of the original run.
# Each entry is [sentence_index, label_tag, predicted_tag, token_index].
# Note: the label frame keeps its tags under the "Preds" column as well,
# because both files were loaded with the same column mapping.
og_errors = []
# Walk the two frames in lockstep instead of hard-coding the row count, so
# the cell keeps working when the test set size changes.
for i, (label_tags, pred_tags) in enumerate(zip(og_label_df["Preds"], og_pred_df["Preds"])):
    if label_tags == pred_tags:
        continue  # sentence fully correct, nothing to record
    # zip stops at the shorter sequence, so a length mismatch between the two
    # files cannot raise IndexError.
    for e, (label_tag, pred_tag) in enumerate(zip(label_tags, pred_tags)):
        if label_tag != pred_tag:
            og_errors.append([i, label_tag, pred_tag, e])
len(og_errors)

1208

In [86]:
# Distinct sentence indices (first field of each error record) that contain
# at least one mismatching tag.
f_lines = {error[0] for error in og_errors}
len(f_lines)

710

In [84]:
def read_data(input_file):
    """Read a space-separated token/label file into joined sentence strings.

    For every line, the first space-separated field is taken as the word and
    the last one as the label. A sentence is emitted only when a blank line
    is reached AND the most recently collected word is ``'.'``; a blank line
    after any other word does not end the sentence, so such material is
    merged into the following sentence.

    Args:
        input_file: Path of the file to read.

    Returns:
        A tuple ``(labels_list, lines)`` where ``lines`` is a list of
        ``(labels, words)`` pairs (both space-joined strings) and
        ``labels_list`` contains only the label strings, in the same order.

    Note:
        Tokens collected after the last emitted sentence are discarded when
        the end of the file is reached (there is no final flush).
    """
    lines = []
    words = []
    labels = []
    # Context manager guarantees the handle is closed, even on error.
    with open(input_file, 'r') as rf:
        for line in rf:
            stripped = line.strip()
            fields = stripped.split(' ')
            word = fields[0]
            label = fields[-1]
            # `words and ...` guards against a blank line before any token,
            # which would otherwise raise IndexError on words[-1].
            if len(stripped) == 0 and words and words[-1] == '.':
                # Empty entries come from blank lines that did not end a sentence.
                joined_labels = ' '.join([item for item in labels if len(item) > 0])
                joined_words = ' '.join([item for item in words if len(item) > 0])
                lines.append((joined_labels, joined_words))
                words = []
                labels = []
            # Blank lines are appended too (as ''); they are filtered out on join.
            words.append(word)
            labels.append(label)
    labels_list = [sentence[0] for sentence in lines]
    return labels_list, lines
   
# Re-read both prediction files as space-joined label strings (one per emitted sentence).
og_preds, og_lines = read_data(
    "./baselines/bert_baseline/original_conll03/original_test_predictions.txt"
)
prop_preds, prop_lines = read_data(
    "./baselines/bert_baseline/proposed_conll03/proposed_test_predictions.txt"
)

"""
finding incongruent sequences like B-PER I-LOC
currently only works for entities with two words
"""
def incongruent_label(labels):
    """Collect B-/I- tag pairs whose entity types disagree.

    Every label sequence is split on single spaces and scanned pairwise. A
    pair is reported when a tag with prefix ``B`` is immediately followed by
    a tag with prefix ``I`` and the entity types (the part after the first
    ``-``) differ, e.g. ``B-PER I-LOC``. Only this B→I transition is checked;
    mismatches between consecutive ``I-`` tags are not detected.

    Args:
        labels: Iterable of strings, each holding space-separated tags.

    Returns:
        A list of ``(b_tag, i_tag)`` tuples, one per incongruent pair, in
        order of appearance.
    """
    def split_tag(tag):
        # Returns (prefix, entity_type); a tag without '-' has an empty type.
        parts = tag.split('-')
        return parts[0], (parts[1] if len(parts) > 1 else '')

    incong = []
    for sequence in labels:
        tags = sequence.split(' ')
        # Pairing each tag with its successor never looks past the end, so a
        # sequence ending in a B- tag no longer raises IndexError.
        for current, following in zip(tags, tags[1:]):
            current_prefix, current_type = split_tag(current)
            if current_prefix != 'B':
                continue
            following_prefix, following_type = split_tag(following)
            if following_prefix == 'I' and current_type != following_type:
                incong.append((current, following))
    return incong

# Incongruent B-/I- pairs in the predictions of each run.
og_incongruent = incongruent_label(og_preds)
prop_incongruent = incongruent_label(prop_preds)

# Print the count followed by the pairs; the trailing '\n' leaves a blank
# line between the two reports.
print(len(og_incongruent), og_incongruent, '\n')
print(len(prop_incongruent), prop_incongruent)

52 [('B-ORG', 'I-PER'), ('B-ORG', 'I-PER'), ('B-PER', 'I-ORG'), ('B-LOC', 'I-MISC'), ('B-ORG', 'I-MISC'), ('B-ORG', 'I-MISC'), ('B-LOC', 'I-MISC'), ('B-LOC', 'I-MISC'), ('B-ORG', 'I-MISC'), ('B-LOC', 'I-ORG'), ('B-ORG', 'I-MISC'), ('B-PER', 'I-ORG'), ('B-ORG', 'I-PER'), ('B-LOC', 'I-MISC'), ('B-MISC', 'I-ORG'), ('B-ORG', 'I-MISC'), ('B-ORG', 'I-LOC'), ('B-LOC', 'I-ORG'), ('B-MISC', 'I-LOC'), ('B-LOC', 'I-ORG'), ('B-LOC', 'I-ORG'), ('B-LOC', 'I-ORG'), ('B-LOC', 'I-MISC'), ('B-ORG', 'I-LOC'), ('B-PER', 'I-LOC'), ('B-PER', 'I-LOC'), ('B-MISC', 'I-ORG'), ('B-LOC', 'I-PER'), ('B-LOC', 'I-ORG'), ('B-MISC', 'I-LOC'), ('B-MISC', 'I-ORG'), ('B-LOC', 'I-MISC'), ('B-MISC', 'I-ORG'), ('B-MISC', 'I-ORG'), ('B-MISC', 'I-ORG'), ('B-PER', 'I-ORG'), ('B-PER', 'I-LOC'), ('B-MISC', 'I-ORG'), ('B-LOC', 'I-MISC'), ('B-LOC', 'I-ORG'), ('B-LOC', 'I-ORG'), ('B-LOC', 'I-ORG'), ('B-LOC', 'I-ORG'), ('B-MISC', 'I-LOC'), ('B-MISC', 'I-LOC'), ('B-ORG', 'I-MISC'), ('B-LOC', 'I-MISC'), ('B-ORG', 'I-MISC'), ('B-LOC', 

In [85]:
from collections import Counter
# Frequency of each incongruent (B-tag, I-tag) pair, top 20, original run first.
for incongruent_pairs in (og_incongruent, prop_incongruent):
    print(Counter(incongruent_pairs).most_common(20))

[(('B-LOC', 'I-MISC'), 11), (('B-LOC', 'I-ORG'), 10), (('B-ORG', 'I-MISC'), 7), (('B-MISC', 'I-ORG'), 7), (('B-PER', 'I-ORG'), 4), (('B-MISC', 'I-LOC'), 4), (('B-ORG', 'I-PER'), 3), (('B-PER', 'I-LOC'), 3), (('B-ORG', 'I-LOC'), 2), (('B-LOC', 'I-PER'), 1)]
[(('B-LOC', 'I-ORG'), 42), (('B-ORG', 'I-PER'), 21), (('B-MISC', 'I-ORG'), 20), (('B-LOC', 'I-MISC'), 11), (('B-ORG', 'I-MISC'), 11), (('B-ORG', 'I-LOC'), 10), (('B-MISC', 'I-LOC'), 7), (('B-PER', 'I-ORG'), 4), (('B-LOC', 'I-PER'), 4), (('B-MISC', 'I-PER'), 3), (('B-PER', 'I-MISC'), 1)]
