In [1]:
import pandas as pd
import os

In [40]:
# Root folder holding one sub-folder per annotator group.
ANNO_FOLDER = 'pilot_annotations'

# Names of the two annotator groups (used as sub-folder names under ANNO_FOLDER).
G1, G2 = 'group1', 'group2'

# The four annotated files; F1..F4 are positional aliases into FILES.
FILES = [
    'dev_1-100.tsv',
    'dev_101-200.tsv',
    'dev_201-300.tsv',
    'dev_301-400.tsv',
]
F1, F2, F3, F4 = FILES

In [22]:
# Map each group name to the entries found in its folder (one per annotator).
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, while later cells pick annotators by position (groups[G1][0], ...).
groups = {
    group: sorted(os.listdir(os.path.join(ANNO_FOLDER, group)))
    for group in (G1, G2)
}

groups

{'group1': ['anatb', 'dafnaa', 'shayp', 'sinair', 'tzufa', 'vikab'],
 'group2': ['nuritg', 'shovals', 'tzipyl', 'vereds', 'yohayg', 'zefs']}

In [188]:
import re
def get_biose(df):
    """Convert the ``ner`` column of ``df`` into a flat list of BIOSE tags.

    Label forms handled, as they appear in ``df.ner``:

    * ``'_'``           -> ``'O'``
    * ``'TYPE'``        -> ``'S-TYPE'`` (label contains neither ``[`` nor ``_``)
    * ``'TYPE[n]...'``  -> ``B-``/``I-``/``E-TYPE``. Consecutive rows carrying
      the exact same label string are treated as one entity; only the first
      ``TYPE`` in the label is used and anything after ``[n]`` is ignored.
      A bracketed entity that spans a single row becomes ``S-TYPE``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``ner``.

    Returns
    -------
    list of str
        One tag per row of ``df``, in row order.

    Raises
    ------
    ValueError
        If a label is neither ``'_'``, a plain type, nor of the
        ``TYPE[n]`` form.
    """
    # Raw string: '\[' and '\d' are invalid escapes in a normal string literal.
    bracketed = re.compile(r'^(?P<type>[A-Z]+)\[(?P<num>\d+)\].*$')

    # Forward pass: decide O / S- / B- / I- from each label and its predecessor.
    tags = []
    prev_label = None
    for label in df.ner.tolist():
        if label == '_':
            tags.append('O')
        elif '[' not in label and '_' not in label:
            tags.append('S-' + label)
        else:
            match = bracketed.match(label)
            if match is None:
                raise ValueError('unrecognised ner label: %r' % (label,))
            # Same label string as the previous row => same entity continues.
            prefix = 'I-' if label == prev_label else 'B-'
            tags.append(prefix + match.group('type'))
        prev_label = label

    # Second pass: close entities. 'I-' is only emitted while an entity
    # continues, so an entity ends wherever the next tag is not 'I-',
    # whatever that next tag is ('O', 'B-', 'S-' or end of input).
    biose = []
    last = len(tags) - 1
    for i, tag in enumerate(tags):
        continues = i < last and tags[i + 1].startswith('I-')
        if not continues:
            if tag.startswith('I-'):
                tag = 'E-' + tag[2:]
            elif tag.startswith('B-'):
                # Entity opened and closed on the same token.
                tag = 'S-' + tag[2:]
        biose.append(tag)

    return biose

In [189]:
def read_tsv3(path, add_biose=True):
    """Read one tab-separated annotation file and add derived NER columns.

    Parameters
    ----------
    path : str
        Path of the TSV file to read.
    add_biose : bool, default True
        When true, add a ``biose`` column computed by ``get_biose``.
        When false, that column is omitted.

    Returns
    -------
    pandas.DataFrame
        The columns listed in ``names`` below, plus:

        * ``ner_type`` - the part of ``ner`` before the first ``[``
        * ``is_ner``   - ``True`` where ``ner`` is not ``'_'``
        * ``biose``    - only when ``add_biose`` is true
    """
    names = ['sent_tok_num', 'tok_offset', 'token',
             'FEAT_gender', 'FEAT_number', 'FEAT_case', 'FEAT_degree', 'FEAT_transitivity', 
             'FEAT_tense', 'FEAT_mood', 'FEAT_voice', 'FEAT_definiteness', 'FEAT_value', 'FEAT_person', 'FEAT_aspect',
             'pos',
             'ner',
             'lemma',
             'surface_form',
             'dep_type', 'dep_flavor', 'dep_lex_morph_pos', 'dep_arc', 'EXTRA'
            ]
    # skiprows=10 drops the first ten lines of the file (presumably the format
    # header - TODO confirm); lines starting with '#' are treated as comments.
    # quoting=3 is csv.QUOTE_NONE, so quote characters in tokens stay literal.
    # '_' is deliberately NOT declared as an NA value: it is compared against
    # below and in get_biose.
    df = (pd.read_csv(path, sep='\t', skiprows=10, header=None, names=names,
                      comment='#', skip_blank_lines=True, quoting=3)
          .assign(ner_type=lambda x: x.ner.str.split('[', expand=True).iloc[:, 0])
          .assign(is_ner=lambda x: x.ner != '_')
         )
    # Honour the flag; it used to be accepted but ignored.
    if add_biose:
        df = df.assign(biose=get_biose(df))
    return df
# Load one annotator's copy of the first file as a sanity check of the reader
# and show the first rows, including the derived ner_type / is_ner / biose columns.
df = read_tsv3(os.path.join(ANNO_FOLDER, G1, 'anatb', F1))
df.head(60)

Unnamed: 0,sent_tok_num,tok_offset,token,FEAT_gender,FEAT_number,FEAT_case,FEAT_degree,FEAT_transitivity,FEAT_tense,FEAT_mood,...,lemma,surface_form,dep_type,dep_flavor,dep_lex_morph_pos,dep_arc,EXTRA,ner_type,is_ner,biose
0,1-1,0-5,עשרות,Fem,Plur,*,*,*,*,*,...,עשרות,_,nummod,basic,1-2,,,_,False,O
1,1-2,6-11,אנשים,Masc,Plur,*,*,*,*,*,...,איש,_,nsubj,basic,1-3,,,_,False,O
2,1-3,12-18,מגיעים,Masc,Plur,*,*,*,*,*,...,הגיע,_,root,basic,1-3,,,_,False,O
3,1-4,19-20,מ,_,_,_,_,_,_,_,...,מ,מתאילנד[65],case,basic,1-5,,,_,False,O
4,1-5,21-27,תאילנד,_,_,_,_,_,_,_,...,תאילנד,מתאילנד[65],obl,basic,1-3,,,GPE,True,S-GPE
5,1-6,28-29,ל,_,_,_,_,_,_,_,...,ל,לישראל[66],case,basic,1-7,,,_,False,O
6,1-7,30-35,ישראל,_,_,_,_,_,_,_,...,ישראל,לישראל[66],obl,basic,1-3,,,GPE,True,S-GPE
7,1-8,36-38,כש,*,*,Tem,*,*,*,*,...,כש,כשהם[67],mark,basic,1-10,,,_,False,O
8,1-9,39-41,הם,Masc,Plur,*,*,*,*,*,...,הוא,כשהם[67],nsubj,basic,1-10,,,_,False,O
9,1-10,42-48,נרשמים,Masc,Plur,*,*,*,*,*,...,נרשם,_,advcl,basic,1-3,,,_,False,O


In [190]:
# Token counts per entity type; '_' is the label of tokens outside any entity.
df['ner_type'].value_counts()

_      2529
ORG      82
GPE      50
PER      45
LOC      42
EVE      20
FAC       6
ANG       1
Name: ner_type, dtype: int64

## Cohen's Kappa

In [191]:
# annos[group][file][annotator] -> DataFrame returned by read_tsv3 for that
# annotator's copy of the file.
annos = {
    g: {
        f: {m: read_tsv3(os.path.join(ANNO_FOLDER, g, m, f)) for m in members}
        for f in FILES
    }
    for g, members in groups.items()
}

In [192]:
from sklearn.metrics import cohen_kappa_score
from itertools import combinations 

In [193]:
# Token-level agreement needs both annotators to have the same number of rows.
tuple(annos[G1][F1][groups[G1][i]]['ner_type'].shape for i in (0, 1))

((2775,), (2775,))

In [194]:
# Entity types used by the first annotator of group 1 in the first file.
annos[G1][F1][groups[G1][0]]['ner_type'].unique()

array(['_', 'GPE', 'ORG', 'PER', 'EVE', 'FAC', 'LOC', 'ANG'], dtype=object)

In [195]:
# Entity types used by the second annotator of group 1 in the first file.
annos[G1][F1][groups[G1][1]]['ner_type'].unique()

array(['_', 'GPE', 'ORG', 'PER', 'EVE', 'LOC', 'FAC'], dtype=object)

In [196]:
# Kappa over per-token entity types for the first two annotators of group 1.
cohen_kappa_score(
    annos[G1][F1][groups[G1][0]]['ner_type'],
    annos[G1][F1][groups[G1][1]]['ner_type'],
)

0.8976390901867242

In [197]:
# Same pair, but only on the binary entity / non-entity decision per token.
cohen_kappa_score(
    annos[G1][F1][groups[G1][0]]['is_ner'],
    annos[G1][F1][groups[G1][1]]['is_ner'],
)

0.9158688367640019

In [198]:
# One row per (group, file, annotator pair) with two kappa scores:
#   type_sc - agreement on the per-token entity type
#   bool_sc - agreement on the per-token entity / non-entity flag
scores = pd.DataFrame(
    [
        (
            g, f, a, b,
            cohen_kappa_score(annos[g][f][a].ner_type, annos[g][f][b].ner_type),
            cohen_kappa_score(annos[g][f][a].is_ner, annos[g][f][b].is_ner),
        )
        for g in groups
        for f in FILES
        for a, b in combinations(groups[g], 2)
    ],
    columns=['group', 'file', 'a', 'b', 'type_sc', 'bool_sc'],
)
scores.head()

Unnamed: 0,group,file,a,b,type_sc,bool_sc
0,group1,dev_1-100.tsv,anatb,dafnaa,0.897639,0.915869
1,group1,dev_1-100.tsv,anatb,shayp,0.883459,0.905985
2,group1,dev_1-100.tsv,anatb,sinair,0.809746,0.85933
3,group1,dev_1-100.tsv,anatb,tzufa,0.773987,0.820183
4,group1,dev_1-100.tsv,anatb,vikab,0.805374,0.868539


In [199]:
# Mean pairwise kappa per (group, file). Pairs involving annotator 'nuritg'
# are left out (the reason is not recorded in this notebook - TODO document).
# The score columns are selected explicitly: 'a' and 'b' hold annotator names,
# and GroupBy.mean() raises on string columns in pandas >= 2.0.
(scores[(scores.a != 'nuritg') & (scores.b != 'nuritg')]
 .groupby(['group', 'file'])[['type_sc', 'bool_sc']]
 .mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,type_sc,bool_sc
group,file,Unnamed: 2_level_1,Unnamed: 3_level_1
group1,dev_1-100.tsv,0.804903,0.85738
group1,dev_101-200.tsv,0.89175,0.907298
group1,dev_201-300.tsv,0.815467,0.845872
group1,dev_301-400.tsv,0.877136,0.914304
group2,dev_1-100.tsv,0.729188,0.791572
group2,dev_101-200.tsv,0.76013,0.817839
group2,dev_201-300.tsv,0.781875,0.814547
group2,dev_301-400.tsv,0.800863,0.839371


## conlleval
https://github.com/sighsmile/conlleval

In [200]:

from conlleval import evaluate

# Span-level comparison of the first and third annotators of group 1 on the
# first file, with the per-type breakdown printed.
evaluate(
    list(annos[G1][F1][groups[G1][0]]['biose']),
    list(annos[G1][F1][groups[G1][2]]['biose']),
    verbose=True,
)

processed 2775 tokens with 110 phrases; found: 111 phrases; correct: 94.
accuracy:  85.37%; (non-O)
accuracy:  97.98%; precision:  84.68%; recall:  85.45%; FB1:  85.07
              ANG: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              EVE: precision: 100.00%; recall:  20.00%; FB1:  33.33  1
              FAC: precision:  40.00%; recall: 100.00%; FB1:  57.14  5
              GPE: precision:  85.71%; recall:  90.00%; FB1:  87.80  42
              LOC: precision: 100.00%; recall:  83.33%; FB1:  90.91  10
              ORG: precision:  75.00%; recall:  80.77%; FB1:  77.78  28
              PER: precision:  96.00%; recall: 100.00%; FB1:  97.96  25


(84.68468468468468, 85.45454545454545, 85.06787330316742)

In [201]:
# Same comparison for the first two annotators, keeping only the overall
# (precision, recall, F1) triple returned by evaluate().
prec, rec, f1 = evaluate(
    list(annos[G1][F1][groups[G1][0]]['biose']),
    list(annos[G1][F1][groups[G1][1]]['biose']),
    verbose=False,
)
(prec, rec, f1)

(82.4074074074074, 80.9090909090909, 81.651376146789)

In [202]:
# One row per (group, file, annotator pair) holding the overall
# precision / recall / F1 triple returned by conlleval's evaluate().
conll_scores = pd.DataFrame(
    [
        (
            g, f, a, b,
            *evaluate(annos[g][f][a].biose.tolist(),
                      annos[g][f][b].biose.tolist(),
                      verbose=False),
        )
        for g in groups
        for f in FILES
        for a, b in combinations(groups[g], 2)
    ],
    columns=['group', 'file', 'a', 'b', 'prec', 'rec', 'f1'],
)
conll_scores.head()

Unnamed: 0,group,file,a,b,prec,rec,f1
0,group1,dev_1-100.tsv,anatb,dafnaa,82.407407,80.909091,81.651376
1,group1,dev_1-100.tsv,anatb,shayp,84.684685,85.454545,85.067873
2,group1,dev_1-100.tsv,anatb,sinair,71.794872,76.363636,74.008811
3,group1,dev_1-100.tsv,anatb,tzufa,68.867925,66.363636,67.592593
4,group1,dev_1-100.tsv,anatb,vikab,71.551724,75.454545,73.451327


In [204]:
# Mean pairwise precision / recall / F1 per (group, file). Pairs involving
# annotator 'nuritg' are left out (reason not recorded here - TODO document).
# The metric columns are selected explicitly: 'a' and 'b' hold annotator
# names, and GroupBy.mean() raises on string columns in pandas >= 2.0.
(conll_scores[(conll_scores.a != 'nuritg') & (conll_scores.b != 'nuritg')]
 .groupby(['group', 'file'])[['prec', 'rec', 'f1']]
 .mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,prec,rec,f1
group,file,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
group1,dev_1-100.tsv,72.358024,73.731377,72.990749
group1,dev_101-200.tsv,85.129476,83.252584,84.161336
group1,dev_201-300.tsv,73.222315,66.522738,69.235302
group1,dev_301-400.tsv,86.7522,83.237341,84.86625
group2,dev_1-100.tsv,65.136054,68.154908,66.362754
group2,dev_101-200.tsv,69.013647,67.789579,67.895156
group2,dev_201-300.tsv,64.394644,67.102883,65.613404
group2,dev_301-400.tsv,81.371001,82.095879,81.722892


## TODO

1. Nested tags: evaluate agreement three ways - only the innermost (leaf) entities, only the outermost (top-level) entities, and all entities.