# Process the curation results from all three curators

See the corresponding [Thinklab discussion](http://doi.org/10.15363/thinklab.d95#10).

In [1]:
import itertools

import pandas
import sklearn.metrics

In [2]:
# Column groups shared by the cells below.
id_vars = ['doid_id', 'drugbank_id']  # identifier columns
name_vars = ['disease', 'drug']  # human-readable name columns
initials = ['AJG', 'CSH', 'PK']  # one column per curator, keyed by initials
categories = ['DM', 'SYM', 'NOT']  # the category labels used in the curator columns
# Each curator also has a free-text notes column named '<initials>_notes'.
notes = ['{}_notes'.format(curator) for curator in initials]

## Read and process PK curation

In [3]:
# Load the final curation sheet and keep a plain-text (TSV) copy of it.
# `sheet_name` replaces the `sheetname` keyword, which pandas deprecated in
# 0.21 and removed in 1.0 (the old spelling raises TypeError there).
pk_df = pandas.read_excel('pk/template-pk final.xlsx', sheet_name='curation')
pk_df.to_csv('pk/curation-PK.tsv', index=False, sep='\t')


def _url_tail(url):
    """Return the part of `url` that follows its last '/'."""
    return url.rsplit('/', 1)[-1]


def _majority_call(calls):
    """Return the value shared by at least two entries of `calls`, else None.

    `calls` is one row of curator columns (a pandas Series). Missing entries
    are ignored by `value_counts`. With three curators at most one value can
    occur twice, so the result is unambiguous.
    """
    counts = calls.value_counts()  # sorted by descending frequency
    if len(counts) > 0 and counts.iloc[0] >= 2:
        return counts.index[0]
    return None


res_df = pk_df.copy()
# Identifiers are the last path component of each URL; the disease URL
# percent-encodes the colon of the DOID identifier as '%3A'.
res_df['drugbank_id'] = res_df.drug_url.map(_url_tail)
res_df['doid_id'] = res_df.disease_url.map(lambda x: _url_tail(x).replace('%3A', ':'))
res_df = res_df[name_vars + initials + notes + id_vars]
res_df = res_df.sort_values(['disease', 'drug'])
# Majority vote across curators, left missing when all three disagree.
# This is computed explicitly rather than with `DataFrame.mode('columns')`:
# newer pandas releases return every tied value as its own column for a
# three-way split, which cannot be assigned to a single column and would no
# longer mark those rows as null.
res_df['majority'] = [_majority_call(row) for _, row in res_df[initials].iterrows()]
res_df.to_csv('results-three-curators.tsv', index=False, sep='\t')
res_df.tail(2)

Unnamed: 0,disease,drug,AJG,CSH,PK,AJG_notes,CSH_notes,PK_notes,doid_id,drugbank_id,majority
1042,vitiligo,Methoxsalen,DM,DM,DM,,,,DOID:12306,DB00553,DM
273,vitiligo,Monobenzone,DM,SYM,SYM,,,,DOID:12306,DB00600,SYM


In [4]:
# Rows where a majority call exists but PK's call differs from it.
has_majority = res_df['majority'].notnull()
pk_differs = res_df['PK'] != res_df['majority']
pk_minority_df = res_df[has_majority & pk_differs]
pk_minority_df.to_csv('results-PK-changes.tsv', sep='\t', index=False)
len(pk_minority_df)

124

In [5]:
# Rows without a majority call, i.e. no two curators chose the same category
no_majority = res_df['majority'].isnull()
disagree_df = res_df[no_majority]
disagree_df.to_csv('results-threeway-disagreements.tsv', sep='\t', index=False)
len(disagree_df)

34

## Curator agreement

In [6]:
# Pairwise inter-curator agreement: Cohen's kappa for every pair of curators
rows = [
    (first, second, sklearn.metrics.cohen_kappa_score(pk_df[first], pk_df[second]))
    for first, second in itertools.combinations(initials, 2)
]
kappa_df = pandas.DataFrame(rows, columns=['curator_a', 'curator_b', 'kappa'])
kappa_df

Unnamed: 0,curator_a,curator_b,kappa
0,AJG,CSH,0.498619
1,AJG,PK,0.514723
2,CSH,PK,0.651177
