# Process the curation results from all three curators

See the corresponding [Thinklab discussion](http://doi.org/10.15363/thinklab.d95#10).

In [1]:
import itertools

import pandas
import sklearn.metrics

In [2]:
# Column groups reused by the cells below.
id_vars = ['doid_id', 'drugbank_id']    # identifier columns
name_vars = ['disease', 'drug']         # human-readable name columns
initials = ['AJG', 'CSH', 'PK']         # one classification column per curator
categories = ['DM', 'SYM', 'NOT']       # classification labels, in display order
# Each curator also has a free-text notes column named "<initials>_notes".
notes = ['{}_notes'.format(curator) for curator in initials]

## Read and process PK curation

In [3]:
# Load the 'curation' sheet of PK's workbook (it carries the AJG, CSH and PK
# columns used below) and save a tab-separated copy next to it.
# `sheet_name` is the current spelling of the keyword; the older `sheetname`
# was deprecated in pandas 0.21 and removed in 1.0.
pk_df = pandas.read_excel('pk/template-pk final.xlsx', sheet_name='curation')
pk_df.to_csv('pk/curation-PK.tsv', index=False, sep='\t')

res_df = pk_df.copy()
# The identifier is the last path segment of each URL; the disease URL
# percent-encodes the colon, so decode '%3A' back to ':'.
res_df['drugbank_id'] = res_df.drug_url.map(lambda x: x.rsplit('/', 1)[-1])
res_df['doid_id'] = res_df.disease_url.map(lambda x: x.rsplit('/', 1)[-1].replace('%3A', ':'))
res_df = res_df[name_vars + initials + notes + id_vars]
res_df = res_df.sort_values(['disease', 'drug'])


def _majority_call(calls):
    """Return the classification given by at least two curators, else None.

    `calls` is one row of curator classifications. Missing values are ignored
    (`value_counts` drops them), and a row where no value occurs twice has no
    majority.
    """
    counts = calls.value_counts()  # sorted by descending frequency
    if not counts.empty and counts.iloc[0] >= 2:
        return counts.index[0]
    return None


# A row-wise DataFrame.mode returns a DataFrame with one column per tied
# value, which cannot be assigned to a single column when all three curators
# disagree. Vote explicitly so that such rows get a null majority instead.
res_df['majority'] = res_df[initials].apply(_majority_call, axis='columns')
res_df.to_csv('results-three-curators.tsv', index=False, sep='\t')
res_df.tail(2)

Unnamed: 0,disease,drug,AJG,CSH,PK,AJG_notes,CSH_notes,PK_notes,doid_id,drugbank_id,majority
1042,vitiligo,Methoxsalen,DM,DM,DM,,,,DOID:12306,DB00553,DM
273,vitiligo,Monobenzone,DM,SYM,SYM,,,,DOID:12306,DB00600,SYM


In [4]:
# Rows where a majority call exists but PK's own call differs from it
has_majority = res_df.majority.notnull()
pk_differs = res_df.PK != res_df.majority
pk_minority_df = res_df[has_majority & pk_differs]
pk_minority_df.to_csv('results-PK-changes.tsv', index=False, sep='\t')
len(pk_minority_df)

124

In [5]:
# Rows with no majority call, i.e. all three curators disagree
no_majority = res_df['majority'].isnull()
disagree_df = res_df[no_majority]
disagree_df.to_csv('results-threeway-disagreements.tsv', index=False, sep='\t')
len(disagree_df)

34

## Curator agreement

In [6]:
# Cohen's kappa for every unordered pair of curators
rows = [
    (a, b, sklearn.metrics.cohen_kappa_score(pk_df[a], pk_df[b]))
    for a, b in itertools.combinations(initials, 2)
]
kappa_df = pandas.DataFrame(
    rows,
    columns=['curator_a', 'curator_b', 'kappa'],
)
kappa_df

Unnamed: 0,curator_a,curator_b,kappa
0,AJG,CSH,0.498619
1,AJG,PK,0.514723
2,CSH,PK,0.651177


## Resource breakdown by PK curation

In [7]:
# Load the indication table, rename its disease key to match res_df, and keep
# one row per (drug, disease, resource) combination.
source_df = (
    pandas.read_table('../data/indications-slim-verbose.tsv')
    .rename(columns={'do_slim_id': 'doid_id'})
    [['drugbank_id', 'doid_id', 'resource']]
    .drop_duplicates()
)
# Reshape to long format: one row per indication per curator.
melt_df = pandas.melt(
    res_df,
    id_vars=id_vars + name_vars,
    value_vars=initials,
    var_name='curator',
    value_name='classification',
)
# Attach the resources to each curator call; the merge uses the columns the
# two frames have in common.
source_df = melt_df.merge(source_df)
#source_df.groupby(['curator', 'classification', 'resource']).apply(lambda df: pandas.Series({'count': len(df)})).reset_index()
source_df.head(2)

Unnamed: 0,doid_id,drugbank_id,disease,drug,curator,classification,resource
0,DOID:10652,DB00245,Alzheimer's disease,Benzatropine,AJG,SYM,predict
1,DOID:10652,DB00245,Alzheimer's disease,Benzatropine,CSH,SYM,predict


In [8]:
# Count PK's classifications per resource, with columns in `categories` order
pk_source_df = source_df.query("curator == 'PK'")
source_table = pandas.crosstab(
    index=pk_source_df.resource,
    columns=pk_source_df.classification,
)[categories]
source_table.sort_values('DM', ascending=False)

classification,DM,SYM,NOT
resource,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
medi_hps,532,168,93
predict,346,158,76
ehrlink,205,163,95
labeledin,183,72,22
medi_lps,174,50,45


In [9]:
# Total number of PK-classified indications per resource (row sums)
source_table.sum(axis='columns')

resource
ehrlink      463
labeledin    277
medi_hps     793
medi_lps     269
predict      580
dtype: int64

In [10]:
# Express each resource's row as percentages of that row's total
resource_totals = source_table.sum(axis='columns')
percent_table = source_table.divide(resource_totals, axis='rows') * 100
percent_table.sort_values('DM', ascending=False).round(1)

Unnamed: 0_level_0,DM,SYM,NOT
resource,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
medi_hps,67.1,21.2,11.7
labeledin,66.1,26.0,7.9
medi_lps,64.7,18.6,16.7
predict,59.7,27.2,13.1
ehrlink,44.3,35.2,20.5
