In [1]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score

In [2]:
import sys
from pathlib import Path
import os
#Directory the notebook kernel is currently running in
cwd = os.getcwd()
#One level above the working directory; the data paths used further down are built from this
parent = str(Path(cwd).parents[0])
#Make that directory importable as well
sys.path.append(parent)

In [3]:
def reco_checker(df, field1, field2):
    """Return the rows of ``df`` where two binary-coded columns disagree.

    A row counts as a disagreement only when one column equals 1 and the
    other equals 0 (in either direction). Rows holding any other value in
    either column, including missing values, are never returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    field1, field2 : str
        Names of the two columns to compare.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (all columns, original index) with a 1/0 or
        0/1 mismatch between ``field1`` and ``field2``.
    """
    first_yes_second_no = (df[field1] == 1) & (df[field2] == 0)
    first_no_second_yes = (df[field1] == 0) & (df[field2] == 1)
    return df[first_yes_second_no | first_no_second_yes]

In [4]:
#Dual-coding sheet, read from data/dual_coding/ under the parent directory
reco_data = pd.read_csv(parent + '/data/dual_coding/' + 'dual_coding.csv')
#Final analysis dataset, read from data/final_dataset/ under the parent directory
study_data = pd.read_csv(parent + '/data/final_dataset/' + 'analysis_df.csv')

In [5]:
#These were all the trials that were assigned for dual coding and were excluded. Uncomment to view.
#We will exclude these from the rest of the analysis

#reco_data[reco_data.euctr_res_nd.isna()]

In [6]:
#Combining the reconciliation data with the final data
#Rows with no euctr_res_nd value are dropped first; the left join keeps every remaining dual-coded trial

df = (
    reco_data[reco_data.euctr_res_nd.notnull()]
    .merge(
        study_data[['euctr_id', 'euctr_results', 'nct_id', 'isrctn_id', 'journal_results_inc']],
        how='left',
        left_on='trial_id',
        right_on='euctr_id',
    )
    .reset_index(drop=True)
)

In [7]:
#The denominators here are out of 241 after exclusions.
#Row count of the merged frame
df.shape[0]

241

In [23]:
#Column names of the merged frame, for reference in the cells below
df.columns

Index(['trial_id', 'second_coder', 'euctr_res_nd', 'euctr_res_2nd', 'nct_nd',
       'nct_2nd', 'nct_match', 'isrctn_nd', 'isrctn_2nd', 'pub_nd', 'pub_2nd',
       'pub_match', 'pub_date_match', 'pub_reg_nd', 'pub_reg_2nd',
       'pub_reg_match', 'euctr_id', 'euctr_results', 'nct_id', 'isrctn_id',
       'journal_results_inc'],
      dtype='object')

In [8]:
#Did we extract the same information about EUCTR results?

In [9]:
#Rows where euctr_res_nd and euctr_res_2nd disagree (1 vs 0)
reco_checker(df, field1='euctr_res_nd', field2='euctr_res_2nd')

Unnamed: 0,trial_id,second_coder,euctr_res_nd,euctr_res_2nd,nct_nd,nct_2nd,nct_match,isrctn_nd,isrctn_2nd,pub_nd,...,pub_match,pub_date_match,pub_reg_nd,pub_reg_2nd,pub_reg_match,euctr_id,euctr_results,nct_id,isrctn_id,journal_results_inc
110,2009-016360-37,HD,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2009-016360-37,No,,,0.0


In [10]:
#Percent agreement: 1 minus the share of rows flagged as a disagreement

1 - (reco_checker(df, 'euctr_res_nd', 'euctr_res_2nd').shape[0] / df.shape[0])

0.995850622406639

In [11]:
#Cohen's kappa between the euctr_res_nd and euctr_res_2nd codes

cohen_kappa_score(df['euctr_res_nd'], df['euctr_res_2nd'])

0.9916548356937567

In [13]:
#Now did we find a ClinicalTrials.gov cross registration?

In [14]:
#Rows where nct_nd and nct_2nd disagree, trimmed to the columns needed to review them
reco_checker(df, 'nct_nd', 'nct_2nd').loc[:, ['trial_id', 'nct_nd', 'nct_2nd', 'nct_id']]

Unnamed: 0,trial_id,nct_nd,nct_2nd,nct_id
105,2007-000212-89,1.0,0.0,NCT00633347
107,2007-005702-49,1.0,0.0,NCT00713310
127,2013-002165-19,1.0,0.0,NCT01974739
128,2013-002997-33,1.0,0.0,NCT01944839
140,2004-004296-11,1.0,0.0,NCT00088192
162,2006-005380-25,1.0,0.0,NCT00929682
231,2014-001786-26,1.0,0.0,NCT02829177


In [15]:
#Percent agreement: 1 minus the share of rows flagged as a disagreement
1 - (reco_checker(df, 'nct_nd', 'nct_2nd').shape[0] / df.shape[0])

0.970954356846473

In [12]:
#Cohen's kappa between the nct_nd and nct_2nd codes
cohen_kappa_score(df['nct_nd'], df['nct_2nd'])

0.9334962746875862

In [16]:
#When we both found a ClinicalTrials.gov registration, did we find the same one?

In [17]:
#nct_match tallies, restricted to rows where both nct_nd and nct_2nd equal 1
df.loc[df['nct_nd'].eq(1) & df['nct_2nd'].eq(1), 'nct_match'].value_counts()

1.0    158
0.0      2
Name: nct_match, dtype: int64

In [41]:
#Share of rows with nct_match == 1 among rows where both nct_nd and nct_2nd equal 1.
#Computed from df rather than typed in from the previous cell's output, so it cannot
#go stale if the underlying data changes. Missing nct_match values are left out of the
#denominator, exactly as value_counts() leaves them out of its tallies.
df[(df.nct_nd == 1) & (df.nct_2nd == 1)].nct_match.value_counts(normalize=True).get(1.0, 0.0)

0.9875

In [19]:
#What about finding the same ISRCTN registration

In [20]:
#Rows where isrctn_nd and isrctn_2nd disagree (1 vs 0)
reco_checker(df, field1='isrctn_nd', field2='isrctn_2nd')

Unnamed: 0,trial_id,second_coder,euctr_res_nd,euctr_res_2nd,nct_nd,nct_2nd,nct_match,isrctn_nd,isrctn_2nd,pub_nd,...,pub_match,pub_date_match,pub_reg_nd,pub_reg_2nd,pub_reg_match,euctr_id,euctr_results,nct_id,isrctn_id,journal_results_inc
183,2008-006135-12,JM,1.0,1.0,0.0,0.0,2.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,2008-006135-12,Yes,,ISRCTN53154834,1.0
193,2009-017842-30,JM,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,...,0.0,0.0,1.0,2.0,2.0,2009-017842-30,Yes,,ISRCTN83567338,1.0
214,2011-005683-21,JM,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,...,0.0,0.0,1.0,2.0,2.0,2011-005683-21,No,,ISRCTN33941607,1.0


In [21]:
#Percent agreement: 1 minus the share of rows flagged as a disagreement
1 - (reco_checker(df, 'isrctn_nd', 'isrctn_2nd').shape[0] / df.shape[0])

0.9875518672199171

In [22]:
#Cohen's kappa between the isrctn_nd and isrctn_2nd codes
cohen_kappa_score(df['isrctn_nd'], df['isrctn_2nd'])

0.8900045641259698

In [None]:
#Now for publications

In [24]:
#Rows where pub_nd and pub_2nd disagree (1 vs 0)
reco_checker(df, field1='pub_nd', field2='pub_2nd')

Unnamed: 0,trial_id,second_coder,euctr_res_nd,euctr_res_2nd,nct_nd,nct_2nd,nct_match,isrctn_nd,isrctn_2nd,pub_nd,...,pub_match,pub_date_match,pub_reg_nd,pub_reg_2nd,pub_reg_match,euctr_id,euctr_results,nct_id,isrctn_id,journal_results_inc
72,2012-001142-18,JS,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,...,0.0,0.0,1.0,2.0,2.0,2012-001142-18,No,,,1.0
74,2012-001809-24,JS,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,2.0,2012-001809-24,No,,,1.0
78,2012-004300-35,JS,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,2.0,2.0,2012-004300-35,No,NCT02376075,,1.0
95,2004-000414-39,HD,1.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,2.0,2004-000414-39,Yes,,,1.0
102,2006-005263-26,HD,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,2.0,2006-005263-26,Yes,NCT00413556,,1.0
105,2007-000212-89,HD,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,2.0,2.0,2007-000212-89,No,NCT00633347,,1.0
107,2007-005702-49,HD,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,2.0,2.0,2007-005702-49,No,NCT00713310,,1.0
116,2011-002472-16,HD,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,2.0,2.0,2011-002472-16,No,NCT01549015,,1.0
119,2011-004559-38,HD,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,2.0,2.0,2011-004559-38,Yes,,ISRCTN62162141,1.0
125,2013-001557-27,HD,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,2.0,2.0,2013-001557-27,Yes,NCT01821118,,1.0


In [25]:
#Percent agreement: 1 minus the share of rows flagged as a disagreement
1 - (reco_checker(df, 'pub_nd', 'pub_2nd').shape[0] / df.shape[0])

0.8464730290456431

In [26]:
#Cohen's kappa between the pub_nd and pub_2nd codes
cohen_kappa_score(df['pub_nd'], df['pub_2nd'])

0.6972875717147028

In [None]:
#Did we find the same publication?

In [27]:
#pub_match tallies, restricted to rows where both pub_nd and pub_2nd equal 1
df.loc[df['pub_nd'].eq(1) & df['pub_2nd'].eq(1), 'pub_match'].value_counts()

1.0    98
0.0    12
Name: pub_match, dtype: int64

In [42]:
#Share of rows with pub_match == 1 among rows where both pub_nd and pub_2nd equal 1.
#Computed from df rather than typed in from the previous cell's output, so it cannot
#go stale if the underlying data changes. Missing pub_match values are left out of the
#denominator, exactly as value_counts() leaves them out of its tallies.
df[(df.pub_nd == 1) & (df.pub_2nd == 1)].pub_match.value_counts(normalize=True).get(1.0, 0.0)

0.8909090909090909

In [None]:
#When we both found the same publication, did the extracted publication date match?

In [34]:
#pub_date_match tallies, restricted to rows where pub_match equals 1
df.loc[df['pub_match'].eq(1), 'pub_date_match'].value_counts()

1.0    67
0.0    31
Name: pub_date_match, dtype: int64

In [43]:
#Share of rows with pub_date_match == 1 among rows where pub_match equals 1.
#Computed from df rather than typed in from the previous cell's output, so it cannot
#go stale if the underlying data changes. Missing pub_date_match values are left out of
#the denominator, exactly as value_counts() leaves them out of its tallies.
df[(df.pub_match == 1)].pub_date_match.value_counts(normalize=True).get(1.0, 0.0)

0.6836734693877551

In [None]:
#When we both found the same publication, did the extracted trial ID match?

In [38]:
#pub_reg_match tallies, restricted to rows where pub_match equals 1
df.loc[df['pub_match'].eq(1), 'pub_reg_match'].value_counts()

1.0    86
0.0    12
Name: pub_reg_match, dtype: int64

In [40]:
#Share of rows with pub_reg_match == 1 among rows where pub_match equals 1.
#Computed from df rather than typed in from the previous cell's output, so it cannot
#go stale if the underlying data changes. Missing pub_reg_match values are left out of
#the denominator, exactly as value_counts() leaves them out of its tallies.
df[(df.pub_match == 1)].pub_reg_match.value_counts(normalize=True).get(1.0, 0.0)

0.8775510204081632