In [1]:
import sys
from pathlib import Path
import os
# Locate the directory one level above the current working directory and
# append it to the import path; `parent` is also the base for the data paths below.
cwd = os.getcwd()
parent_dir = Path(cwd).parents[0]
parent = str(parent_dir)
sys.path.append(parent)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the cleaned ICTRP CSV and discard its stored 'index' column.
ictrp_csv = parent + '/data/cleaned_ictrp_29June2020.csv'
df = pd.read_csv(ictrp_csv).drop(columns='index')

In [4]:
# Parse registration dates into datetimes so they can be compared with timestamps.
registration_dates = pd.to_datetime(df.date_registration)
df['date_registration'] = registration_dates

In [5]:
# Exclusion logic: one boolean mask per inclusion criterion (True = keep the trial).

# Keep only interventional / prevention studies.
int_prev = df.study_type.isin(['Interventional', 'Prevention'])

# Keep only trials registered on or after 1 January 2020.
in_2020 = (df.date_registration >= pd.Timestamp(2020, 1, 1))

# At the moment, this deals with withdrawn trials from the ChiCTR. Data from other registries doesn't
# reliably make it to the ICTRP. We will exclude withdrawn trials from ClinicalTrials.gov
# when we join that in below.
#
# NOTE: despite its name, `withdrawn` is True for trials that are NOT flagged as withdrawn,
# so it can be AND-ed directly with the other masks.
# na=False makes a missing title count as "not flagged" instead of putting NaN into the mask,
# which would break the `~` / `&` operations.
title_flags_withdrawal = (
    df.public_title.str.contains('Cancelled', na=False)
    | df.public_title.str.contains('Retracted due to', na=False)
)
withdrawn = ~title_flags_withdrawal

In [6]:
# A trial is included (1) only when it passes every mask above; otherwise 0.
meets_all_criteria = int_prev & in_2020 & withdrawn
df['included'] = np.where(meets_all_criteria, 1, 0)

In [8]:
# Load the cleaned registry data that is joined onto the ICTRP records below.
registry_csv = parent + '/data/registry_data/registry_data_clean.csv'
registry_data = pd.read_csv(registry_csv)

In [9]:
# Only these registry columns are needed for the join.
reg_cols = ['trial_id', 'trial_status', 'pcd', 'scd', 'relevent_comp_date', 'tabular_results',
            'potential_other_results']

# Left join keeps every row of `df`; the registry-side key column is redundant
# once the join is done, so it is dropped.
registry_subset = registry_data[reg_cols]
df_reg_merge = df.merge(registry_subset, how='left', left_on='trialid', right_on='trial_id')
df_reg_merge = df_reg_merge.drop(columns='trial_id')

# Rows without a registry match have NaN in the result flags: treat those as 0
# and store the flags as integers.
for flag_col in ('tabular_results', 'potential_other_results'):
    df_reg_merge[flag_col] = df_reg_merge[flag_col].fillna(0).astype(int)

In [10]:
# Exclude more withdrawn trials, this time based on the registry's trial status.
is_withdrawn = df_reg_merge.trial_status == 'Withdrawn'
df_reg_merge.loc[is_withdrawn, 'included'] = 0

# The status column was only needed for this exclusion step.
df_reg_merge = df_reg_merge.drop(columns='trial_status')

In [11]:
# Load the screening hit results.
screening_csv = parent + '/data/screening_hit_results.csv'
auto_hits = pd.read_csv(screening_csv)

In [12]:
def group_rules(grp):
    """Collect the distinct values of ``grp`` in first-seen order.

    Parameters
    ----------
    grp : iterable
        Values belonging to one group (e.g. one column of a groupby group).

    Returns
    -------
    list or float
        The distinct values as a list, or ``np.nan`` when ``grp`` is empty.
    """
    distinct = []
    for value in grp:
        # List membership is used (not a set) so first-seen order is kept.
        if value not in distinct:
            distinct.append(value)
    return distinct if distinct else np.nan

def max_list_size(column):
    """Return the length of the longest entry in ``column``.

    Parameters
    ----------
    column : iterable
        Entries that support ``len()`` (typically lists). Missing entries
        (``None`` or a float NaN) are skipped.

    Returns
    -------
    int
        The largest ``len()`` among the entries, or 0 when ``column`` is empty
        or holds only missing entries.
    """
    longest = 0
    for entry in column:
        # group_rules() returns np.nan for an empty group; len() on that would
        # raise TypeError, so missing entries are skipped.
        if entry is None or (isinstance(entry, float) and entry != entry):
            continue
        longest = max(longest, len(entry))
    return longest

In [13]:
# Collapse the screening hits to one row per `trn_1` value; every other column
# becomes the list of its distinct values within that group.
group_auto = auto_hits.groupby('trn_1', as_index=False).agg(group_rules)

# Columns to keep, and the names they take in the final dataset (same order).
source_cols = ['trn_1', 'trn_2', 'id', 'doi', 'results_pub_type',
               'completion_date', 'publication_date']
rename = ['hit_tid', 'hit_tid2', 'auto_id', 'doi', 'results_pub_type', 'pub_completion_date', 'publication_date']

filtered = (
    group_auto[source_cols]
    .reset_index(drop=True)
    .rename(columns=dict(zip(source_cols, rename)))
)

In [14]:
# Spread each list-valued column into numbered columns (<name>_1, <name>_2, ...),
# one per list position, then remove the original list column.
for name in rename[2:]:
    values = filtered[name].tolist()
    width = max_list_size(values)
    cols = [(name + '_{}').format(position) for position in range(1, width + 1)]
    filtered[cols] = pd.DataFrame(values, index=filtered.index)
    filtered = filtered.drop(columns=name)

# hit_tid2 is still list-valued after the groupby; keep only its first element.
filtered['hit_tid2'] = filtered['hit_tid2'].str[0]

In [15]:
# Attach the screening hits to every trial (left join), then drop the
# redundant join key coming from the hits table.
df_final = (
    df_reg_merge
    .merge(filtered, how='left', left_on='trialid', right_on='hit_tid')
    .drop(columns='hit_tid')
)

In [16]:
#Check for trials that are in our results but not in the ICTRP dataset

ictrp_ids = set(df_reg_merge.trialid.tolist())
hit_ids = set(filtered.hit_tid.tolist())

# Displayed as the cell output: hit IDs with no matching ICTRP record.
hit_ids - ictrp_ids

{'NCT04323527'}

In [17]:
# Preview the first five rows of the merged dataset.
df_final.head(n=5)

Unnamed: 0,trialid,source_register,date_registration,date_enrollement,retrospective_registration,normed_spon_names,recruitment_status,phase,study_type,countries,...,hit_tid2,auto_id_1,auto_id_2,doi_1,doi_2,results_pub_type_1,results_pub_type_2,pub_completion_date_1,publication_date_1,publication_date_2
0,NCT04246242,ClinicalTrials.gov,2020-01-27,2020-01-25,True,Xiangya Hospital of Central South University,Not Recruiting,Phase 4,Interventional,No Country Given,...,,,,,,,,,,
1,NCT04252885,ClinicalTrials.gov,2020-01-30,2020-01-28,True,Guangzhou 8th People's Hospital,Recruiting,Phase 4,Interventional,China,...,,9hknw4ws,ytqjxzaa,http://doi.org/10.1101/2020.03.19.20038984,http://doi.org/10.1016/j.medj.2020.04.001,full_results_preprint,full_results_journal_article,,15/04/2020,04/05/2020
2,NCT04255940,ClinicalTrials.gov,2020-02-03,2020-01-20,True,Qilu Hospital of Shandong University,Recruiting,Not Applicable,Observational,China,...,,,,,,,,,,
3,NCT04260308,ClinicalTrials.gov,2020-02-03,2020-02-03,False,Huazhong University of Science and Technology,Recruiting,Not Applicable,Observational,China,...,,,,,,,,,,
4,NCT04260594,ClinicalTrials.gov,2020-02-06,2020-02-07,False,Jieming QU,Not Recruiting,Phase 4,Interventional,No Country Given,...,,,,,,,,,,


In [18]:
# Write the merged dataset to disk (the DataFrame index is written as the first column).
final_csv = parent + '/data/final_dataset.csv'
df_final.to_csv(final_csv)