In [1]:
import pandas as pd
import numpy as np
import pathlib

# Directory holding the pipe-delimited source tables read by the cells below
# (relative to the notebook's working directory).
raw = pathlib.Path('../data/raw')

In [2]:
# Columns of the studies table that feed the final feature table.
studies_cols = [
    'nct_id', 'biospec_retention', 'is_ppsd', 'phase',
    'is_unapproved_device', 'is_fda_regulated_device', 'is_fda_regulated_drug',
    'number_of_groups', 'number_of_arms', 'overall_status'
]

# Parse only the columns we keep instead of loading every column and then
# discarding most of them. `usecols` returns columns in file order, so
# re-select with the list to keep the order defined above.
studies = pd.read_csv(raw/'studies.txt', sep='|', usecols=studies_cols)[studies_cols]

# Keep only trials with a final outcome; `overall_status` later becomes the
# `terminated` target.
studies = studies[studies['overall_status'].isin(['COMPLETED', 'TERMINATED'])]

  studies = pd.read_csv(raw/'studies.txt', sep='|')


In [3]:
# Columns of the calculated-values table that feed the final feature table.
calc_vals_cols = [
    'nct_id', 'number_of_facilities', 'has_us_facility',
    'minimum_age_num', 'minimum_age_unit', 'maximum_age_num', 'maximum_age_unit',
    'number_of_primary_outcomes_to_measure', 'number_of_secondary_outcomes_to_measure'
]

# Parse only the needed columns. The trailing .copy() yields an independent
# frame, so the column assignments below are not flagged as writes to a slice
# (SettingWithCopyWarning).
calc_vals = pd.read_csv(
    raw/'calculated_values.txt', sep='|', usecols=calc_vals_cols
)[calc_vals_cols].copy()
# has_us_facility is left untouched in this cell, missing values included.

# Conversion factor from each age unit to years.
scalers = {
    'year':1.0, 'month':1.0/12.0, 'week':7.0/365.25,
    'day':1.0/365.25, 'hour':1.0/365.25/24.0, 'minute':1.0/365.25/24.0/60.0
}


def age_in_years(num, unit):
    """Convert an age limit given as (number, unit) columns into years.

    Parameters
    ----------
    num : pd.Series
        Numeric part of the age limit; NaN where no limit is recorded.
    unit : pd.Series
        Unit of ``num``; every non-missing value must be a key of ``scalers``.
        A missing unit is treated as 'year'.

    Returns
    -------
    pd.Series
        Age in years. Rows with a missing ``num`` receive the mean of the
        limits that are expressed in years.

    Raises
    ------
    KeyError
        If ``unit`` contains a value that is not a key of ``scalers``.
    """
    mean_years = num[unit == 'year'].mean()
    years = unit.fillna('year').apply(lambda u: scalers[u]) * num
    # Impute after converting: the mean is already in years, so it must not be
    # multiplied by the unit factor of a row whose number is missing.
    return years.fillna(mean_years)


# Record which limits were actually present before they are imputed.
calc_vals['has_max_age'] = calc_vals['maximum_age_num'].notna()
calc_vals['has_min_age'] = calc_vals['minimum_age_num'].notna()

calc_vals['max_age'] = age_in_years(calc_vals['maximum_age_num'], calc_vals['maximum_age_unit'])
calc_vals['min_age'] = age_in_years(calc_vals['minimum_age_num'], calc_vals['minimum_age_unit'])

# The raw number/unit pairs are now redundant.
calc_vals = calc_vals.drop(
    columns=['maximum_age_num', 'maximum_age_unit', 'minimum_age_num', 'minimum_age_unit']
)

In [4]:
# Columns of the designs table that feed the final feature table.
design_cols = [
    'nct_id', 'intervention_model', 'observational_model', 'primary_purpose'
]

# Parse only the needed columns; .copy() yields an independent frame so adding
# the `model` column below is not flagged as a write to a slice.
designs = pd.read_csv(raw/'designs.txt', sep='|', usecols=design_cols)[design_cols].copy()

# Single `model` feature: the observational model where one is recorded,
# otherwise the intervention model. Rows with neither stay NaN.
# Vectorised replacement for a per-row iloc loop; same result.
designs['model'] = designs['observational_model'].fillna(designs['intervention_model'])
designs = designs.drop(columns=['observational_model', 'intervention_model'])

In [5]:
# Number of distinct country names listed for each trial.
country_rows = pd.read_csv(raw/'countries.txt', sep='|')
distinct_per_trial = country_rows.groupby('nct_id')['name'].nunique()
countries = distinct_per_trial.reset_index(name='num_countries')

In [6]:
# Agency class of each trial's lead sponsor; collaborator rows are dropped.
sponsor_rows = pd.read_csv(raw/'sponsors.txt', sep='|')
is_lead = sponsor_rows['lead_or_collaborator'] == 'lead'
sponsors = sponsor_rows.loc[is_lead, ['nct_id', 'agency_class']]

In [7]:
# Left-join every per-trial feature table onto the filtered studies, so each
# study row is kept even when a feature table has no match for it.
final_table = studies
for feature_table in (calc_vals, designs, countries, sponsors):
    final_table = final_table.merge(feature_table, how='left', on='nct_id')

# Binary target: True for TERMINATED, False for COMPLETED (the only two
# statuses kept in `studies`); the raw status column is then dropped.
final_table['terminated'] = final_table['overall_status'] == 'TERMINATED'
final_table = final_table.drop(columns='overall_status')

In [8]:
# Column dtypes and non-null counts of the merged table, before missing values
# are encoded.
final_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338720 entries, 0 to 338719
Data columns (total 22 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   nct_id                                   338720 non-null  object 
 1   biospec_retention                        12553 non-null   object 
 2   is_ppsd                                  6 non-null       object 
 3   phase                                    142408 non-null  object 
 4   is_unapproved_device                     2323 non-null    object 
 5   is_fda_regulated_device                  159455 non-null  object 
 6   is_fda_regulated_drug                    159463 non-null  object 
 7   number_of_groups                         46366 non-null   float64
 8   number_of_arms                           248592 non-null  float64
 9   number_of_facilities                     338720 non-null  int64  
 10  has_us_facility                 

In [9]:
# List the distinct values of every column that is not float64/int64, to see
# which categories and missing-value markers need encoding.
numeric_dtypes = (np.float64, np.int64)
for name, column in final_table.items():
    if column.dtype in numeric_dtypes:
        continue
    print(name, ":", column.unique())

nct_id : ['NCT06790056' 'NCT01256333' 'NCT03980301' ... 'NCT02999490' 'NCT03079479'
 'NCT02026063']
biospec_retention : [nan 'SAMPLES_WITH_DNA' 'SAMPLES_WITHOUT_DNA' 'NONE_RETAINED']
is_ppsd : [nan 't']
phase : [nan 'PHASE2' 'PHASE4' 'PHASE2/PHASE3' 'PHASE1' 'PHASE3' 'EARLY_PHASE1'
 'PHASE1/PHASE2']
is_unapproved_device : [nan 't']
is_fda_regulated_device : ['f' nan 't']
is_fda_regulated_drug : ['f' nan 't']
has_us_facility : ['f' 't' nan]
has_max_age : [ True False]
has_min_age : [ True False]
primary_purpose : [nan 'TREATMENT' 'HEALTH_SERVICES_RESEARCH' 'SUPPORTIVE_CARE'
 'BASIC_SCIENCE' 'PREVENTION' 'DIAGNOSTIC' 'OTHER' 'SCREENING' 'ECT'
 'DEVICE_FEASIBILITY']
model : ['COHORT' 'CASE_ONLY' 'PARALLEL' 'SINGLE_GROUP' 'OTHER' 'CROSSOVER'
 'SEQUENTIAL' nan 'CASE_CONTROL' 'FACTORIAL' 'FAMILY_BASED'
 'ECOLOGIC_OR_COMMUNITY' 'CASE_CROSSOVER' 'DEFINED_POPULATION'
 'NATURAL_HISTORY']
agency_class : ['OTHER' 'OTHER_GOV' 'INDUSTRY' 'FED' 'NIH' 'NETWORK' 'INDIV' 'UNKNOWN'
 'AMBIG']
terminated :

In [10]:
# Placeholder written into each categorical column where the value is missing.
# The 't'/'f' flags get 'u' so "unknown" stays a distinct third category.
missing_labels = {
    'biospec_retention': 'UNKNOWN',
    'phase': 'NONE',
    'is_fda_regulated_device': 'u',
    'is_fda_regulated_drug': 'u',
    'has_us_facility': 'u',
    'primary_purpose': 'UNKNOWN',
    'model': 'UNKNOWN',
}
for col, label in missing_labels.items():
    final_table[col] = final_table[col].fillna(label)

# These two columns only ever hold 't' or NaN, so they collapse to booleans
# (missing -> False). Accepting True as well as 't' keeps the cell idempotent:
# a plain `== 't'` on the already-converted boolean column would silently
# reset every flag to False if the cell were run a second time.
for col in ['is_ppsd', 'is_unapproved_device']:
    final_table[col] = final_table[col].isin(['t', True])

In [11]:
# Re-list the distinct values of every non-float64/int64 column to confirm
# that no NaN remains after the encoding step.
numeric_dtypes = (np.float64, np.int64)
for name, column in final_table.items():
    if column.dtype in numeric_dtypes:
        continue
    print(name, ":", column.unique())

nct_id : ['NCT06790056' 'NCT01256333' 'NCT03980301' ... 'NCT02999490' 'NCT03079479'
 'NCT02026063']
biospec_retention : ['UNKNOWN' 'SAMPLES_WITH_DNA' 'SAMPLES_WITHOUT_DNA' 'NONE_RETAINED']
is_ppsd : [False  True]
phase : ['NONE' 'PHASE2' 'PHASE4' 'PHASE2/PHASE3' 'PHASE1' 'PHASE3' 'EARLY_PHASE1'
 'PHASE1/PHASE2']
is_unapproved_device : [False  True]
is_fda_regulated_device : ['f' 'u' 't']
is_fda_regulated_drug : ['f' 'u' 't']
has_us_facility : ['f' 't' 'u']
has_max_age : [ True False]
has_min_age : [ True False]
primary_purpose : ['UNKNOWN' 'TREATMENT' 'HEALTH_SERVICES_RESEARCH' 'SUPPORTIVE_CARE'
 'BASIC_SCIENCE' 'PREVENTION' 'DIAGNOSTIC' 'OTHER' 'SCREENING' 'ECT'
 'DEVICE_FEASIBILITY']
model : ['COHORT' 'CASE_ONLY' 'PARALLEL' 'SINGLE_GROUP' 'OTHER' 'CROSSOVER'
 'SEQUENTIAL' 'UNKNOWN' 'CASE_CONTROL' 'FACTORIAL' 'FAMILY_BASED'
 'ECOLOGIC_OR_COMMUNITY' 'CASE_CROSSOVER' 'DEFINED_POPULATION'
 'NATURAL_HISTORY']
agency_class : ['OTHER' 'OTHER_GOV' 'INDUSTRY' 'FED' 'NIH' 'NETWORK' 'INDIV' 'UN

In [12]:
# Dtypes and non-null counts again, after missing values were encoded.
final_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338720 entries, 0 to 338719
Data columns (total 22 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   nct_id                                   338720 non-null  object 
 1   biospec_retention                        338720 non-null  object 
 2   is_ppsd                                  338720 non-null  bool   
 3   phase                                    338720 non-null  object 
 4   is_unapproved_device                     338720 non-null  bool   
 5   is_fda_regulated_device                  338720 non-null  object 
 6   is_fda_regulated_drug                    338720 non-null  object 
 7   number_of_groups                         46366 non-null   float64
 8   number_of_arms                           248592 non-null  float64
 9   number_of_facilities                     338720 non-null  int64  
 10  has_us_facility                 

In [13]:
# Preview of the assembled table (the notebook display elides middle rows and columns).
final_table

Unnamed: 0,nct_id,biospec_retention,is_ppsd,phase,is_unapproved_device,is_fda_regulated_device,is_fda_regulated_drug,number_of_groups,number_of_arms,number_of_facilities,...,number_of_secondary_outcomes_to_measure,has_max_age,has_min_age,max_age,min_age,primary_purpose,model,num_countries,agency_class,terminated
0,NCT06790056,UNKNOWN,False,NONE,False,f,f,1.0,,1,...,1.0,True,True,65.00000,18.000000,UNKNOWN,COHORT,1.0,OTHER,False
1,NCT01256333,UNKNOWN,False,NONE,False,u,u,,,2,...,4.0,False,True,60.40191,18.000000,UNKNOWN,CASE_ONLY,1.0,OTHER,False
2,NCT03980301,UNKNOWN,False,NONE,False,f,f,,,1,...,,True,True,45.00000,18.000000,UNKNOWN,COHORT,1.0,OTHER,False
3,NCT02689219,UNKNOWN,False,PHASE2,False,f,t,,2.0,4,...,3.0,False,True,60.40191,18.000000,TREATMENT,PARALLEL,1.0,OTHER,True
4,NCT06101732,UNKNOWN,False,NONE,False,f,f,,,1,...,11.0,False,True,60.40191,18.000000,UNKNOWN,COHORT,1.0,OTHER,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338715,NCT04798287,UNKNOWN,False,NONE,False,f,t,2.0,,1,...,8.0,False,True,60.40191,18.000000,UNKNOWN,COHORT,1.0,OTHER,False
338716,NCT01466400,UNKNOWN,False,NONE,False,u,u,1.0,,1,...,,True,True,0.50000,0.166667,UNKNOWN,UNKNOWN,1.0,OTHER,False
338717,NCT02999490,UNKNOWN,False,NONE,False,u,u,,2.0,1,...,,True,True,75.00000,10.000000,DIAGNOSTIC,PARALLEL,1.0,OTHER,False
338718,NCT03079479,UNKNOWN,False,NONE,False,u,u,4.0,,1,...,,True,True,70.00000,56.000000,UNKNOWN,OTHER,1.0,OTHER,False


In [14]:
# (rows, columns) of the assembled table.
final_table.shape

(338720, 22)

In [15]:
# Mean of the boolean target = fraction of trials that were terminated.
final_table['terminated'].mean()

np.float64(0.0956748937175248)

In [16]:
# Share of missing values per column, largest first (top 15 columns).
final_table.isna().mean().sort_values(ascending=False).head(15)

number_of_groups                           0.863114
number_of_secondary_outcomes_to_measure    0.278330
number_of_arms                             0.266084
num_countries                              0.062624
number_of_primary_outcomes_to_measure      0.036963
agency_class                               0.000000
model                                      0.000000
primary_purpose                            0.000000
min_age                                    0.000000
max_age                                    0.000000
has_min_age                                0.000000
has_max_age                                0.000000
nct_id                                     0.000000
biospec_retention                          0.000000
has_us_facility                            0.000000
dtype: float64

In [17]:
# True when no nct_id is repeated, i.e. the left merges did not duplicate any trial row.
final_table['nct_id'].is_unique

True

In [19]:
# Write the assembled table for downstream use. Create the output directory
# first so the write does not fail on a fresh checkout where it is absent.
processed_path = pathlib.Path('../data/processed/trials.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
final_table.to_csv(processed_path, index=False)