In [1]:
import pandas as pd
import numpy as np
import pathlib

# Directory holding the raw pipe-delimited exports, relative to the notebook.
raw = pathlib.Path('..') / 'data' / 'raw'

In [2]:
# Columns of studies.txt that are used downstream; everything else is skipped
# at parse time.
studies_cols = [
    'nct_id', 'biospec_retention', 'is_ppsd',
    'is_unapproved_device', 'is_fda_regulated_device', 'is_fda_regulated_drug',
    'number_of_groups', 'number_of_arms', 'overall_status'
]

# `usecols` makes the parser read only the listed columns instead of loading the
# whole file and discarding most of it. It returns columns in file order, so the
# frame is re-indexed with `studies_cols` to keep the order of the list.
studies = pd.read_csv(raw/'studies.txt', sep='|', usecols=studies_cols)[studies_cols]

# Keep only rows whose status is COMPLETED or TERMINATED.
studies = studies[studies['overall_status'].isin(['COMPLETED', 'TERMINATED'])]

  studies = pd.read_csv(raw/'studies.txt', sep='|')


In [3]:
# Columns of calculated_values.txt that are used in this cell or downstream.
calc_vals_cols = [
    'nct_id', 'number_of_facilities', 'has_us_facility',
    'minimum_age_num', 'minimum_age_unit', 'maximum_age_num', 'maximum_age_unit',
    'number_of_primary_outcomes_to_measure', 'number_of_secondary_outcomes_to_measure'
]
calc_vals = pd.read_csv(raw/'calculated_values.txt', sep='|', usecols=calc_vals_cols)

# Keep rows where `has_us_facility` is present, with columns in list order.
# The explicit .copy() makes the result an independent frame, so the column
# assignment below is not a write into a slice of the parsed file
# (the pattern that triggers SettingWithCopyWarning).
calc_vals = calc_vals.loc[calc_vals['has_us_facility'].notna(), calc_vals_cols].copy()
# Turn the flag into a boolean: True exactly where the raw value is the string 't'.
calc_vals['has_us_facility'] = calc_vals['has_us_facility'] == 't'


# Multiplier that converts one of each time unit into years.
scalers = {
    'year':1.0, 'month':1.0/12.0, 'week':7.0/365.25,
    'day':1.0/365.25, 'hour':1.0/365.25/24.0, 'minute':1.0/365.25/24.0/60.0
}


def _age_in_years(num, unit):
    """Convert an age limit stored as (number, unit) columns into years.

    Parameters
    ----------
    num : pandas.Series
        Numeric part of the age limit; may contain NaN.
    unit : pandas.Series
        Unit name for each row (a key of `scalers`); may contain NaN.

    Returns
    -------
    (pandas.Series, float)
        The age in years for every row, and the mean used for imputation
        (the mean of `num` over rows whose unit is literally 'year').

    Raises
    ------
    KeyError
        If a unit is present that `scalers` has no factor for.

    Notes
    -----
    A missing unit is treated as 'year'. A missing number is imputed *after*
    the conversion to years, so the year-based mean is never rescaled by a
    month/week/day factor of the row it is filled into.
    """
    # Mean over rows explicitly given in years, taken before any imputation.
    avg_years = num[unit == 'year'].mean()

    unit = unit.fillna('year')
    unknown = set(unit.unique()) - set(scalers)
    if unknown:
        # Fail loudly rather than letting .map() turn unknown units into NaN,
        # which would then be silently imputed with the mean.
        raise KeyError(f'unrecognised age unit(s): {sorted(map(str, unknown))}')

    years = num * unit.map(scalers)
    return years.fillna(avg_years), avg_years


max_age, avg_max_years = _age_in_years(calc_vals['maximum_age_num'], calc_vals['maximum_age_unit'])
min_age, avg_min_years = _age_in_years(calc_vals['minimum_age_num'], calc_vals['minimum_age_unit'])

# Append the converted ages and drop the raw number/unit pairs they replace.
calc_vals = (
    calc_vals
    .assign(max_age=max_age, min_age=min_age)
    .drop(columns=['maximum_age_num', 'maximum_age_unit', 'minimum_age_num', 'minimum_age_unit'])
)

In [4]:
# Columns of designs.txt that are used in this cell or downstream.
design_cols = [
    'nct_id', 'intervention_model', 'observational_model', 'primary_purpose'
]
designs = pd.read_csv(raw/'designs.txt', sep='|', usecols=design_cols)

# Drop rows where both model columns are missing. The .copy() makes the result
# an independent frame so that adding the `model` column below is not a write
# into a slice of the parsed file.
has_model = designs['intervention_model'].notna() | designs['observational_model'].notna()
designs = designs.loc[has_model, design_cols].copy()

# Single `model` column: the intervention model where present, otherwise the
# observational model. Vectorised replacement for a per-row `.iloc` loop, which
# built a fresh row Series several times for every row of the table.
designs['model'] = designs['intervention_model'].fillna(designs['observational_model'])
designs = designs.drop(columns=['observational_model', 'intervention_model'])

In [5]:
# One row per nct_id holding the number of distinct values in the `name`
# column of countries.txt for that id.
countries = (
    pd.read_csv(raw/'countries.txt', sep='|')
    .groupby('nct_id')['name']
    .nunique()
    .reset_index(name='num_countries')
)

In [8]:
# Inner-join the per-trial tables onto `studies` one at a time on nct_id.
# The row count is printed before the first join and after each join, so the
# number of rows lost at every step is visible in the output.
final_table = studies
print(len(final_table))
for other in (calc_vals, designs, countries):
    final_table = final_table.merge(other, how='inner', on='nct_id')
    print(len(final_table))

# Derive the boolean `terminated` column from the status, then drop the status.
final_table = (
    final_table
    .assign(terminated=final_table['overall_status'].eq('TERMINATED'))
    .drop(columns='overall_status')
)

338720
313674
305093
305093


In [7]:
# Show the merged per-trial table as the cell's output.
final_table

Unnamed: 0,nct_id,biospec_retention,is_ppsd,is_unapproved_device,is_fda_regulated_device,is_fda_regulated_drug,number_of_groups,number_of_arms,number_of_facilities,has_us_facility,number_of_primary_outcomes_to_measure,number_of_secondary_outcomes_to_measure,max_age,min_age,primary_purpose,model,num_countries,terminated
0,NCT06790056,,,,f,f,1.0,,1,False,1.0,1.0,65.000000,18.0,,COHORT,1,False
1,NCT01256333,,,,,,,,2,False,1.0,4.0,60.444343,18.0,,CASE_ONLY,1,False
2,NCT03980301,,,,f,f,,,1,False,1.0,,45.000000,18.0,,COHORT,1,False
3,NCT02689219,,,,f,t,,2.0,4,True,1.0,3.0,60.444343,18.0,TREATMENT,PARALLEL,1,True
4,NCT06101732,,,,f,f,,,1,False,1.0,11.0,60.444343,18.0,,COHORT,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305088,NCT03618056,,,,f,t,,1.0,1,True,7.0,10.0,50.000000,18.0,PREVENTION,SINGLE_GROUP,1,False
305089,NCT04798287,,,,f,t,2.0,,1,True,1.0,8.0,60.444343,18.0,,COHORT,1,False
305090,NCT02999490,,,,,,,2.0,1,False,1.0,,75.000000,10.0,DIAGNOSTIC,PARALLEL,1,False
305091,NCT03079479,,,,,,4.0,,1,False,1.0,,70.000000,56.0,,OTHER,1,False
