After running the scrapers in Notebook 3 and manually retrieving additional registry data for a small number of trials, we combined this data in an Excel sheet, `registry_data.xlsx`, which details the data collected and where it came from. This notebook cleans that data so that it can be merged into the final dataset.

In [1]:
import sys
from pathlib import Path
import os
# Take the directory one level above the current working directory and put
# it on the import path; `parent` is also the base for the data paths below.
cwd = os.getcwd()
parent = os.fspath(Path(cwd).parents[0])
sys.path.append(parent)

In [2]:
import pandas as pd
import re
import numpy as np

In [4]:
# Registry data combined from the scrapers and manual look-ups ('Full' sheet).
registry_xlsx = parent + '/data/registry_data/registry_data.xlsx'
reg = pd.read_excel(registry_xlsx, sheet_name='Full')

# Cleaned ICTRP export, read with pandas' default CSV settings.
ictrp_csv = parent + '/data/cleaned_ictrp_16Dec2020.csv'
ictrp = pd.read_csv(ictrp_csv)

In [5]:
# Show the column names of the registry sheet as loaded.
reg.columns

Index(['trial_id', 'trial_status', 'pcd', 'scd', 'reg_results_status',
       'other_results_1', 'other_results_2'],
      dtype='object')

In [6]:
# Preview the first rows of the ICTRP table.
ictrp.head()

Unnamed: 0,index,trialid,source_register,date_registration,date_enrollement,retrospective_registration,normed_spon_names,recruitment_status,phase,study_type,countries,public_title,study_category,intervention,intervention_list,target_enrollment,web_address,cross_registrations
0,0,ISRCTN18948812,ISRCTN,2014-04-09,2014-01-05,True,Covidien,Not Recruiting,Not Applicable,Interventional,United Kingdom,A post-market clinical study to evaluate the K...,Not relevant,Not relevant,Not relevant,100,http://isrctn.com/ISRCTN18948812,
1,1,NCT00223119,ClinicalTrials.gov,2005-09-13,2004-01-01,True,Covidien,Not Recruiting,Phase 4,Interventional,United States,Comparison of Absorbable Sutures in Perineal L...,Not relevant,Not relevant,Not relevant,Not Available,http://clinicaltrials.gov/show/NCT00223119,
2,2,NCT00223132,ClinicalTrials.gov,2005-09-13,2005-02-01,True,Covidien,Not Recruiting,Phase 4,Interventional,United States,Suture Granuloma in Body Contouring Surgery,Not relevant,Not relevant,Not relevant,Not Available,http://clinicaltrials.gov/show/NCT00223132,
3,3,NCT00289523,ClinicalTrials.gov,2006-02-08,2006-01-01,True,Covidien,Not Recruiting,Not Applicable,Observational,United States,EEG Biomarkers for Predicting Response to Anti...,Not relevant,Not relevant,Not relevant,375,http://clinicaltrials.gov/show/NCT00289523,
4,4,NCT00308984,ClinicalTrials.gov,2006-03-28,2006-01-01,True,Covidien,Not Recruiting,Not Applicable,Observational,United States,CARE--Childhood Awareness and Recall Evaluation,Not relevant,Not relevant,Not relevant,1784,http://clinicaltrials.gov/show/NCT00308984,


In [7]:
# Attach each trial's registry URL from the ICTRP table. The join key is
# named differently on each side, so the redundant right-hand key is dropped.
# NOTE(review): a left join repeats registry rows if `trialid` is not unique
# in `ictrp` — confirm uniqueness upstream.
web_addresses = ictrp[['trialid', 'web_address']]
merged = (
    reg
    .merge(web_addresses, how='left', left_on='trial_id', right_on='trialid')
    .drop(columns='trialid')
)

In [8]:
# Parse both completion-date columns; values that cannot be parsed become NaT.
for date_col in ('pcd', 'scd'):
    merged[date_col] = pd.to_datetime(merged[date_col], errors='coerce')

In [9]:
# Pick one completion date per trial: use `pcd` when it is present and fall
# back to `scd` otherwise; rows where both are missing stay NaT.
# (presumably pcd/scd are the primary/study completion dates — confirm
# against the registry sheet.)
# fillna keeps the datetime64 dtype of the already-parsed columns, so the
# result needs no second parsing pass.
merged['relevent_comp_date'] = merged['pcd'].fillna(merged['scd'])

In [10]:
# Flag (1/0) rows whose registry results status is one of these two labels.
results_labels = ['Study Results', 'View results']
has_tabular_results = merged['reg_results_status'].isin(results_labels)
merged['tabular_results'] = np.where(has_tabular_results, 1, 0)

In [11]:
# Boolean masks: True where the corresponding "other results" column is filled in.
filt_1 = merged['other_results_1'].notna()
filt_2 = merged['other_results_2'].notna()

In [12]:
# Flag (1/0) rows that have an entry in either "other results" column.
has_other_results = filt_1 | filt_2
merged['potential_other_results'] = np.where(has_other_results, 1, 0)

In [16]:
# Write the cleaned registry table next to the source spreadsheet.
# NOTE(review): the default to_csv settings also write the row index as an
# unnamed first column — confirm downstream readers expect that.
clean_csv = parent + '/data/registry_data/registry_data_clean.csv'
merged.to_csv(clean_csv)