# Clinical trials: Dataframe setup


In [2]:
import time
import json
import os
import pandas as pd

In [3]:
# Path for json folder & file
path_to_json_file = os.path.abspath('../data/json/')

# json file
json_file = '/sample_json' #name json file  

## Import json file in a dataframe/sframe

In [4]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

In [5]:
# breaks with large json file
df = pd.read_json(file)

## Basic data cleaning

In [6]:
#Check data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 999
Data columns (total 24 columns):
nct_id                                  3000 non-null object
study_first_submitted                   3000 non-null object
source                                  3000 non-null object
brief_title                             3000 non-null object
overall_status                          3000 non-null object
verification_date                       2991 non-null object
study_type                              3000 non-null object
study_first_posted                      3000 non-null object
last_update_submitted                   3000 non-null object
last_update_posted                      3000 non-null object
phase                                   2432 non-null object
condition                               3000 non-null object
condition_browse/mesh_term              3000 non-null object
intervention_browse/mesh_term           3000 non-null object
detailed_description/textblock          

In [30]:
# print(df.head())
# print(list(df.columns))
len(df.columns)

24

In [31]:
df = df.replace(r'\n',' ', regex=True)

#rename columns
df.columns = ['ntc_id', 
          'study_first_submitted',
          'source', 
          'brief_title',
          'overall_status'
          'verification_date',
          'study_type',
          'study_first_posted',
          'last_update_submitted',
          'last_update_posted',
          'phase',
          'condition',
          'condition_mesh_term',
          'intervention_mesh_term',
          'detailed_description',
          'brief_summary',
          'city',
          'country',
          'zip',
          'lead_sponsor_agency',
          'lead_sponsor_agency_class',
          'study_design_info_allocation',
          'study_design_info_intervention_model',
          'study_design_info_primary_purpose', 
          'extra_column']

In [32]:
# df.set_index('nct_id', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 999
Data columns (total 24 columns):
ntc_id                                  3000 non-null object
study_first_submitted                   3000 non-null object
source                                  3000 non-null object
brief_title                             3000 non-null object
overall_statusverification_date         3000 non-null object
study_type                              2991 non-null object
study_first_posted                      3000 non-null object
last_update_submitted                   3000 non-null object
last_update_posted                      3000 non-null object
phase                                   3000 non-null object
condition                               2432 non-null object
condition_mesh_term                     3000 non-null object
intervention_mesh_term                  3000 non-null object
detailed_description                    3000 non-null object
brief_summary                           

In [33]:
df.head()

Unnamed: 0,ntc_id,study_first_submitted,source,brief_title,overall_statusverification_date,study_type,study_first_posted,last_update_submitted,last_update_posted,phase,...,brief_summary,city,country,zip,lead_sponsor_agency,lead_sponsor_agency_class,study_design_info_allocation,study_design_info_intervention_model,study_design_info_primary_purpose,extra_column
0,NCT01980602,"September 24, 2013",University of Hull,Effect of Exercise on Patients With Claudicati...,Unknown status,November 2013,Interventional,"November 11, 2013","November 4, 2013","November 11, 2013",...,,Title: How does exercise improve the ca...,Hull,United Kingdom,HU3 2JZ,University of Hull,Other,Randomized,Single Group Assignment,Treatment
1,NCT02181140,"July 8, 2013",Universitätsklinikum Hamburg-Eppendorf,Core Biopsy Endo Sonography Study Evaluation o...,Completed,August 2015,Interventional,"July 3, 2014","August 15, 2015","September 16, 2015",...,Endoscopic ultrasound is an established...,The study is designed to evaluate the d...,Hamburg,Germany,20246,Universitätsklinikum Hamburg-Eppendorf,Other,,Single Group Assignment,Diagnostic
10,NCT00548080,"October 19, 2007",Merck Sharp & Dohme Corp.,Evaluate the Safety and Efficacy of Caspofungi...,Completed,December 2015,Interventional,"October 23, 2007","December 16, 2015","December 17, 2015",...,,Registration study,,,,Merck Sharp & Dohme Corp.,Industry,Non-Randomized,Single Group Assignment,Treatment
100,NCT01920516,"August 6, 2013",International Group of Endovascular Oncology,Isolated Limb Perfusion of Melphalan for Melan...,Recruiting,February 2019,Observational,"August 12, 2013","February 26, 2019","February 27, 2019",...,This is an observational study and the ...,In-transit metastases occur in approxim...,Pesaro,Italy,61122,International Group of Endovascular Oncology,Other,,,
1000,NCT01147393,"June 16, 2010",Weill Medical College of Cornell University,Combination Veltuzumab and Fractionated 90Y- E...,Terminated,September 2008,Interventional,"June 22, 2010","January 26, 2018","February 21, 2018",...,The treatment regimen consists of 2 ele...,A Phase I/II clinical trial using a fra...,New York,United States,10065,Weill Medical College of Cornell University,Other,,Single Group Assignment,Treatment


## This part goes in the next notebook/script
- Preparing data for ML analysis

## Add new date columns

In [None]:
# Create new column: study_first_submitted as dates
%time df['full_date'] = pd.to_datetime(df['submission_date'])

In [None]:
# Create new column: dates as year
df['year'] = df['full_date'].dt.year

In [None]:
# Delete submission date column
df.drop('submission_date', axis=1, inplace=True)

In [None]:
# Sort records by date
df = df.sort_values(by ='full_date')

In [None]:
df.head()

In [None]:
df.info()

## Subset of data

In [None]:
def filter_data(year, df):
    # Select data since 2008
    df = [df['year'] > year]

In [None]:
# Select data since 2008
# df = df[df['year'] > 2007]
# df.info()

filter_data(2008, df)

In [None]:
df.mesh_term_condition.nunique()

In [None]:
df.condition.nunique()

In [None]:
df.mesh_term_intervention.nunique()

## Export dataframe as csv

In [None]:
def export_data_csv(df, path_to_csv = os.path.abspath('../data/csv/'), csv_file = '/clean_data.csv'):
    try:
        os.mkdir(path_to_csv)
        print('{} created'.format(csv))
    except IOError as e:
        print(e)
        pass
    
    df.to_csv(path_to_csv + csv_file)

In [None]:
# Export dataframe as csv file

# path_to_csv = os.path.abspath('../data/csv/')

# try:
#     os.mkdir(path_to_csv)
#     print('{} created'.format(csv))
# except IOError as e:
#     print(e)
#     pass

export_data_csv(df)

In [None]:
csv_file = '/clean_data.csv'
df.to_csv(path_to_csv + csv_file)