# Clinical trials: Dataframe setup


In [1]:
import time
import json
import os
import pandas as pd

In [2]:
# Path for json folder & file
path_to_json_file = os.path.abspath('../data/json/')

# json file
json_file = '/all_trials_json' #name json file  

## Import json file in a dataframe/sframe

In [5]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

In [4]:
# breaks with large json file
df = pd.read_json(file)

ValueError: arrays must all be same length

## Basic data cleaning

In [5]:
#Check data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292311 entries, 0 to 292310
Data columns (total 12 columns):
nct_id                               292311 non-null object
study_first_submitted                292311 non-null object
source                               292311 non-null object
brief_title                          292311 non-null object
condition                            292311 non-null object
condition_browse/mesh_term           292311 non-null object
intervention_browse/mesh_term        292311 non-null object
detailed_description/textblock       292311 non-null object
brief_summary/textblock              292311 non-null object
location/facility/address/city       292311 non-null object
location/facility/address/country    292311 non-null object
location/facility/address/zip        292311 non-null object
dtypes: object(12)
memory usage: 26.8+ MB


In [6]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,condition_browse/mesh_term,intervention_browse/mesh_term,detailed_description/textblock,brief_summary/textblock,location/facility/address/city,location/facility/address/country,location/facility/address/zip
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,"Carcinoma, Non-Small-Cell Lung",Carcinoma,Paclitaxel,\n To compare the activities (the progres...,\n This study was conducted to compare th...,Albuquerque,United States,87102
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Alzheimer Disease,Alzheimer Disease,,\n This study will examine the effects of...,\n The Brain Energy for Amyloid Transform...,Winston-Salem,United States,27157
2,NCT02443948,"March 19, 2015",Fondazione del Piemonte per l'Oncologia,Circulating Cell-free Tumor DNA in the Plasma ...,Gastrointestinal Stromal Tumor (GIST),Gastrointestinal Stromal Tumors,,"\n Demetri and colleagues presented, at t...",\n This observational study is proposed t...,Candiolo,Italy,10060
3,NCT03757312,"November 27, 2018",Nationwide Children's Hospital,A Pilot Study of Optic Nerve Ultrasound Follow...,Congenital Heart Disease,Heart Diseases,,,\n The purpose of the proposed study is t...,Columbus,United States,43205
4,NCT02818309,"June 21, 2016",National Taiwan University Hospital,Lesogaberan in Chinese Patients With Refractor...,Gastroesophageal Reflux Disease,Gastroesophageal Reflux,Lesogaberan,\n To test the efficacy and safety of les...,\n Lesogaberan may be used in Chinese GER...,Taipei,Taiwan,100


In [7]:
# remove \n values
df = df.replace(r'\n',' ', regex=True) 

In [None]:
# df.set_index('nct_id', inplace=True)

In [10]:
df.columns

Index(['nct_id', 'study_first_submitted', 'source', 'brief_title', 'condition',
       'condition_browse/mesh_term', 'intervention_browse/mesh_term',
       'detailed_description/textblock', 'brief_summary/textblock',
       'location/facility/address/city', 'location/facility/address/country',
       'location/facility/address/zip'],
      dtype='object')

In [11]:
# Rename columns
df.columns = ['id', 
              'submission_date',
              'source', 
              'brief_title', 
              'condition', 
              'mesh_term_condition', 
              'mesh_term_intervention', 
              'full_description', 
              'summary', 
              'city', 
              'country', 
              'zip']

In [12]:
df.head()

Unnamed: 0,id,submission_date,source,brief_title,condition,mesh_term_condition,mesh_term_intervention,full_description,summary,city,country,zip
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,"Carcinoma, Non-Small-Cell Lung",Carcinoma,Paclitaxel,To compare the activities (the progress...,This study was conducted to compare the...,Albuquerque,United States,87102
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Alzheimer Disease,Alzheimer Disease,,This study will examine the effects of ...,The Brain Energy for Amyloid Transforma...,Winston-Salem,United States,27157
2,NCT02443948,"March 19, 2015",Fondazione del Piemonte per l'Oncologia,Circulating Cell-free Tumor DNA in the Plasma ...,Gastrointestinal Stromal Tumor (GIST),Gastrointestinal Stromal Tumors,,"Demetri and colleagues presented, at th...",This observational study is proposed to...,Candiolo,Italy,10060
3,NCT03757312,"November 27, 2018",Nationwide Children's Hospital,A Pilot Study of Optic Nerve Ultrasound Follow...,Congenital Heart Disease,Heart Diseases,,,The purpose of the proposed study is to...,Columbus,United States,43205
4,NCT02818309,"June 21, 2016",National Taiwan University Hospital,Lesogaberan in Chinese Patients With Refractor...,Gastroesophageal Reflux Disease,Gastroesophageal Reflux,Lesogaberan,To test the efficacy and safety of leso...,Lesogaberan may be used in Chinese GERD...,Taipei,Taiwan,100


## Add new date columns

In [13]:
# Create new column: study_first_submitted as dates
%time df['full_date'] = pd.to_datetime(df['submission_date'])

CPU times: user 31.5 s, sys: 22.2 ms, total: 31.6 s
Wall time: 31.6 s


In [14]:
# Create new column: dates as year
df['year'] = df['full_date'].dt.year

In [15]:
# Delete submission date column
df.drop('submission_date', axis=1, inplace=True)

In [16]:
# Sort records by date
df = df.sort_values(by ='full_date')

In [17]:
df.head()

Unnamed: 0,id,source,brief_title,condition,mesh_term_condition,mesh_term_intervention,full_description,summary,city,country,zip,full_date,year
153887,NCT00004639,University of Florida,Cleft Palate Surgery and Speech Development,Cleft Lip,Cleft Lip,,This study is conducted with patients w...,Compare the outcome of two primary surg...,Gainesville,United States,32166,1999-09-17,1999
46938,NCT00004640,University of Washington,"""Clinical Trials to Enhance Elders' Oral Healt...",Tooth Loss,Periodontal Diseases,Chlorhexidine,"""TEETH"" is a double-blinded, randomized...",The purpose of this study is to determi...,Seattle,United States,98195-7134,1999-09-17,1999
182804,NCT00000267,National Institute on Drug Abuse (NIDA),Risperidone Treatment in Dually-Diagnosed Indi...,Cocaine-Related Disorders,Cocaine-Related Disorders,Risperidone,,The purpose of this study is to evaluat...,New York,United States,10032,1999-09-20,1999
73402,NCT00000244,University of Minnesota - Clinical and Transla...,Effects of Dynorphin 1-13 on Heroin Addiction - 1,Opioid-Related Disorders,"Behavior, Addictive",Heroin,Randomized double blinded study of the ...,The purpose of this study is to evaluat...,Minneapolis,United States,55415,1999-09-20,1999
73336,NCT00000250,University of Chicago,Cold Water Immersion Modulates Reinforcing Eff...,Opioid-Related Disorders,Disease,Nitrous Oxide,,The purpose of this study is to conduct...,Chicago,United States,60637,1999-09-20,1999


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 292311 entries, 153887 to 45170
Data columns (total 13 columns):
id                        292311 non-null object
source                    292311 non-null object
brief_title               292311 non-null object
condition                 292311 non-null object
mesh_term_condition       292311 non-null object
mesh_term_intervention    292311 non-null object
full_description          292311 non-null object
summary                   292311 non-null object
city                      292311 non-null object
country                   292311 non-null object
zip                       292311 non-null object
full_date                 292311 non-null datetime64[ns]
year                      292311 non-null int64
dtypes: datetime64[ns](1), int64(1), object(11)
memory usage: 31.2+ MB


## Subset of data

In [21]:
def filter_data(year, df):
    # Select data since 2008
    df = [df['year'] > year]

In [22]:
# Select data since 2008
# df = df[df['year'] > 2007]
# df.info()

filter_data(2008, df)

In [23]:
df.mesh_term_condition.nunique()

2347

In [24]:
df.condition.nunique()

48389

In [None]:
df.mesh_term_intervention.nunique()

## Export dataframe as csv

In [28]:
def export_data_csv(df, path_to_csv = os.path.abspath('../data/csv/'), csv_file = '/clean_data.csv'):
    try:
        os.mkdir(path_to_csv)
        print('{} created'.format(csv))
    except IOError as e:
        print(e)
        pass
    
    df.to_csv(path_to_csv + csv_file)

In [29]:
# Export dataframe as csv file

# path_to_csv = os.path.abspath('../data/csv/')

# try:
#     os.mkdir(path_to_csv)
#     print('{} created'.format(csv))
# except IOError as e:
#     print(e)
#     pass

export_data_csv(df)

[Errno 17] File exists: '/Users/cmserna/Sites/clinical trials/mvp/data/csv'


In [None]:
csv_file = '/clean_data.csv'
df.to_csv(path_to_csv + csv_file)