# Clinical trials: Dataframe setup


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
base_color = "#3298D0"
plot_size = (14, 10)

In [3]:
# Path for json folder & file
path_to_json_file = os.path.abspath('../data/json/')

# json file
json_file = '/all_trials_json' #name json file  

## Import json file in a dataframe/sframe

In [4]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

In [5]:
# breaks with large json file
df = pd.read_json(file)

## Basic data cleaning

In [6]:
#Check data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292311 entries, 0 to 292310
Data columns (total 8 columns):
nct_id                            292311 non-null object
study_first_submitted             292311 non-null object
source                            292311 non-null object
brief_title                       292311 non-null object
condition                         292311 non-null object
url                               292311 non-null object
detailed_description/textblock    292311 non-null object
brief_summary/textblock           292311 non-null object
dtypes: object(8)
memory usage: 17.8+ MB


In [7]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,url,detailed_description/textblock,brief_summary/textblock
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,"Carcinoma, Non-Small-Cell Lung",,\n To compare the activities (the progres...,\n This study was conducted to compare th...
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Alzheimer Disease,,\n This study will examine the effects of...,\n The Brain Energy for Amyloid Transform...
2,NCT02443948,"March 19, 2015",Fondazione del Piemonte per l'Oncologia,Circulating Cell-free Tumor DNA in the Plasma ...,Gastrointestinal Stromal Tumor (GIST),,"\n Demetri and colleagues presented, at t...",\n This observational study is proposed t...
3,NCT03757312,"November 27, 2018",Nationwide Children's Hospital,A Pilot Study of Optic Nerve Ultrasound Follow...,Congenital Heart Disease,,,\n The purpose of the proposed study is t...
4,NCT02818309,"June 21, 2016",National Taiwan University Hospital,Lesogaberan in Chinese Patients With Refractor...,Gastroesophageal Reflux Disease,,\n To test the efficacy and safety of les...,\n Lesogaberan may be used in Chinese GER...


In [14]:
df.url.unique()

array(['None'], dtype=object)

In [15]:
# DELETE URL COLUMN
df = df.drop(columns = ['url'])

In [16]:
# remove \n values
df = df.replace(r'\n',' ', regex=True) 

In [17]:
df.head(5)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,"Carcinoma, Non-Small-Cell Lung",To compare the activities (the progress...,This study was conducted to compare the...
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Alzheimer Disease,This study will examine the effects of ...,The Brain Energy for Amyloid Transforma...
2,NCT02443948,"March 19, 2015",Fondazione del Piemonte per l'Oncologia,Circulating Cell-free Tumor DNA in the Plasma ...,Gastrointestinal Stromal Tumor (GIST),"Demetri and colleagues presented, at th...",This observational study is proposed to...
3,NCT03757312,"November 27, 2018",Nationwide Children's Hospital,A Pilot Study of Optic Nerve Ultrasound Follow...,Congenital Heart Disease,,The purpose of the proposed study is to...
4,NCT02818309,"June 21, 2016",National Taiwan University Hospital,Lesogaberan in Chinese Patients With Refractor...,Gastroesophageal Reflux Disease,To test the efficacy and safety of leso...,Lesogaberan may be used in Chinese GERD...


In [18]:
# Rename columns
df.columns = ['id', 'submission_date','source', 'brief_title', 'condition', 'full_description', 'summary']

In [19]:
df.head()

Unnamed: 0,id,submission_date,source,brief_title,condition,full_description,summary
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,"Carcinoma, Non-Small-Cell Lung",To compare the activities (the progress...,This study was conducted to compare the...
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Alzheimer Disease,This study will examine the effects of ...,The Brain Energy for Amyloid Transforma...
2,NCT02443948,"March 19, 2015",Fondazione del Piemonte per l'Oncologia,Circulating Cell-free Tumor DNA in the Plasma ...,Gastrointestinal Stromal Tumor (GIST),"Demetri and colleagues presented, at th...",This observational study is proposed to...
3,NCT03757312,"November 27, 2018",Nationwide Children's Hospital,A Pilot Study of Optic Nerve Ultrasound Follow...,Congenital Heart Disease,,The purpose of the proposed study is to...
4,NCT02818309,"June 21, 2016",National Taiwan University Hospital,Lesogaberan in Chinese Patients With Refractor...,Gastroesophageal Reflux Disease,To test the efficacy and safety of leso...,Lesogaberan may be used in Chinese GERD...


## Add new date columns

In [21]:
# Create new column: study_first_submitted as dates
%time df['full_date'] = pd.to_datetime(df['submission_date'])

CPU times: user 32.5 s, sys: 27.7 ms, total: 32.5 s
Wall time: 32.6 s


In [22]:
# Create new column: dates as year
df['year'] = df['full_date'].dt.year

In [23]:
# Sort records by date
df = df.sort_values(by ='full_date')

In [24]:
df.head()

Unnamed: 0,id,submission_date,source,brief_title,condition,full_description,summary,full_date,year
153887,NCT00004639,"September 17, 1999",University of Florida,Cleft Palate Surgery and Speech Development,Cleft Lip,This study is conducted with patients w...,Compare the outcome of two primary surg...,1999-09-17,1999
46938,NCT00004640,"September 17, 1999",University of Washington,"""Clinical Trials to Enhance Elders' Oral Healt...",Tooth Loss,"""TEETH"" is a double-blinded, randomized...",The purpose of this study is to determi...,1999-09-17,1999
182804,NCT00000267,"September 20, 1999",National Institute on Drug Abuse (NIDA),Risperidone Treatment in Dually-Diagnosed Indi...,Cocaine-Related Disorders,,The purpose of this study is to evaluat...,1999-09-20,1999
73402,NCT00000244,"September 20, 1999",University of Minnesota - Clinical and Transla...,Effects of Dynorphin 1-13 on Heroin Addiction - 1,Opioid-Related Disorders,Randomized double blinded study of the ...,The purpose of this study is to evaluat...,1999-09-20,1999
73336,NCT00000250,"September 20, 1999",University of Chicago,Cold Water Immersion Modulates Reinforcing Eff...,Opioid-Related Disorders,,The purpose of this study is to conduct...,1999-09-20,1999


In [25]:
df.memory_usage()

Index               2338488
id                  2338488
submission_date     2338488
source              2338488
brief_title         2338488
condition           2338488
full_description    2338488
summary             2338488
full_date           2338488
year                2338488
dtype: int64

## Export dataframe as csv

In [27]:
# Export dataframe as csv file

path_to_csv = os.path.abspath('../data/csv/')

try:
    os.mkdir(path_to_csv)
    print('{} created'.format(csv))
except IOError as e:
    print(e)
    pass




[Errno 17] File exists: '/Users/cmserna/Sites/clinical trials/mvp/data/csv'


In [28]:
csv_file = '/clean_data.csv'
df.to_csv(path_to_csv + csv_file)