# Clinical trials: Dataframe setup


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
base_color = "#3298D0"
plot_size = (14, 10)

In [3]:
# Path for json folder & file
path_to_json_file = os.path.abspath('../data/json/')

# json file
json_file = '/all_trials_json' #name json file  

## Import json file in a dataframe/sframe

In [4]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

In [5]:
# breaks with large json file
df = pd.read_json(file)

## Basic data cleaning

In [6]:
#Check data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286328 entries, 0 to 286327
Data columns (total 8 columns):
nct_id                            286328 non-null object
study_first_submitted             286328 non-null object
source                            286328 non-null object
brief_title                       286328 non-null object
condition                         286328 non-null object
url                               286328 non-null object
detailed_description/textblock    286328 non-null object
brief_summary/textblock           286328 non-null object
dtypes: object(8)
memory usage: 17.5+ MB


In [7]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,url,detailed_description/textblock,brief_summary/textblock
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,"Carcinoma, Non-Small-Cell Lung",,\n To compare the activities (the progres...,\n This study was conducted to compare th...
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Alzheimer Disease,,\n This study will examine the effects of...,\n The Brain Energy for Amyloid Transform...
2,NCT02443948,"March 19, 2015",Fondazione del Piemonte per l'Oncologia,Circulating Cell-free Tumor DNA in the Plasma ...,Gastrointestinal Stromal Tumor (GIST),,"\n Demetri and colleagues presented, at t...",\n This observational study is proposed t...
3,NCT02818309,"June 21, 2016",National Taiwan University Hospital,Lesogaberan in Chinese Patients With Refractor...,Gastroesophageal Reflux Disease,,\n To test the efficacy and safety of les...,\n Lesogaberan may be used in Chinese GER...
4,NCT00092989,"September 28, 2004",Merck Sharp & Dohme Corp.,Investigation of Intravenous (IV) Administrati...,Asthma,,,\n The purpose of this study is to evalua...


In [8]:
# DELETE URL COLUMN

print(df.url.describe())
df = df.drop(columns = ['url'])

count     286328
unique         1
top         None
freq      286328
Name: url, dtype: object


In [9]:
# remove \n values
df = df.replace(r'\n',' ', regex=True) 

In [10]:
df.head(5)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,"Carcinoma, Non-Small-Cell Lung",To compare the activities (the progress...,This study was conducted to compare the...
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Alzheimer Disease,This study will examine the effects of ...,The Brain Energy for Amyloid Transforma...
2,NCT02443948,"March 19, 2015",Fondazione del Piemonte per l'Oncologia,Circulating Cell-free Tumor DNA in the Plasma ...,Gastrointestinal Stromal Tumor (GIST),"Demetri and colleagues presented, at th...",This observational study is proposed to...
3,NCT02818309,"June 21, 2016",National Taiwan University Hospital,Lesogaberan in Chinese Patients With Refractor...,Gastroesophageal Reflux Disease,To test the efficacy and safety of leso...,Lesogaberan may be used in Chinese GERD...
4,NCT00092989,"September 28, 2004",Merck Sharp & Dohme Corp.,Investigation of Intravenous (IV) Administrati...,Asthma,,The purpose of this study is to evaluat...


In [11]:
# Rename columns
df.columns = ['id', 'original_date','source', 'brief_title', 'condition', 'full_description', 'summary']

In [12]:
df.head()

Unnamed: 0,id,original_date,source,brief_title,condition,full_description,summary
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,"Carcinoma, Non-Small-Cell Lung",To compare the activities (the progress...,This study was conducted to compare the...
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Alzheimer Disease,This study will examine the effects of ...,The Brain Energy for Amyloid Transforma...
2,NCT02443948,"March 19, 2015",Fondazione del Piemonte per l'Oncologia,Circulating Cell-free Tumor DNA in the Plasma ...,Gastrointestinal Stromal Tumor (GIST),"Demetri and colleagues presented, at th...",This observational study is proposed to...
3,NCT02818309,"June 21, 2016",National Taiwan University Hospital,Lesogaberan in Chinese Patients With Refractor...,Gastroesophageal Reflux Disease,To test the efficacy and safety of leso...,Lesogaberan may be used in Chinese GERD...
4,NCT00092989,"September 28, 2004",Merck Sharp & Dohme Corp.,Investigation of Intravenous (IV) Administrati...,Asthma,,The purpose of this study is to evaluat...


## Add new date columns

In [13]:
# Create new column: study_first_submitted as dates
%time df['full_date'] = pd.to_datetime(df['original_date'])

CPU times: user 35.3 s, sys: 305 ms, total: 35.6 s
Wall time: 37.7 s


In [14]:
# Create new column: dates as year
df['year'] = df['full_date'].dt.year

In [15]:
# Sort records by date
df = df.sort_values(by ='full_date')

In [31]:
df.head()

Unnamed: 0,id,original_date,source,brief_title,condition,full_description,summary,full_date,year
46038,NCT00004640,"September 17, 1999",University of Washington,"""Clinical Trials to Enhance Elders' Oral Healt...",Tooth Loss,"""TEETH"" is a double-blinded, randomized...",The purpose of this study is to determi...,1999-09-17,1999
150757,NCT00004639,"September 17, 1999",University of Florida,Cleft Palate Surgery and Speech Development,Cleft Lip,This study is conducted with patients w...,Compare the outcome of two primary surg...,1999-09-17,1999
43264,NCT00000341,"September 20, 1999",National Institute on Drug Abuse (NIDA),Evaluation of Liquid vs. Tablet Buprenorphine - 6,Opioid-Related Disorders,,The purpose of this study is to evaluat...,1999-09-20,1999
195656,NCT00000289,"September 20, 1999",National Institute on Drug Abuse (NIDA),Role of Metabolites in Nicotine Dependence (3)...,Tobacco Use Disorder,Previous studies have shown that cotini...,The purpose of this study is to determi...,1999-09-20,1999
528,NCT00000227,"September 20, 1999",University of Vermont,Alternate-Day Buprenorphine Administration. Ph...,Opioid-Related Disorders,,The purpose of this study is to evaluat...,1999-09-20,1999


In [34]:
df.memory_usage()

Index               2290624
id                  2290624
original_date       2290624
source              2290624
brief_title         2290624
condition           2290624
full_description    2290624
summary             2290624
full_date           2290624
year                2290624
dtype: int64

## Export dataframe as csv

In [23]:
# Export dataframe as csv file

path_to_csv = os.path.abspath('../data/csv/')

try:
    os.mkdir(csv)
    print('{} created'.format(csv))
except IOError as e:
    print(e)
    pass




[Errno 17] File exists: '/Users/cmserna/Sites/clinical trials/mvp/data/csv'


In [29]:
csv_file = '/clean_data.csv'
df.to_csv(path_to_csv + csv_file)