# Clinical trials: Dataframe setup


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Shared plotting defaults for the notebook.
# NOTE(review): plot_size is presumably a matplotlib figsize, i.e. (width, height) — confirm at first use.
plot_size = (14, 10)
base_color = "#3298D0"

In [3]:
# Base name of the JSON file, without extension. The leading slash is
# deliberate: the name is appended directly to the folder path below.
json_file = '/all_trials_json'

# Absolute path of the folder that holds the JSON data
path_to_json_file = os.path.abspath('../data/json/')

## Import JSON file into a DataFrame/SFrame

In [4]:
# Full path of the JSON file: folder + file name + '.json' extension
file = path_to_json_file + json_file + '.json'

In [5]:
# Load the whole JSON file into a DataFrame in one go.
# Known limitation: this breaks with a large JSON file.
df = pd.read_json(path_or_buf=file)

## Basic data cleaning

In [6]:
# Check data: column names, non-null counts and dtypes of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
nct_id                            50 non-null object
study_first_submitted             50 non-null object
source                            50 non-null object
brief_title                       50 non-null object
condition                         50 non-null object
detailed_description/textblock    50 non-null object
brief_summary/textblock           50 non-null object
dtypes: object(7)
memory usage: 2.8+ KB


In [7]:
# Preview the first five raw records
df.head(n=5)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT02750033,"April 20, 2016",Seton Healthcare Family,Intraoperative Margin Assessment During Mohs S...,Squamous Cell Carcinoma (SCC),\n Acquire intraoperative MMS measurement...,\n The research team will develop an intr...
1,NCT01587092,"April 25, 2012",Pennington Biomedical Research Center,Workstation Pilot Study,Obesity,\n Eligible participants will be randomiz...,\n This is a pilot study to assess the fe...
2,NCT01813695,"March 11, 2013","Children's Hospital Medical Center, Cincinnati",Preemptive Genotyping and Pain Management,Pain,\n Purpose: To determine the feasibility ...,\n The purpose of this study is to see if...
3,NCT03647059,"August 9, 2018","Shanghai General Hospital, Shanghai Jiao Tong ...",Rapid Assessment of Donor Liver Quality,"Transplant; Failure, Liver",,\n Aims：\n\n 1. Conduct multi-cent...
4,NCT00038025,"May 24, 2002",M.D. Anderson Cancer Center,A Study Of Deoxycoformycin(DCF)/Pentostatin In...,Peripheral T-cell Lymphoma,\n Deoxycoformycin(DCF)/Pentostatin is a ...,\n The purpose of this study is to determ...


In [8]:
# Swap every newline inside the text cells for a single space
# (regex replacement, so it also matches newlines in the middle of a string).
df = df.replace(to_replace=r'\n', value=' ', regex=True)

In [9]:
# Preview the first five records after the newline clean-up
df.head(n=5)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT02750033,"April 20, 2016",Seton Healthcare Family,Intraoperative Margin Assessment During Mohs S...,Squamous Cell Carcinoma (SCC),Acquire intraoperative MMS measurements...,The research team will develop an intra...
1,NCT01587092,"April 25, 2012",Pennington Biomedical Research Center,Workstation Pilot Study,Obesity,Eligible participants will be randomize...,This is a pilot study to assess the fea...
2,NCT01813695,"March 11, 2013","Children's Hospital Medical Center, Cincinnati",Preemptive Genotyping and Pain Management,Pain,Purpose: To determine the feasibility o...,The purpose of this study is to see if ...
3,NCT03647059,"August 9, 2018","Shanghai General Hospital, Shanghai Jiao Tong ...",Rapid Assessment of Donor Liver Quality,"Transplant; Failure, Liver",,"Aims： 1. Conduct multi-center,..."
4,NCT00038025,"May 24, 2002",M.D. Anderson Cancer Center,A Study Of Deoxycoformycin(DCF)/Pentostatin In...,Peripheral T-cell Lymphoma,Deoxycoformycin(DCF)/Pentostatin is a T...,The purpose of this study is to determi...


In [11]:
# Rename columns by name rather than by position, so that a different
# column order in the JSON cannot silently attach the wrong label to a
# column. Columns not listed in the mapping are left untouched, which also
# makes this cell safe to re-run.
df = df.rename(columns={
    'nct_id': 'id',
    'study_first_submitted': 'submission_date',
    'source': 'source',
    'brief_title': 'brief_title',
    'condition': 'condition',
    'detailed_description/textblock': 'full_description',
    'brief_summary/textblock': 'summary',
})

In [12]:
# Preview the first five records with the new column names
df.head(n=5)

Unnamed: 0,id,submission_date,source,brief_title,condition,full_description,summary
0,NCT02750033,"April 20, 2016",Seton Healthcare Family,Intraoperative Margin Assessment During Mohs S...,Squamous Cell Carcinoma (SCC),Acquire intraoperative MMS measurements...,The research team will develop an intra...
1,NCT01587092,"April 25, 2012",Pennington Biomedical Research Center,Workstation Pilot Study,Obesity,Eligible participants will be randomize...,This is a pilot study to assess the fea...
2,NCT01813695,"March 11, 2013","Children's Hospital Medical Center, Cincinnati",Preemptive Genotyping and Pain Management,Pain,Purpose: To determine the feasibility o...,The purpose of this study is to see if ...
3,NCT03647059,"August 9, 2018","Shanghai General Hospital, Shanghai Jiao Tong ...",Rapid Assessment of Donor Liver Quality,"Transplant; Failure, Liver",,"Aims： 1. Conduct multi-center,..."
4,NCT00038025,"May 24, 2002",M.D. Anderson Cancer Center,A Study Of Deoxycoformycin(DCF)/Pentostatin In...,Peripheral T-cell Lymphoma,Deoxycoformycin(DCF)/Pentostatin is a T...,The purpose of this study is to determi...


## Add new date columns

In [13]:
# Create new column: submission_date (originally study_first_submitted) parsed as datetimes.
# %time reports how long the parse takes.
%time df['full_date'] = pd.to_datetime(df['submission_date'])

CPU times: user 9.58 ms, sys: 300 µs, total: 9.88 ms
Wall time: 9.77 ms


In [14]:
# Derive the submission year from the parsed dates and store it as its own column
submission_years = df['full_date'].dt.year
df['year'] = submission_years

In [15]:
# Order the records chronologically (oldest submission first); the original index labels are kept
df = df.sort_values('full_date')

In [16]:
# Preview the five earliest submissions
df.head(n=5)

Unnamed: 0,id,submission_date,source,brief_title,condition,full_description,summary,full_date,year
20,NCT00002734,"November 1, 1999",National Cancer Institute (NCI),"Radiolabeled Monoclonal Antibody, Paclitaxel, ...",Ovarian Cancer,OBJECTIVES: I. Determine the max...,Phase I trial to study the effectivenes...,1999-11-01,1999
47,NCT00011284,"February 15, 2001",National Institute of Environmental Health Sci...,Mechanisms of Inflammatory Liver Injury,Liver Diseases,Neutrophils will be isolated from norma...,White blood cells can cause liver damag...,2001-02-15,2001
16,NCT00023504,"September 7, 2001",National Institutes of Health Clinical Center ...,Antibody Production in Immune Disorders,Immunologic Disease,The purpose of this study is to evaluat...,This study will evaluate immune functio...,2001-09-07,2001
4,NCT00038025,"May 24, 2002",M.D. Anderson Cancer Center,A Study Of Deoxycoformycin(DCF)/Pentostatin In...,Peripheral T-cell Lymphoma,Deoxycoformycin(DCF)/Pentostatin is a T...,The purpose of this study is to determi...,2002-05-24,2002
28,NCT00253292,"November 11, 2005",Massachusetts General Hospital,Psychometric Study of Outcomes Instruments,Burns,Identifying young adult burned patients...,The purpose of this study is to develop...,2005-11-11,2005


In [18]:
# Re-check the schema: full_date and year should now be listed alongside the original columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 20 to 17
Data columns (total 9 columns):
id                  50 non-null object
submission_date     50 non-null object
source              50 non-null object
brief_title         50 non-null object
condition           50 non-null object
full_description    50 non-null object
summary             50 non-null object
full_date           50 non-null datetime64[ns]
year                50 non-null int64
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 3.9+ KB


In [17]:
# Bytes used per column. This is the default shallow measurement: for object
# columns it counts the references only, not the strings they point to.
df.memory_usage()

Index               400
id                  400
submission_date     400
source              400
brief_title         400
condition           400
full_description    400
summary             400
full_date           400
year                400
dtype: int64

## Optimize memory

In [None]:
# TODO: change data types to reduce memory usage

## Add new columns based on MESH terms

## Export dataframe as csv

In [None]:
# Export dataframe as csv file: make sure the output folder exists first.

path_to_csv = os.path.abspath('../data/csv/')

if os.path.isdir(path_to_csv):
    # Nothing to do on a re-run; report it instead of raising.
    print('{} already exists'.format(path_to_csv))
else:
    try:
        # makedirs also creates missing parent folders, which mkdir cannot do.
        os.makedirs(path_to_csv)
        print('{} created'.format(path_to_csv))
    except OSError as e:
        # Best effort: report the problem (e.g. missing permissions) and let
        # the notebook continue; the export cell will fail loudly if the
        # folder is really unavailable.
        print(e)




In [None]:
# Name of the output file; the leading slash lets it be appended directly to the folder path
csv_file = '/clean_data.csv'

# Write the cleaned frame (index included) to <csv folder>/clean_data.csv
csv_target = '{}{}'.format(path_to_csv, csv_file)
df.to_csv(csv_target)