# Clinical trials: Dataframe setup


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
base_color = "#3298D0"
plot_size = (14, 10)

In [3]:
# Path for json folder & file
path_to_json_file = os.path.abspath('../data/json/')

# json file
json_file = '/all_trials_json' #name json file  

## Import json file in a dataframe/sframe

In [4]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

In [5]:
# breaks with large json file
df = pd.read_json(file)

## Basic data cleaning

In [6]:
#Check data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
nct_id                            10000 non-null object
study_first_submitted             10000 non-null object
source                            10000 non-null object
brief_title                       10000 non-null object
condition                         10000 non-null object
detailed_description/textblock    10000 non-null object
brief_summary/textblock           10000 non-null object
dtypes: object(7)
memory usage: 547.0+ KB


In [7]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT00658203,"March 28, 2008",LivaNova,Clinical Evaluation on Advanced Resynchronization,Heart Failure,"\n The study is a prospective, multicentr...",\n The aim of the study is to compare cli...
1,NCT02842775,"July 11, 2016",National Taiwan University Hospital,Clinical Evaluation and Rehabilitation System ...,Cervical Myelopathy,\n Taiwan is quickly becoming a nation of...,\n Cervical myelopathy is common among th...
2,NCT03126968,"April 17, 2017",University of Maryland,Prophylactic Topical Epinephrine to Reduce Ble...,"Lung Transplant; Complications, Mechanical",\n The role of prophylactic topical epine...,\n Bleeding poses potential for significa...
3,NCT02411968,"April 3, 2015",Radboud University,Follow up After Cryoablation of Small Renal Ma...,"Carcinoma, Renal Cell",\n With the growing number of small renal...,\n Currently there is no consensus on pos...
4,NCT00775385,"October 17, 2008",Intergroupe Francophone de Cancerologie Thorac...,TAilored Post-Surgical Therapy in Early Stage ...,"Carcinoma, Non-Small-Cell Lung",,\n Our hypothesis is that patients receiv...


In [10]:
# DELETE URL COLUMN

# print(df.url.describe())
# # df = df.drop(columns = ['url'])

In [11]:
# remove \n values
df = df.replace(r'\n',' ', regex=True) 

In [12]:
df.head(5)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT00658203,"March 28, 2008",LivaNova,Clinical Evaluation on Advanced Resynchronization,Heart Failure,"The study is a prospective, multicentre...",The aim of the study is to compare clin...
1,NCT02842775,"July 11, 2016",National Taiwan University Hospital,Clinical Evaluation and Rehabilitation System ...,Cervical Myelopathy,Taiwan is quickly becoming a nation of ...,Cervical myelopathy is common among the...
2,NCT03126968,"April 17, 2017",University of Maryland,Prophylactic Topical Epinephrine to Reduce Ble...,"Lung Transplant; Complications, Mechanical",The role of prophylactic topical epinep...,Bleeding poses potential for significan...
3,NCT02411968,"April 3, 2015",Radboud University,Follow up After Cryoablation of Small Renal Ma...,"Carcinoma, Renal Cell",With the growing number of small renal ...,Currently there is no consensus on post...
4,NCT00775385,"October 17, 2008",Intergroupe Francophone de Cancerologie Thorac...,TAilored Post-Surgical Therapy in Early Stage ...,"Carcinoma, Non-Small-Cell Lung",,Our hypothesis is that patients receivi...


In [13]:
# Rename columns
df.columns = ['id', 'original_date','source', 'brief_title', 'condition', 'full_description', 'summary']

In [14]:
df.head()

Unnamed: 0,id,original_date,source,brief_title,condition,full_description,summary
0,NCT00658203,"March 28, 2008",LivaNova,Clinical Evaluation on Advanced Resynchronization,Heart Failure,"The study is a prospective, multicentre...",The aim of the study is to compare clin...
1,NCT02842775,"July 11, 2016",National Taiwan University Hospital,Clinical Evaluation and Rehabilitation System ...,Cervical Myelopathy,Taiwan is quickly becoming a nation of ...,Cervical myelopathy is common among the...
2,NCT03126968,"April 17, 2017",University of Maryland,Prophylactic Topical Epinephrine to Reduce Ble...,"Lung Transplant; Complications, Mechanical",The role of prophylactic topical epinep...,Bleeding poses potential for significan...
3,NCT02411968,"April 3, 2015",Radboud University,Follow up After Cryoablation of Small Renal Ma...,"Carcinoma, Renal Cell",With the growing number of small renal ...,Currently there is no consensus on post...
4,NCT00775385,"October 17, 2008",Intergroupe Francophone de Cancerologie Thorac...,TAilored Post-Surgical Therapy in Early Stage ...,"Carcinoma, Non-Small-Cell Lung",,Our hypothesis is that patients receivi...


## Add new date columns

In [15]:
# Create new column: study_first_submitted as dates
%time df['full_date'] = pd.to_datetime(df['original_date'])

CPU times: user 931 ms, sys: 3.97 ms, total: 935 ms
Wall time: 935 ms


In [16]:
# Create new column: dates as year
df['year'] = df['full_date'].dt.year

In [17]:
# Sort records by date
df = df.sort_values(by ='full_date')

In [18]:
df.head()

Unnamed: 0,id,original_date,source,brief_title,condition,full_description,summary,full_date,year
2529,NCT00000278,"September 20, 1999",Yale University,Disulfiram for Cocaine-Alcohol Abuse - 3,Alcohol-Related Disorders,,The purpose of this study is to evaluat...,1999-09-20,1999
1482,NCT00000194,"September 20, 1999",Yale University,Neurobiology of Opioid Dependence: 3 - 3,Opioid-Related Disorders,,The purpose of this study is to study t...,1999-09-20,1999
4120,NCT00000337,"September 20, 1999",National Institute on Drug Abuse (NIDA),Infusion Laboratory: Protocol 1 - Selegeline - 2,Cocaine-Related Disorders,To develop a medication for the treatme...,The purpose of this study is to determi...,1999-09-20,1999
3062,NCT00000292,"September 20, 1999",National Institute on Drug Abuse (NIDA),Acute Withdrawal From Smoked Cocaine - 9,Cocaine-Related Disorders,Although there are clearly identifiable...,The purpose of this study is to develop...,1999-09-20,1999
428,NCT00004444,"October 18, 1999",FDA Office of Orphan Products Development,Pilot Randomized Study of Paromomycin (Aminosi...,"Tuberculosis, Pulmonary",PROTOCOL OUTLINE: This is a randomized ...,OBJECTIVES: I. Compare the pharmacokine...,1999-10-18,1999


In [19]:
df.memory_usage()

Index               80000
id                  80000
original_date       80000
source              80000
brief_title         80000
condition           80000
full_description    80000
summary             80000
full_date           80000
year                80000
dtype: int64

## Export dataframe as csv

In [20]:
# Export dataframe as csv file

path_to_csv = os.path.abspath('../data/csv/')

try:
    os.mkdir(csv)
    print('{} created'.format(csv))
except IOError as e:
    print(e)
    pass




NameError: name 'csv' is not defined

In [21]:
csv_file = '/clean_data.csv'
df.to_csv(path_to_csv + csv_file)