# Clinical trials: Dataframe setup


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
base_color = "#3298D0"
plot_size = (14, 10)

In [3]:
# Path for json folder & file
path_to_json_file = os.path.abspath('../data/json/')

# json file
json_file = '/all_trials_json' #name json file  

## Import json file in a dataframe/sframe

In [4]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

In [5]:
# breaks with large json file
df = pd.read_json(file)

## Basic data cleaning

In [6]:
#Check data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
nct_id                            1000 non-null object
study_first_submitted             1000 non-null object
source                            1000 non-null object
brief_title                       1000 non-null object
condition                         1000 non-null object
detailed_description/textblock    1000 non-null object
brief_summary/textblock           1000 non-null object
dtypes: object(7)
memory usage: 54.8+ KB


In [7]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT01995227,"November 18, 2013","Immunovative Therapies, Ltd.",An Individualized Anti-Cancer Vaccine Study in...,Hepatocellular Carcinoma,\n All accrued subjects will undergo tumo...,\n The purpose of this study is to determ...
1,NCT03186248,"June 9, 2017","Asian Institute of Gastroenterology, India",Randomized Clinical Trial Comparing Short Vers...,Achalasia Cardia,\n The primary goal of treatment of achal...,\n Aim of this study is to compare the ou...
2,NCT01612962,"May 30, 2012",Georgetown University,Diagnostic Tests to Help Determine Osteomyelitis,Osteomyelitis,\n Osteomyelitis is present in approximat...,"\n In this study, the investigators will ..."
3,NCT01257763,"December 8, 2010",Northwestern University,Tolerability Study of the Application of a 3M ...,Healthy,,\n The purpose of this study is to evalua...
4,NCT02599922,"November 5, 2015",Applied Genetic Technologies Corp,Safety and Efficacy Trial of AAV Gene Therapy ...,Achromatopsia,"\n This will be a non-randomized, open-la...","\n This will be a non-randomized, open-la..."


In [8]:
# DELETE URL COLUMN

# print(df.url.describe())
# # df = df.drop(columns = ['url'])

In [9]:
# remove \n values
df = df.replace(r'\n',' ', regex=True) 

In [10]:
df.head(5)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT01995227,"November 18, 2013","Immunovative Therapies, Ltd.",An Individualized Anti-Cancer Vaccine Study in...,Hepatocellular Carcinoma,All accrued subjects will undergo tumor...,The purpose of this study is to determi...
1,NCT03186248,"June 9, 2017","Asian Institute of Gastroenterology, India",Randomized Clinical Trial Comparing Short Vers...,Achalasia Cardia,The primary goal of treatment of achala...,Aim of this study is to compare the out...
2,NCT01612962,"May 30, 2012",Georgetown University,Diagnostic Tests to Help Determine Osteomyelitis,Osteomyelitis,Osteomyelitis is present in approximate...,"In this study, the investigators will p..."
3,NCT01257763,"December 8, 2010",Northwestern University,Tolerability Study of the Application of a 3M ...,Healthy,,The purpose of this study is to evaluat...
4,NCT02599922,"November 5, 2015",Applied Genetic Technologies Corp,Safety and Efficacy Trial of AAV Gene Therapy ...,Achromatopsia,"This will be a non-randomized, open-lab...","This will be a non-randomized, open-lab..."


In [11]:
# Rename columns
df.columns = ['id', 'original_date','source', 'brief_title', 'condition', 'full_description', 'summary']

In [12]:
df.head()

Unnamed: 0,id,original_date,source,brief_title,condition,full_description,summary
0,NCT01995227,"November 18, 2013","Immunovative Therapies, Ltd.",An Individualized Anti-Cancer Vaccine Study in...,Hepatocellular Carcinoma,All accrued subjects will undergo tumor...,The purpose of this study is to determi...
1,NCT03186248,"June 9, 2017","Asian Institute of Gastroenterology, India",Randomized Clinical Trial Comparing Short Vers...,Achalasia Cardia,The primary goal of treatment of achala...,Aim of this study is to compare the out...
2,NCT01612962,"May 30, 2012",Georgetown University,Diagnostic Tests to Help Determine Osteomyelitis,Osteomyelitis,Osteomyelitis is present in approximate...,"In this study, the investigators will p..."
3,NCT01257763,"December 8, 2010",Northwestern University,Tolerability Study of the Application of a 3M ...,Healthy,,The purpose of this study is to evaluat...
4,NCT02599922,"November 5, 2015",Applied Genetic Technologies Corp,Safety and Efficacy Trial of AAV Gene Therapy ...,Achromatopsia,"This will be a non-randomized, open-lab...","This will be a non-randomized, open-lab..."


## Add new date columns

In [13]:
# Create new column: study_first_submitted as dates
%time df['full_date'] = pd.to_datetime(df['original_date'])

CPU times: user 111 ms, sys: 2.38 ms, total: 113 ms
Wall time: 115 ms


In [14]:
# Create new column: dates as year
df['year'] = df['full_date'].dt.year

In [15]:
# Sort records by date
df = df.sort_values(by ='full_date')

In [16]:
df.head()

Unnamed: 0,id,original_date,source,brief_title,condition,full_description,summary,full_date,year
461,NCT00004378,"October 18, 1999",National Center for Research Resources (NCRR),Stem Cell Transplantation (SCT) for Genetic Di...,Thrombocytopenia,PROTOCOL OUTLINE: Patients receive eith...,OBJECTIVES: I. Ascertain whether stem c...,1999-10-18,1999
828,NCT00003122,"November 1, 1999",National Cancer Institute (NCI),Surgery in Treating Patients With Neuroblastoma,Neuroblastoma,OBJECTIVES: - Evaluate the sa...,RATIONALE: Surgery may be an effective ...,1999-11-01,1999
557,NCT00003545,"November 1, 1999",National Cancer Institute (NCI),506U78 in Treating Patients With Refractory or...,Leukemia,OBJECTIVES: I. Determine the com...,Phase II trial to study the effectivene...,1999-11-01,1999
117,NCT00003270,"November 1, 1999",Roswell Park Cancer Institute,"Chemotherapy, Radiation Therapy, and Umbilical...",Graft Versus Host Disease,"OBJECTIVES: I. Determine the safety, ef...",RATIONALE: Drugs used in chemotherapy u...,1999-11-01,1999
72,NCT00002786,"November 1, 1999",Fred Hutchinson Cancer Research Center,Biological Therapy in Treating Patients With M...,Melanoma (Skin),OBJECTIVES: - Assess the safe...,RATIONALE: Biological therapies use dif...,1999-11-01,1999


In [17]:
df.memory_usage()

Index               8000
id                  8000
original_date       8000
source              8000
brief_title         8000
condition           8000
full_description    8000
summary             8000
full_date           8000
year                8000
dtype: int64

## Export dataframe as csv

In [18]:
# Export dataframe as csv file

path_to_csv = os.path.abspath('../data/csv/')

try:
    os.mkdir(path_to_csv)
    print('{} created'.format(csv))
except IOError as e:
    print(e)
    pass




NameError: name 'csv' is not defined

In [19]:
csv_file = '/clean_data.csv'
df.to_csv(path_to_csv + csv_file)