# Clinical trials: Dataframe setup


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Shared plotting defaults: base colour (hex) and figure size.
base_color, plot_size = "#3298D0", (14, 10)

In [3]:
# Name of the JSON file, without extension; the leading '/' is kept because
# the name is appended directly to the folder path below.
json_file = '/all_trials_json'

# Absolute path of the folder that holds the JSON file
path_to_json_file = os.path.abspath('../data/json/')

## Import the JSON file into a pandas DataFrame

In [4]:
# Full path of the JSON file to load: <folder><name>.json
file = path_to_json_file + json_file + '.json'

In [5]:
# Load the whole JSON file into a DataFrame in one go.
# NOTE: this breaks with a large JSON file.
df = pd.read_json(path_or_buf=file)

## Basic data cleaning

In [6]:
# Check data: print the column names, dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
nct_id                               1000 non-null object
study_first_submitted                1000 non-null object
source                               1000 non-null object
brief_title                          1000 non-null object
study_type                           1000 non-null object
condition                            1000 non-null object
condition_browse/mesh_term           1000 non-null object
intervention_browse/mesh_term        1000 non-null object
detailed_description/textblock       1000 non-null object
brief_summary/textblock              1000 non-null object
location/facility/address/city       1000 non-null object
location/facility/address/country    1000 non-null object
location/facility/address/zip        1000 non-null object
dtypes: object(13)
memory usage: 101.6+ KB


In [7]:
# Preview the first five records
df.head(n=5)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,study_type,condition,condition_browse/mesh_term,intervention_browse/mesh_term,detailed_description/textblock,brief_summary/textblock,location/facility/address/city,location/facility/address/country,location/facility/address/zip
0,NCT02802423,"June 13, 2016","BioLite, Inc.","A Phase I/II, Open Label Study to Evaluate the...",Interventional,Triple Negative Breast Cancer,Breast Neoplasms,Docetaxel,,\n The primary purpose of this study is t...,,,
1,NCT02278575,"October 28, 2014",Vastra Gotaland Region,Atenativ Effect on Uterine Blood Flow and Pree...,Interventional,Antithrombin III Deficiency,Pre-Eclampsia,Antithrombins,\n The study will be an open controlled p...,\n The study will be an open controlled p...,Gothenburg,Sweden,41685
2,NCT00921440,"June 15, 2009",University of Cologne,Computed Tomography Coronary Angiography (CTCA...,Observational,Coronary Artery Disease,Coronary Artery Disease,,\n A total of 50 patients prospectively u...,\n The investigators' rationale was to ev...,,,
3,NCT03706820,"September 29, 2018",Aristotle University Of Thessaloniki,Exercise Hemodynamics in Patients With Pulmona...,Observational,Exercise Pulmonary Hypertension,"Hypertension, Pulmonary",,\n Patients with fibrotic pulmonary disea...,\n The study evaluates the rest and exerc...,Thessaloniki,Greece,57010
4,NCT00913380,"May 15, 2009",Seoul National University Bundang Hospital,Diagnosis of Acute Appendicitis: Low-dose Comp...,Interventional,Appendicitis,Appendicitis,,\n Acute appendicitis is a very common di...,\n The purpose of this study is to determ...,Seongnam,"Korea, Republic of",463-707


In [8]:
# Replace every newline character inside string values with a single space
df = df.replace(to_replace=r'\n', value=' ', regex=True)

In [9]:
# Preview the first five records after the newline clean-up
df.head(n=5)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,study_type,condition,condition_browse/mesh_term,intervention_browse/mesh_term,detailed_description/textblock,brief_summary/textblock,location/facility/address/city,location/facility/address/country,location/facility/address/zip
0,NCT02802423,"June 13, 2016","BioLite, Inc.","A Phase I/II, Open Label Study to Evaluate the...",Interventional,Triple Negative Breast Cancer,Breast Neoplasms,Docetaxel,,The primary purpose of this study is to...,,,
1,NCT02278575,"October 28, 2014",Vastra Gotaland Region,Atenativ Effect on Uterine Blood Flow and Pree...,Interventional,Antithrombin III Deficiency,Pre-Eclampsia,Antithrombins,The study will be an open controlled pi...,The study will be an open controlled pi...,Gothenburg,Sweden,41685
2,NCT00921440,"June 15, 2009",University of Cologne,Computed Tomography Coronary Angiography (CTCA...,Observational,Coronary Artery Disease,Coronary Artery Disease,,A total of 50 patients prospectively un...,The investigators' rationale was to eva...,,,
3,NCT03706820,"September 29, 2018",Aristotle University Of Thessaloniki,Exercise Hemodynamics in Patients With Pulmona...,Observational,Exercise Pulmonary Hypertension,"Hypertension, Pulmonary",,Patients with fibrotic pulmonary diseas...,The study evaluates the rest and exerci...,Thessaloniki,Greece,57010
4,NCT00913380,"May 15, 2009",Seoul National University Bundang Hospital,Diagnosis of Acute Appendicitis: Low-dose Comp...,Interventional,Appendicitis,Appendicitis,,Acute appendicitis is a very common dis...,The purpose of this study is to determi...,Seongnam,"Korea, Republic of",463-707


In [10]:
# Rename columns by name rather than by position: the result no longer depends
# on the order in which the loader returns the columns, and re-running this
# cell is a harmless no-op. Columns not listed here keep their current name.
column_names = {
    'nct_id': 'id',
    'study_first_submitted': 'submission_date',
    'condition_browse/mesh_term': 'mesh_term_condition',
    'intervention_browse/mesh_term': 'mesh_term_intervention',
    'detailed_description/textblock': 'full_description',
    'brief_summary/textblock': 'summary',
    'location/facility/address/city': 'city',
    'location/facility/address/country': 'country',
    'location/facility/address/zip': 'zip',
}
df = df.rename(columns=column_names)

In [11]:
# Preview the first five records with the new column names
df.head(n=5)

Unnamed: 0,id,submission_date,source,brief_title,study_type,condition,mesh_term_condition,mesh_term_intervention,full_description,summary,city,country,zip
0,NCT02802423,"June 13, 2016","BioLite, Inc.","A Phase I/II, Open Label Study to Evaluate the...",Interventional,Triple Negative Breast Cancer,Breast Neoplasms,Docetaxel,,The primary purpose of this study is to...,,,
1,NCT02278575,"October 28, 2014",Vastra Gotaland Region,Atenativ Effect on Uterine Blood Flow and Pree...,Interventional,Antithrombin III Deficiency,Pre-Eclampsia,Antithrombins,The study will be an open controlled pi...,The study will be an open controlled pi...,Gothenburg,Sweden,41685
2,NCT00921440,"June 15, 2009",University of Cologne,Computed Tomography Coronary Angiography (CTCA...,Observational,Coronary Artery Disease,Coronary Artery Disease,,A total of 50 patients prospectively un...,The investigators' rationale was to eva...,,,
3,NCT03706820,"September 29, 2018",Aristotle University Of Thessaloniki,Exercise Hemodynamics in Patients With Pulmona...,Observational,Exercise Pulmonary Hypertension,"Hypertension, Pulmonary",,Patients with fibrotic pulmonary diseas...,The study evaluates the rest and exerci...,Thessaloniki,Greece,57010
4,NCT00913380,"May 15, 2009",Seoul National University Bundang Hospital,Diagnosis of Acute Appendicitis: Low-dose Comp...,Interventional,Appendicitis,Appendicitis,,Acute appendicitis is a very common dis...,The purpose of this study is to determi...,Seongnam,"Korea, Republic of",463-707


## Add new date columns

In [12]:
# Create new column: parse the submission_date strings (e.g. "June 13, 2016") into datetimes.
# %time is an IPython line magic that reports how long the statement took to run.
%time df['full_date'] = pd.to_datetime(df['submission_date'])

CPU times: user 123 ms, sys: 4.07 ms, total: 127 ms
Wall time: 124 ms


In [13]:
# Create new column: calendar year extracted from full_date
df = df.assign(year=lambda frame: frame['full_date'].dt.year)

In [14]:
# Delete the raw submission date column; full_date now carries the parsed value
df = df.drop(columns='submission_date')

In [15]:
# Sort records by submission date, oldest first (ascending is the default)
df = df.sort_values('full_date')

In [16]:
# Preview the five earliest records
df.head(n=5)

Unnamed: 0,id,source,brief_title,study_type,condition,mesh_term_condition,mesh_term_intervention,full_description,summary,city,country,zip,full_date,year
499,NCT00000271,New York State Psychiatric Institute,New Approaches to Cocaine Abuse Medications (A...,Interventional,Cocaine-Related Disorders,Disease,Cocaine,"This is a randomized, 12-week, double-b...",The purpose of this study is to measure...,New York,United States,10032,1999-09-20,1999
957,NCT00004475,University of Pittsburgh,Genetic Linkage Study for Hereditary Pancreatitis,Observational,Pancreatitis,Pancreatitis,,Hereditary Pancreatitis (HP) is an infl...,The purpose of this study is to establi...,Pittsburgh,United States,15213-2582,1999-10-18,1999
904,NCT00003133,Memorial Sloan Kettering Cancer Center,Combination Chemotherapy Following Surgery in ...,Interventional,Bladder Cancer,Urinary Bladder Neoplasms,Paclitaxel,OBJECTIVES: I. Determine the safety and...,RATIONALE: Drugs used in chemotherapy u...,New York,United States,10021,1999-11-01,1999
302,NCT00002863,University of Southern California,Cryosurgery in Treating Patients With Soft Tis...,Interventional,Sarcoma,Sarcoma,,OBJECTIVES: I. Assess the safety and mo...,RATIONALE: Cryosurgery kills cancer cel...,Los Angeles,United States,90033-0800,1999-11-01,1999
513,NCT00003585,University of New Mexico,Biological Therapy Plus Chemotherapy in Treati...,Interventional,Kidney Cancer,Kidney Neoplasms,Aldesleukin,OBJECTIVES: I. Evaluate the toxic effec...,RATIONALE: Biological therapies use dif...,Albuquerque,United States,87131,1999-11-01,1999


In [17]:
# Re-inspect the schema after adding full_date / year and dropping submission_date
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 499 to 218
Data columns (total 14 columns):
id                        1000 non-null object
source                    1000 non-null object
brief_title               1000 non-null object
study_type                1000 non-null object
condition                 1000 non-null object
mesh_term_condition       1000 non-null object
mesh_term_intervention    1000 non-null object
full_description          1000 non-null object
summary                   1000 non-null object
city                      1000 non-null object
country                   1000 non-null object
zip                       1000 non-null object
full_date                 1000 non-null datetime64[ns]
year                      1000 non-null int64
dtypes: datetime64[ns](1), int64(1), object(12)
memory usage: 117.2+ KB


## Subset of data

In [18]:
# Keep only the trials submitted in 2008 or later, then re-check the schema
df = df.loc[df['year'] >= 2008]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 826 entries, 327 to 218
Data columns (total 14 columns):
id                        826 non-null object
source                    826 non-null object
brief_title               826 non-null object
study_type                826 non-null object
condition                 826 non-null object
mesh_term_condition       826 non-null object
mesh_term_intervention    826 non-null object
full_description          826 non-null object
summary                   826 non-null object
city                      826 non-null object
country                   826 non-null object
zip                       826 non-null object
full_date                 826 non-null datetime64[ns]
year                      826 non-null int64
dtypes: datetime64[ns](1), int64(1), object(12)
memory usage: 96.8+ KB


In [20]:
# Number of distinct MeSH condition terms in the subset
df['mesh_term_condition'].nunique()

278

In [21]:
# Number of distinct free-text conditions in the subset
df['condition'].nunique()

622

In [22]:
# Number of distinct MeSH intervention terms in the subset
df['mesh_term_intervention'].nunique()

216

## Export dataframe as csv

In [None]:
# Make sure the output folder for the CSV export exists.

path_to_csv = os.path.abspath('../data/csv/')

try:
    # makedirs also creates a missing parent folder, and exist_ok=True makes
    # re-running this cell a no-op instead of an error.
    os.makedirs(path_to_csv, exist_ok=True)
    # Report the folder path (the original formatted an undefined name, `csv`,
    # which raised NameError whenever the folder was actually created).
    print('{} created'.format(path_to_csv))
except OSError as e:
    # Best effort: report the problem (e.g. permissions) and carry on.
    print(e)




In [None]:
# File name of the export; the leading '/' is kept because the name is
# appended directly to the folder path.
csv_file = '/clean_data.csv'
df.to_csv('{}{}'.format(path_to_csv, csv_file))