# MVP workflow
- Data from ClinicalTrials.gov

In [1]:
import json, os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Load JSON file with parsed information

In [2]:
# Directory that holds the parsed JSON export, resolved to an absolute path
path_to_json_file = os.path.abspath('../data/json/')

# File stem of the export; the leading slash serves as the separator between
# the directory above and the file name when the two are glued together
json_file = '/all_parsed_data_json' #name json file  
file = path_to_json_file + json_file + '.json'

# Read the parsed records into a DataFrame
df = pd.read_json(file)

In [3]:
# Preview the first rows to sanity-check the columns that were loaded
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,overall_status,verification_date,study_type,study_first_posted,last_update_submitted,last_update_posted,...,detailed_description/textblock,brief_summary/textblock,location/facility/address/city,location/facility/address/country,location/facility/address/zip,sponsors/lead_sponsor/agency,sponsors/lead_sponsor/agency_class,study_design_info/allocation,study_design_info/intervention_model,study_design_info/primary_purpose
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,Terminated,August 2015,Interventional,"October 24, 2006","August 14, 2015","August 17, 2015",...,\n To compare the activities (the progres...,\n This study was conducted to compare th...,Albuquerque,United States,87102.0,New Mexico Cancer Care Alliance,Other,,Single Group Assignment,Treatment
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Recruiting,July 2018,Interventional,"March 21, 2018","July 30, 2018","August 1, 2018",...,\n This study will examine the effects of...,\n The Brain Energy for Amyloid Transform...,Winston-Salem,United States,27157.0,Wake Forest University Health Sciences,Other,Randomized,Parallel Assignment,Treatment
10,NCT01009658,"November 6, 2009",Gunma University,MSG and Gastrointestinal Motility,Completed,March 2015,Interventional,"November 9, 2009","March 25, 2015","March 26, 2015",...,\n Amino acids such as monosodium glutama...,\n The purpose of this study is to clarif...,Maebashi,Japan,3718511.0,Gunma University,Other,Randomized,Crossover Assignment,Basic Science
100,NCT03184311,"June 2, 2017","Cantonal Hosptal, Baselland",High-intensity Interval Training in Heart Fail...,Not yet recruiting,June 2017,Interventional,"June 12, 2017","June 9, 2017","June 12, 2017",...,\n Heart failure (HF) with preserved ejec...,\n This study investigates the effects of...,,,,"Cantonal Hosptal, Baselland",Other,Randomized,Parallel Assignment,Treatment
1000,NCT00093301,"October 5, 2004",Wentworth Area Health Services,Levosimendan Versus Dobutamine in Shock Patients,Unknown status,December 2011,Interventional,"October 7, 2004","June 23, 2005","June 24, 2005",...,,\n The purpose of the study is to compare...,Penrith,Australia,2750.0,Wentworth Area Health Services,Other,Randomized,Parallel Assignment,Treatment


## Preprocess and clean data

In [6]:
# change column type
def data_types(dataframe, cols=(), to_type=''):
    """Cast the given columns of ``dataframe`` to ``to_type`` in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    cols : iterable of str
        Names of the columns to convert. An empty iterable is a no-op.
    to_type : str or dtype
        Target dtype, passed straight to ``Series.astype``.

    Returns
    -------
    None
        The result is written back into ``dataframe``.
    """
    for col in cols:
        # Read from the frame that was passed in, not from a global `df`,
        # so the helper works on any DataFrame.
        dataframe[col] = dataframe[col].astype(to_type)

In [7]:
# Columns holding dates as text (e.g. "October 23, 2006" or "August 2015")
columns_dates = [
    'study_first_submitted',
    'study_first_posted',
    'last_update_submitted',
    'last_update_posted',
    'verification_date',
]
# Spell out the unit: pandas >= 2.0 refuses the unit-less 'datetime64' in
# astype(), while 'datetime64[ns]' is accepted by old and new versions alike.
data_types(df, columns_dates, 'datetime64[ns]')

In [9]:
# Derive calendar-year columns from the submission dates.
# Uses the .dt accessor, so the source columns must already be datetimes.
year_sources = {
    'year_submitted': 'study_first_submitted',
    'year_last_updated': 'last_update_submitted',
}
for year_col, date_col in year_sources.items():
    df[year_col] = df[date_col].dt.year

In [10]:
# Replace every newline (\n) found in string cells with a single space
df = df.replace(to_replace=r'\n', value=' ', regex=True)

In [38]:
# Build a single free-text column from every descriptive field.
text_columns = [
    'source',
    'brief_title',
    'condition',
    'condition_browse/mesh_term',
    'intervention_browse/mesh_term',
    'detailed_description/textblock',
    'brief_summary/textblock',
]

# Plain `+` concatenation turns the whole result into NaN as soon as one field
# is missing (e.g. a study without a detailed description), which silently
# drops the title and summary too. str.cat with na_rep='' treats a missing
# field as empty text and keeps the rest.
df['all_text'] = (
    df[text_columns[0]]
    .str.cat(df[text_columns[1:]], sep=' ', na_rep='')
    # All_text in lowercase
    .str.lower()
    # Collapse runs of whitespace (including the gaps left by empty fields);
    # raw string so that \s is not read as an invalid escape sequence
    .replace(r'\s+', ' ', regex=True)
)


In [39]:
# Spot-check the combined text for the entry at index 0
df['all_text'][0]

'new mexico cancer care alliance erlotinib and standard platinum-based chemotherapy for newly diagnosed, advanced non-small cell carcinoma of the lung carcinoma, non-small-cell lung carcinoma paclitaxel to compare the activities (the progression-free survival, the incidence and severity of toxicities, and reversibility of toxicities) of erlotinib to that of platinum-based therapy in nsclc. a sequential therapy design has been chosen such that all patients will receive any potential benefits of both platinum-based and erlotinib therapy, without compromising survival by denying anyone potential therapy. with this design, progression-free survival will be tracked by treatment received. however, data will be generated which will show the safety and efficacy of erlotinib in the frontline setting (alone and with historical comparison to platinum-based therapy), as well as the potential safety and activity of platinum-based therapy in the "second-line" (post-erlotinib) setting. this should al

In [51]:
# Remove extra white space in summary.
# The pattern is a raw string: '\s' in a normal literal is an invalid escape
# sequence that newer Python versions warn about and will eventually reject.
df['brief_summary/textblock'] = df['brief_summary/textblock'].replace(r'\s+', ' ', regex=True)

# Show the cleaned summary at index 0 as a quick check
df['brief_summary/textblock'][0]


' This study was conducted to compare the activities of erlotinib to that of intravenous, platinum-based therapy in the treatment of non-small cell lung cancer (NSCLC). The goal of this trial was to demonstrate clinical equivalence of erlotinib to platinum-based frontline therapy, compared to historical controls. '