# MVP workflow
- Data from clinicaltrials.gov

In [1]:
import json, os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Load json file with parsed information

In [2]:
# paths
path_to_json_file = os.path.abspath('../data/json/')

# json file
json_file = '/all_parsed_data_json' #name json file  
file = '{}{}.json'.format(path_to_json_file, json_file)

# load json file
df = pd.read_json(file)

In [3]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,condition_browse/mesh_term,intervention_browse/mesh_term,detailed_description/textblock,brief_summary/textblock,location/facility/address/city,location/facility/address/country,location/facility/address/zip
0,NCT00391586,"October 23, 2006",New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,"Carcinoma, Non-Small-Cell Lung",Carcinoma,Paclitaxel,\n To compare the activities (the progres...,\n This study was conducted to compare th...,Albuquerque,United States,87102
1,NCT03472664,"March 12, 2018",Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Alzheimer Disease,Alzheimer Disease,,\n This study will examine the effects of...,\n The Brain Energy for Amyloid Transform...,Winston-Salem,United States,27157
2,NCT02443948,"March 19, 2015",Fondazione del Piemonte per l'Oncologia,Circulating Cell-free Tumor DNA in the Plasma ...,Gastrointestinal Stromal Tumor (GIST),Gastrointestinal Stromal Tumors,,"\n Demetri and colleagues presented, at t...",\n This observational study is proposed t...,Candiolo,Italy,10060
3,NCT03757312,"November 27, 2018",Nationwide Children's Hospital,A Pilot Study of Optic Nerve Ultrasound Follow...,Congenital Heart Disease,Heart Diseases,,,\n The purpose of the proposed study is t...,Columbus,United States,43205
4,NCT02818309,"June 21, 2016",National Taiwan University Hospital,Lesogaberan in Chinese Patients With Refractor...,Gastroesophageal Reflux Disease,Gastroesophageal Reflux,Lesogaberan,\n To test the efficacy and safety of les...,\n Lesogaberan may be used in Chinese GER...,Taipei,Taiwan,100


## Preprocess and clean data

In [4]:
# change column type
def data_types(dataframe, cols = [], to_type = ''):
    for col in cols:
        dataframe[col] = df[col].astype(to_type)

In [5]:
columns_dates = ['study_first_submitted', 'study_first_posted', 'last_update_submitted', 'last_update_posted', 'verification_date']
data_types(df, columns_dates, 'datetime64')

KeyError: 'study_first_posted'

In [9]:
# Extract year 
df['year_submitted'] = df['study_first_submitted'].dt.year
df['year_last_updated'] = df['last_update_submitted'].dt.year

In [10]:
# Remove /n in all df
df = df.replace(r'\n',' ', regex=True)

In [38]:
# Create new column for all text
df['all_text'] = df['source'] + ' ' + df['brief_title'] + ' ' + df['condition'] + ' ' + df['condition_browse/mesh_term'] + ' '+ df['intervention_browse/mesh_term'] + ' '+ df['detailed_description/textblock'] + ' ' + df['brief_summary/textblock']

# All_text in lowercase
df['all_text'] = df['all_text'].str.lower()

# remove extra whitespace
df.all_text = df.all_text.replace('\s+', ' ', regex=True)


In [39]:
df['all_text'][0]

'new mexico cancer care alliance erlotinib and standard platinum-based chemotherapy for newly diagnosed, advanced non-small cell carcinoma of the lung carcinoma, non-small-cell lung carcinoma paclitaxel to compare the activities (the progression-free survival, the incidence and severity of toxicities, and reversibility of toxicities) of erlotinib to that of platinum-based therapy in nsclc. a sequential therapy design has been chosen such that all patients will receive any potential benefits of both platinum-based and erlotinib therapy, without compromising survival by denying anyone potential therapy. with this design, progression-free survival will be tracked by treatment received. however, data will be generated which will show the safety and efficacy of erlotinib in the frontline setting (alone and with historical comparison to platinum-based therapy), as well as the potential safety and activity of platinum-based therapy in the "second-line" (post-erlotinib) setting. this should al

In [51]:
# Remove extra white space in summary
df['brief_summary/textblock'] = df['brief_summary/textblock'].replace('\s+', ' ', regex=True)
df['brief_summary/textblock'][0]


' This study was conducted to compare the activities of erlotinib to that of intravenous, platinum-based therapy in the treatment of non-small cell lung cancer (NSCLC). The goal of this trial was to demonstrate clinical equivalence of erlotinib to platinum-based frontline therapy, compared to historical controls. '