# Clinical trials: ML Pipeline


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np

In [2]:
base_color = "#3298D0"
plot_size = (14, 10)

# sys.stdout = open('/dev/stdout', 'w')

In [3]:
# Path for csv folder & file
path_to_csv_file = os.path.abspath('../data/csv/')

# csv file name without its extension; the leading slash acts as the
# separator when this value is formatted right after the folder path
csv_file = '/clean_data'

## Import csv file

In [4]:
# Build the full path of the csv file: <folder path><csv_file>.csv
clean_csv_file = '{}{}.csv'.format(path_to_csv_file, csv_file)

In [5]:
# Use dask to improve data loading
# https://www.kaggle.com/shikhar1/yet-another-pandas-tutorial
# (original note: breaks with large json file)

# Read the cleaned csv into a DataFrame
df = pd.read_csv(filepath_or_buffer=clean_csv_file)

In [6]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,id,source,brief_title,condition,mesh_term_condition,mesh_term_intervention,full_description,summary,city,country,zip,full_date,year
184577,184931,NCT02352441,Allina Health System,Cognitive Rehabilitation:ACTION Training for S...,Traumatic Brain Injury,Brain Injuries,,The primary aims of this feasibility st...,Many Service members (SM) experience ex...,Fort Campbell North,United States,42223,2015-01-23,2015
276266,228016,NCT03541304,Sun Yat-sen University,High Dose Radiotherapy for the Treatment of Re...,Rectal Cancer,Rectal Neoplasms,,,While surgery remains the standard trea...,Guangzhou,China,510060,2018-05-18,2018
228075,15412,NCT02914730,Joslin Diabetes Center,Insulin Dosing Practices in Persons With Diabe...,Diabetes,,,,In this cross-sectional study the inves...,Boston,United States,02215,2016-09-22,2016
2889,16871,NCT00001851,National Institutes of Health Clinical Center ...,Bone Marrow Injection to Replace Diseased Bone...,Polyostotic Fibrous Dysplasia,"Fibrous Dysplasia, Polyostotic",,Polyostotic fibrous dysplasia (PFD) is ...,This study will evaluate the effectiven...,Bethesda,United States,20892,1999-11-03,1999
54459,87020,NCT00672113,Bayer,Effects of Adalat LA and Coracten on Drug Leve...,Hypertension,Hypertension,Nifedipine,,This study compares the effect of Adala...,Cambridge,United Kingdom,CB2 2XY,2008-04-16,2008
32705,263311,NCT00376272,Gruppo di Ricerca GISSI,GISSI-AF - Use of Valsartan an Angiotensin II ...,Atrial Fibrillation,Atrial Fibrillation,Valsartan,The protocol is sponsored by an indepen...,Study purpose The purpose of the study ...,Cortona,Italy,52040,2006-09-12,2006
238773,234723,NCT03054662,Cliniques universitaires Saint-Luc- Université...,Non Substitutive Strategies to Improve Haemoph...,Haemophilia,Hemophilia A,Hemostatics,1. Establish a baseline of the conditio...,This study aims to make an inventory of...,Abidjan,Côte D'Ivoire,,2017-02-13,2017
162178,223876,NCT02077595,Chang Gung University,Effects of Transcutaneous Electrical Nerve Sti...,Transcutaneous Electrical Nerve Stimulation,,,,"Recently, non-invasive brain stimulatio...",Taoyuan,Taiwan,333,2014-02-11,2014
148279,154214,NCT01888432,Novartis,Efficacy and Safety of Everolimus in Liver Tra...,Liver Transplantation,,Tacrolimus,"This study was 24 month, multicenter st...",The purpose of this trial wa to demonst...,Los Angeles,United States,90033,2013-06-15,2013
67518,58123,NCT00850096,CPEX Pharmaceuticals Inc.,Effects of Nasulin Versus Placebo on Blood Glu...,Type 2 Diabetes,"Diabetes Mellitus, Type 2","Insulin, Globin Zinc",This proof of concept trial has a rando...,The purpose of this study is: ...,Chandler,United States,85225,2009-01-30,2009


In [7]:
df.shape

(292311, 14)

In [8]:
df.columns

Index(['Unnamed: 0', 'id', 'source', 'brief_title', 'condition',
       'mesh_term_condition', 'mesh_term_intervention', 'full_description',
       'summary', 'city', 'country', 'zip', 'full_date', 'year'],
      dtype='object')

In [9]:
# remove Unnamed column
# errors='ignore' keeps this cell idempotent: running it again after the
# column is already gone is a no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

## Optimize memory usage

In [10]:
print(df.memory_usage(deep=True))
df.info(verbose=True)

Index                            80
id                         19877148
source                     25074554
brief_title                42416130
condition                  22395698
mesh_term_condition        20206540
mesh_term_intervention     18946653
full_description          476481119
summary                   220097905
city                       19067818
country                    19413681
zip                        18157364
full_date                  19584837
year                        2338488
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292311 entries, 0 to 292310
Data columns (total 13 columns):
id                        292311 non-null object
source                    292311 non-null object
brief_title               292311 non-null object
condition                 292311 non-null object
mesh_term_condition       292311 non-null object
mesh_term_intervention    292311 non-null object
full_description          292311 non-null object
summary                   292

In [11]:
# Change data types: the identifier becomes str, the repeated-label columns
# become category
category_columns = ['source', 'condition', 'city', 'country']
target_dtypes = {'id': 'str'}
target_dtypes.update({name: 'category' for name in category_columns})
df = df.astype(target_dtypes)

In [12]:
# Parse the date column into pandas datetime values
df['full_date'] =  pd.to_datetime(df['full_date'])

In [13]:
# Handle zip codes and MeSH terms as text
# NOTE(review): astype('str') turns a missing value into the literal text
# 'nan' - confirm the cleaned csv has no missing values in these columns.
text_columns = ['zip', 'mesh_term_condition', 'mesh_term_intervention']
df = df.astype({name: 'str' for name in text_columns})

In [14]:
print(df.memory_usage(deep=True))
df.info(verbose=True)

Index                            80
id                         19877148
source                      2874447
brief_title                42416130
condition                   6676111
mesh_term_condition        20206540
mesh_term_intervention     18946653
full_description          476481119
summary                   220097905
city                        1723713
country                      602746
zip                        18157364
full_date                   2338488
year                        2338488
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292311 entries, 0 to 292310
Data columns (total 13 columns):
id                        292311 non-null object
source                    292311 non-null category
brief_title               292311 non-null object
condition                 292311 non-null category
mesh_term_condition       292311 non-null object
mesh_term_intervention    292311 non-null object
full_description          292311 non-null object
summary                  

## Delete records before 2008

In [15]:
# keep records since 2008
# .copy() detaches the filtered frame from the one it was sliced from, so
# adding columns to it later (df['text'] = ...) does not raise
# SettingWithCopyWarning.
df = df[df['year'] > 2007].copy()

In [16]:
df.head()

Unnamed: 0,id,source,brief_title,condition,mesh_term_condition,mesh_term_intervention,full_description,summary,city,country,zip,full_date,year
49212,NCT00603590,Tehran University of Medical Sciences,Phase II Study of Heart Polypill Safety and Ef...,Cardiovascular Disease,Cardiovascular Diseases,Hydrochlorothiazide,,Cardiovascular is a major cause of mort...,Kalaleh,"Iran, Islamic Republic of",,2008-01-01,2008
49213,NCT00589602,The Cleveland Clinic,"T-Cell Depletion, Donor Hematopoietic Stem Cel...",Chronic Myeloproliferative Disorders,Multiple Myeloma,,OBJECTIVES: Primary - ...,RATIONALE: Giving chemotherapy and tota...,Cleveland,United States,44195,2008-01-01,2008
49214,NCT00592553,PTC Therapeutics,Phase 2b Study of PTC124 in Duchenne/Becker Mu...,Duchenne Muscular Dystrophy,Muscular Dystrophies,,"This study is a Phase 2b, multicenter, ...",Duchenne/Becker muscular dystrophy (DMD...,Sacramento,United States,45229-3039,2008-01-01,2008
49215,NCT00592332,Vanderbilt University,Hypoglycemia Associated Autonomic Failure in T...,Type 1 Diabetes,"Diabetes Mellitus, Type 1",Alprazolam,Due to the fundamental importance of gl...,Alprazolam (Xanax) will blunt the body'...,Nashville,United States,37232,2008-01-01,2008
49216,NCT00585312,Pfizer,Trial In Pediatric Patients With Familial Aden...,Adenomatous Polyposis Coli,Colorectal Neoplasms,Celecoxib,"Per DMC recommendation, the study was t...",To test whether celecoxib can be used t...,Chicago,United States,60612,2008-01-01,2008


In [17]:
print(df.shape)
print(df.nunique())
print('\n{}'.format(df.year.unique()))

(243099, 13)
id                        243099
source                     17558
brief_title               241687
condition                  45295
mesh_term_condition         2267
mesh_term_intervention      2246
full_description          153312
summary                   241037
city                       10628
country                      191
zip                        20845
full_date                   3982
year                          11
dtype: int64

[2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018]


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243099 entries, 49212 to 292310
Data columns (total 13 columns):
id                        243099 non-null object
source                    243099 non-null category
brief_title               243099 non-null object
condition                 243099 non-null category
mesh_term_condition       243099 non-null object
mesh_term_intervention    243099 non-null object
full_description          243099 non-null object
summary                   243099 non-null object
city                      243099 non-null category
country                   243099 non-null category
zip                       243099 non-null object
full_date                 243099 non-null datetime64[ns]
year                      243099 non-null int64
dtypes: category(4), datetime64[ns](1), int64(1), object(7)
memory usage: 23.7+ MB


## Save conditions, mesh terms and sources

In [22]:
def save_list_of_terms(column, final_list, data=None):
    """Fill ``final_list`` with the sorted, unique values of one column.

    The values of ``data[column]`` are added to whatever ``final_list``
    already holds; the list is then de-duplicated and sorted in place, and
    the resulting number of terms is printed.

    Parameters
    ----------
    column : str
        Name of the column to read.
    final_list : list
        List owned by the caller. It is modified in place.
    data : mapping of column name to iterable, optional
        Table to read from. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The result is delivered through ``final_list``.
    """
    if data is None:
        data = df
    final_list.extend(data[column])
    # Keep only unique values. Slice assignment updates the caller's list;
    # a plain `final_list = ...` would only rebind the local name and leave
    # the caller with every duplicate.
    final_list[:] = sorted(set(final_list))
    print('Number of {}: {}'.format(column, len(final_list)))

In [23]:
all_conditions = []
save_list_of_terms('condition', all_conditions)

Number of condition: 45295


In [24]:
all_mesh_conditions = []
save_list_of_terms('mesh_term_condition', all_mesh_conditions)

Number of mesh_term_condition: 2267


In [26]:
all_mesh_interventions = []
save_list_of_terms('mesh_term_intervention', all_mesh_interventions)

Number of mesh_term_intervention: 2246


In [27]:
all_sources = []
save_list_of_terms('source', all_sources)

Number of source: 17558


## Reference: List of conditions
- ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10CM/2019/

In [None]:
import xml.etree.ElementTree as ET

In [None]:
list_conditions = '../data/icd10/'

# os.listdir returns the entries in arbitrary order. Sorting makes
# conditions[0] - the file that gets parsed next - the same on every run.
conditions = sorted(os.listdir(list_conditions))

print(conditions)

In [None]:
# Parse the first listed file and keep the root element of its tree
first_file = conditions[0]
icd = os.path.join(list_conditions, first_file)  # full path
tree = ET.parse(icd).getroot()
print('Parsed: {}\n'.format(first_file))

In [None]:
# Collect the text of every <desc> element of the parsed tree in a list
all_conditions = [element.text for element in tree.iter('desc')]

len(all_conditions)

In [None]:
# Remove long names
def count_words_list(words, size):
    """Return the entries of ``words`` made of at most ``size`` words.

    A "word" is a whitespace-separated token (``str.split()``). The input
    order is kept. A new list is built on every call, so calling the
    function twice no longer accumulates entries in a shared global list.

    Parameters
    ----------
    words : iterable of str
        Candidate names.
    size : int
        Maximum number of words a name may have to be kept.

    Returns
    -------
    list of str
    """
    return [w for w in words if len(w.split()) <= size]


result = count_words_list(all_conditions, 3)
len(result)

In [None]:
# clean conditions dataset
all_conditions = [w.replace('unspecified', '') for w in all_conditions]
all_conditions = [w.replace(',', '') for w in all_conditions]

# The de-duplication step rebuilds all_conditions from `result`, so the
# cleaning above is discarded unless the filtered list is cleaned as well.
# split()/join collapses the spaces left behind by the removed word
# ('Cholera, unspecified' -> 'Cholera'); names that end up empty are dropped.
result = [' '.join(w.replace('unspecified', '').replace(',', '').split())
          for w in result]
result = [w for w in result if w]

In [None]:
# Eliminate duplicated records
# sorted() gives a stable order. The iteration order of a set of strings can
# change from one kernel session to the next, which would reorder the
# vocabulary columns built from this list.
all_conditions = sorted(set(result))
len(all_conditions)

## Classify docs using a list of conditions

In [None]:
# Create a column with all text
# The fields are joined with a space so the last word of one field does not
# fuse with the first word of the next one.
df['text'] = df['brief_title'] + ' ' + df['summary'] + ' ' + df['full_description']

In [None]:
# Clean extra blank spaces
# Raw string: '\s' in a plain literal is an invalid escape sequence and
# triggers a warning; the pattern itself is unchanged.
df['text'] = df['text'].replace(r'\s+', ' ', regex=True)

### Check for values in conditions list

In [None]:
# Create dataframe to store vocabulary
vocabulary_conditions = pd.DataFrame()

In [None]:
# PENDING: faster function
# https://stackoverflow.com/questions/44960614/whats-the-fastest-way-to-acces-a-pandas-dataframe
# One 0/1 column per condition: 1 when the term occurs in the document text.
# regex=False matches each term literally, so names containing characters
# such as '(' or '[' are neither read as patterns nor rejected as invalid
# ones (presumably the failure the former bare `except: pass` was hiding;
# that bare except also swallowed KeyboardInterrupt).
# na=False counts a missing text as "no match".
term_flags = {}
for term in all_conditions:
    term_flags[term] = (
        df['text']
        .str.contains(term, case=False, regex=False, na=False)
        .astype(int)
    )
    print(term)

# Build the frame once instead of inserting the columns one at a time.
vocabulary_conditions = pd.DataFrame(term_flags, index=df.index)
              

In [None]:
vocabulary_conditions.info()
#463 Mb file

In [None]:
# TODO: discard columns above a threshold (not implemented here; this cell
# only shows the summary statistics of every column)
vocabulary_conditions.describe()

## ML Pipeline

### DF transformations
- Check which ones can be added to the preprocessor
- Info: http://www.ultravioletanalytics.com/blog/tf-idf-basics-with-pandas-scikit-learn

In [None]:
# Create a column combining title + summary + full description
# The fields are joined with a space so the last word of one field does not
# fuse with the first word of the next one.
df['text'] = df['brief_title'] + ' ' + df['summary'] + ' ' + df['full_description']

In [None]:
# Clean extra blank spaces
# Raw string: '\s' in a plain literal is an invalid escape sequence and
# triggers a warning; the pattern itself is unchanged.
df['text'] = df['text'].replace(r'\s+', ' ', regex=True)

### CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re

In [None]:
def pat_numbers(x):
    """Lower-case a document and remove every run of digits from it."""
    return re.sub(r'(\d)+', '', x.lower())


# Vectorizer settings collected in one mapping, then unpacked
cv_settings = {
    'stop_words': 'english',
    'preprocessor': pat_numbers,
    'max_features': 3000,
    'lowercase': True,
    'max_df': 0.5,
    'ngram_range': (1, 3),
}
cv = CountVectorizer(**cv_settings)

In [None]:
# NOTE(review): the vectorizer is fitted on full_description only, not on
# the combined df['text'] column - confirm this is intended.
docs = df['full_description'].tolist()

# Learn the vocabulary and build the document-term count matrix
X = cv.fit_transform(docs)

In [None]:
len(cv.vocabulary_)

In [None]:
cv.vocabulary_

In [None]:
# Check for values in dictionary
# Membership is tested on the dict itself (hash lookup); copying all keys
# into a list first only adds a linear scan.
if 'breast cancer' in cv.vocabulary_:
    print('yes')
else:
    print('no')

In [None]:
# Densify only the five rows that are displayed: X.toarray() on the whole
# matrix allocates one cell per (document, term) pair just for this preview.
pd.DataFrame(X[:5].toarray(), columns=cv.get_feature_names())

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
transformer = TfidfTransformer()
tweights = transformer.fit_transform(X)
tweights

In [None]:
# Pending: Reduced dimensionality



In [None]:
# turn weights data into a dataframe
# Reuse df's row labels: row i of the matrix is document df.iloc[i], but df
# does not carry a 0..n-1 index after the year filter, so the default
# RangeIndex would pair rows with the wrong documents in an index-based
# merge with df.
tf = pd.DataFrame(tweights.toarray(), columns=cv.get_feature_names(), index=df.index)

In [None]:
# Top terms by average tf-idf weight
# Column-wise mean of the tf-idf matrix, flattened to a plain list
column_means = tweights.mean(axis=0)
weights = np.asarray(column_means).ravel().tolist()
weights_df = pd.DataFrame({'term': cv.get_feature_names(), 'weight': weights})
# Highest average weight first, five rows shown
ranked_terms = weights_df.sort_values(by='weight', ascending=False)
ranked_terms.head(5)

In [None]:
# Check word
def check_weight(word):
    """Print the mean tf-idf weight of every term in ``word``.

    ``word`` is an iterable of terms. Each term is looked up as a column of
    the notebook-level ``tf`` frame; a term with no such column is reported
    as ``None``.
    """
    for term in word:
        try:
            line = '{}: {}'.format(term, tf[term].mean())
        except KeyError:
            line = '{}: None'.format(term)
        print(line)


search_terms = ['cancer', 'breast cancer', 'migraine', 'sarcoma']
check_weight(search_terms)

In [None]:
# Merge df and tf-idf data by index
# Lift the column display limit so no column of the wide frame is hidden
pd.set_option('display.max_columns', None) 
# Show five random rows of the tf-idf frame
tf.sample(5)
# pd.merge(df, tf, left_index=True, right_index=True).sample(1)

### Topic Modeling
- https://nlpforhackers.io/topic-modeling/

### Test: Using Tensorflow 
- https://github.com/open-source-for-science/TensorFlow-Course?utm_campaign=explore-email&utm_medium=email&utm_source=newsletter&utm_term=weekly