# Clinical trials: ML Pipeline


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np

In [None]:
# Shared plotting defaults: base colour (hex) and figure size.
# NOTE(review): no plotting cell is visible in this part of the notebook —
# presumably these are consumed elsewhere; confirm before removing.
base_color = "#3298D0"
plot_size = (14, 10)

# sys.stdout = open('/dev/stdout', 'w')

In [None]:
# Absolute path of the folder that holds the CSV data
path_to_csv_file = os.path.abspath('../data/csv/')

# Base name of the CSV file, without extension. The leading slash serves as
# the path separator when this is appended to the folder path.
csv_file = '/clean_data'

## Import csv file

In [None]:
# Build the full path of the CSV file: <folder><base name>.csv
clean_csv_file = '{}{}.csv'.format(path_to_csv_file, csv_file)

In [None]:
'''Use dask to improve data loading
https://www.kaggle.com/shikhar1/yet-another-pandas-tutorial'''

# Load the whole CSV into memory.
# NOTE(review): the original note said this "breaks with large json file";
# the input here is a CSV — confirm whether the size problem still applies.
df = pd.read_csv(clean_csv_file)

In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Inspect ten randomly sampled rows
df.sample(10)

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column labels of the loaded frame
df.columns

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index column saved with the
# CSV — confirm). errors='ignore' keeps this cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

## Optimize memory usage

In [None]:
# Per-column memory usage; deep=True also measures the contents of object columns
print(df.memory_usage(deep=True))
# Full per-column summary
df.info(verbose=True)

In [None]:
# Cast the identifier to string and the listed descriptive columns to the
# 'category' dtype
df['id'] = df['id'].astype('str')
categorical_columns = ('source', 'condition', 'city', 'country')
for name in categorical_columns:
    df[name] = df[name].astype('category')

In [None]:
# Parse the 'full_date' column into datetime values
df['full_date'] =  pd.to_datetime(df['full_date'])

In [None]:
# Store the zip code and MeSH term columns as strings.
# NOTE(review): astype('str') may turn missing values into the literal text
# 'nan' — confirm that is acceptable for the term lists built from them.
string_columns = ['zip', 'mesh_term_condition', 'mesh_term_intervention']
for name in string_columns:
    df[name] = df[name].astype('str')

In [None]:
# Per-column memory usage again, to compare with the figures printed before
# the dtype changes
print(df.memory_usage(deep=True))
df.info(verbose=True)

## Delete records before 2008

In [None]:
# Keep records since 2008. The explicit copy makes the result an independent
# frame, so columns added to it later do not trigger pandas'
# SettingWithCopyWarning.
df = df[df['year'] > 2007].copy()

In [None]:
# First rows after the year filter
df.head()

In [None]:
# Frame size, number of distinct values per column, and the distinct years present
print(df.shape)
print(df.nunique())
print('\n{}'.format(df.year.unique()))

In [None]:
# Column summary after the dtype changes and the year filter
df.info()

## Save conditions, mesh terms and sources

In [None]:
def save_list_of_terms(column, final_list, data=None):
    """Collect the distinct values of a column into ``final_list``.

    The non-null values of ``column`` are added to whatever ``final_list``
    already holds; the list is then de-duplicated and sorted in place, so the
    caller's list object carries the result. The number of distinct terms is
    printed.

    Args:
        column: Name of the column to read.
        final_list: List that receives the terms; modified in place.
        data: DataFrame to read from. Defaults to the notebook-level ``df``.

    Returns:
        None.
    """
    source = df if data is None else data
    # Nulls are not terms, and a NaN mixed with strings makes sorted() fail.
    final_list.extend(source[column].dropna())
    # Slice assignment updates the caller's list. Rebinding the local name
    # would leave the caller holding the raw values, duplicates included.
    final_list[:] = sorted(set(final_list))
    print('Number of {}: {}'.format(column, len(final_list)))

In [None]:
# Gather the values of the 'condition' column.
# NOTE: the name all_conditions is reassigned further down when the ICD-10
# descriptions are loaded.
all_conditions = []
save_list_of_terms('condition', all_conditions)

In [None]:
# Gather the values of the 'mesh_term_condition' column
all_mesh_conditions = []
save_list_of_terms('mesh_term_condition', all_mesh_conditions)

In [None]:
# Gather the values of the 'mesh_term_intervention' column
all_mesh_interventions = []
save_list_of_terms('mesh_term_intervention', all_mesh_interventions)

In [None]:
# Gather the values of the 'source' column
all_sources = []
save_list_of_terms('source', all_sources)

In [None]:
# Quick look at the first rows
df.head()

## Reference: List of conditions
- ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10CM/2019/

In [None]:
import xml.etree.ElementTree as ET

In [None]:
# Folder holding the ICD-10 reference files
list_conditions = '../data/icd10/'

# os.listdir returns entries in arbitrary order, so sort them to make
# conditions[0] the same file on every run. Only XML files are kept because
# the first entry is handed to an XML parser afterwards.
conditions = sorted(
    name for name in os.listdir(list_conditions)
    if name.lower().endswith('.xml')
)

print(conditions)

In [None]:
# Parse the first listed reference file and keep the root element of its tree
first_file = conditions[0]
icd = os.path.join(list_conditions, first_file)  # full path
tree = ET.parse(icd).getroot()
print('Parsed: {}\n'.format(first_file))

In [None]:
# Save the text of every <desc> element of the XML tree in a list.
# Elements without text are skipped: their .text is None, which the string
# processing applied to this list afterwards cannot handle.
all_conditions = [node.text for node in tree.iter('desc') if node.text]

len(all_conditions)

In [None]:
# Remove long names
def count_words_list(words, size):
    """Return the entries of ``words`` made of at most ``size`` words.

    Words are counted by splitting each entry on whitespace. A new list is
    returned on every call; nothing outside the function is modified, so
    calling it again cannot accumulate duplicates.

    Args:
        words: Iterable of strings.
        size: Maximum number of whitespace-separated words an entry may have.

    Returns:
        List of the entries that satisfy the limit, in their original order.
    """
    return [w for w in words if len(w.split()) <= size]


result = count_words_list(all_conditions, 3)
len(result)

In [None]:
# clean conditions dataset
def _clean_condition(name):
    """Drop the 'unspecified' qualifier and commas, then normalise whitespace."""
    stripped = name.replace('unspecified', '').replace(',', '')
    # split/join removes the leading, trailing and doubled spaces that the
    # removals leave behind.
    return ' '.join(stripped.split())


all_conditions = [c for c in map(_clean_condition, all_conditions) if c]
# The final de-duplicated list is rebuilt from `result`, so the short-name
# list must receive the same cleaning or it would be lost. Names that become
# empty after cleaning are dropped.
result = [c for c in map(_clean_condition, result) if c]

In [None]:
# Eliminate duplicated records. Sorting makes the order of the terms stable
# between runs; a bare list(set(...)) can change order from one run to the next.
all_conditions = sorted(set(result))
len(all_conditions)

## Classify docs using a list of conditions

In [None]:
# Create a column with all text. Missing parts are replaced by empty strings
# so that one null field does not turn the whole text into NaN, and the parts
# are joined with a space so words of adjacent fields do not run together.
df['text'] = (
    df['brief_title'].fillna('') + ' '
    + df['summary'].fillna('') + ' '
    + df['full_description'].fillna('')
)

In [None]:
# Collapse every run of whitespace into a single space. The pattern is a raw
# string: in a plain literal, backslash-s is an invalid escape sequence that
# newer Python versions warn about.
df['text'] = df['text'].replace(r'\s+', ' ', regex=True)

### Check for values in conditions list

In [None]:
# Create an empty dataframe to store the vocabulary (one column per condition term)
vocabulary_conditions = pd.DataFrame()

In [None]:
# PENDING: faster function
# https://stackoverflow.com/questions/44960614/whats-the-fastest-way-to-acces-a-pandas-dataframe
# Build one 0/1 indicator column per condition term.
# - regex=False matches the term literally, so names containing characters
#   such as '(' or '.' are neither misread as patterns nor able to raise.
# - na=False counts rows without text as "no match" instead of yielding NaN,
#   which cannot be converted to int.
indicator_columns = {}
for term in all_conditions:
    if not isinstance(term, str) or not term:
        # Non-text or empty terms cannot be searched for meaningfully; skip
        # them explicitly instead of swallowing every exception.
        continue
    indicator_columns[term] = df['text'].str.contains(
        term, case=False, regex=False, na=False).astype(int)
    print(term)

# Assemble the frame in a single step; inserting thousands of columns one by
# one fragments the frame.
vocabulary_conditions = pd.DataFrame(indicator_columns, index=df.index)
              

In [None]:
# Summary of the indicator frame
vocabulary_conditions.info()
# 463 Mb (size noted by the original author)

In [None]:
# Pending: discard columns above a threshold. For now, only show the summary
# statistics of each indicator column.
vocabulary_conditions.describe()

## ML Pipeline

### DF transformations
- Check which ones can be added to the preprocessor
- Info: http://www.ultravioletanalytics.com/blog/tf-idf-basics-with-pandas-scikit-learn

In [None]:
# Create a column combining title + summary + full description. Missing parts
# are replaced by empty strings so that one null field does not turn the whole
# text into NaN, and the parts are joined with a space so words of adjacent
# fields do not run together.
df['text'] = (
    df['brief_title'].fillna('') + ' '
    + df['summary'].fillna('') + ' '
    + df['full_description'].fillna('')
)

In [None]:
# Collapse every run of whitespace into a single space. The pattern is a raw
# string: in a plain literal, backslash-s is an invalid escape sequence that
# newer Python versions warn about.
df['text'] = df['text'].replace(r'\s+', ' ', regex=True)

### CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re

In [None]:
# Compiled once so that every document reuses the same pattern object
_DIGIT_RUN = re.compile(r'\d+')


def pat_numbers(x):
    """Return ``x`` lower-cased with every run of digits removed.

    Defined as a named function rather than a lambda so that objects holding
    a reference to it (such as a vectorizer using it as preprocessor) can be
    pickled.

    Args:
        x: Text to preprocess.

    Returns:
        The lower-cased text without digits.
    """
    return _DIGIT_RUN.sub('', x.lower())

# Bag-of-words vectorizer over 1- to 3-grams, limited to the 3000 most
# frequent features; terms present in more than half of the documents and
# English stop words are ignored.
# NOTE(review): per the scikit-learn documentation a custom preprocessor
# overrides the built-in lowercase stage, so lowercasing presumably comes
# from pat_numbers rather than from lowercase=True — confirm.
cv = CountVectorizer(
    preprocessor=pat_numbers,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),
    max_df=0.5,
    max_features=3000,
)

In [None]:
# Fit the vectorizer on the 'full_description' texts and build the
# document-term count matrix.
# NOTE(review): only 'full_description' is vectorized, not the combined
# df['text'] column built earlier — confirm this is intended.
docs = df['full_description'].tolist()

X = cv.fit_transform(docs)

In [None]:
# Number of terms in the fitted vocabulary
len(cv.vocabulary_)

In [None]:
# Display the fitted vocabulary (per scikit-learn docs: term -> feature index)
cv.vocabulary_

In [None]:
# Check for values in dictionary. Membership is tested on the vocabulary
# mapping itself; building a list of its keys first only adds a linear scan.
if 'breast cancer' in cv.vocabulary_:
    print('yes')
else:
    print('no')

In [None]:
# Preview the first rows of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
# Only the rows being displayed are converted to a dense array.
pd.DataFrame(X[:5].toarray(), columns=feature_names).head()

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Fit a TF-IDF transformer on the count matrix X and keep the weighted result
transformer = TfidfTransformer()
tweights = transformer.fit_transform(X)
tweights

In [None]:
# Pending: reduce the dimensionality of the TF-IDF matrix



In [None]:
# turn weights data into a dataframe (one column per vocabulary term).
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
tf = pd.DataFrame(tweights.toarray(), columns=feature_names)

In [None]:
# Top terms by average tf-idf weight
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
# Column-wise mean of the sparse matrix, flattened to a plain list
weights = np.asarray(tweights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': feature_names, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(5)

In [None]:
# Check word
def check_weight(word, weights=None):
    """Print the mean tf-idf weight of one or more terms.

    For every term a line ``<term>: <mean weight>`` is printed, or
    ``<term>: None`` when the term is not a column of the weights table.

    Args:
        word: A single term or an iterable of terms. A lone string is treated
            as one term instead of being iterated character by character.
        weights: DataFrame with one column per term. Defaults to the
            notebook-level ``tf`` frame.

    Returns:
        None.
    """
    table = tf if weights is None else weights
    terms = [word] if isinstance(word, str) else word
    for w in terms:
        try:
            mean_weight = table[w].mean()
        except KeyError:
            print('{}: None'.format(w))
        else:
            print('{}: {}'.format(w, mean_weight))

# Terms whose mean tf-idf weight is printed
search_terms = ['cancer', 'breast cancer', 'migraine', 'sarcoma']
check_weight(search_terms)

In [None]:
# Merge df and tf-idf data by index (pending — the merge itself is still
# commented out below). For now: show every column and preview five random
# rows of the tf-idf frame.
pd.set_option('display.max_columns', None) 
tf.sample(5)
# pd.merge(df, tf, left_index=True, right_index=True).sample(1)

### Topic Modeling
- https://nlpforhackers.io/topic-modeling/

### Test: Using TensorFlow
- https://github.com/open-source-for-science/TensorFlow-Course?utm_campaign=explore-email&utm_medium=email&utm_source=newsletter&utm_term=weekly