# Clinical trials: ML Classification


In [156]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np

In [2]:
# Plot defaults: a hex colour string and a two-number size tuple
# NOTE(review): neither name is referenced in the cells shown here — confirm they are still needed
base_color = "#3298D0"
plot_size = (14, 10)

# sys.stdout = open('/dev/stdout', 'w')

In [3]:
# Absolute path of the csv folder, resolved from the current working directory
path_to_csv_file = os.path.abspath('../data/csv/')

# Base name of the csv file (no extension); the leading slash serves as the
# path separator when this is appended to path_to_csv_file
csv_file = '/clean_data'

## Import csv file

In [4]:
# Full path of the cleaned csv file: <folder><name>.csv
clean_csv_file = path_to_csv_file + csv_file + '.csv'

In [5]:
# TODO: use dask to improve data loading, see
# https://www.kaggle.com/shikhar1/yet-another-pandas-tutorial
# NOTE(review): an earlier note here said this "breaks with large json file";
# the file read below is a csv — confirm whether the warning still applies.
df = pd.read_csv(clean_csv_file)

In [6]:
# (rows, columns) of the frame right after loading
df.shape

(10000, 10)

In [7]:
# Remove the 'Unnamed: 0' and 'original_date' columns in a single call
# (presumably 'Unnamed: 0' is an index column written by an earlier csv export — confirm)
df = df.drop(columns=['Unnamed: 0', 'original_date'])

In [8]:
# Columns remaining after the drop
df.columns

Index(['id', 'source', 'brief_title', 'condition', 'full_description',
       'summary', 'full_date', 'year'],
      dtype='object')

## Optimize memory usage

In [9]:
# Per-column memory footprint in bytes, before any dtype change
# (deep=True also measures the contents of object columns)
print(df.memory_usage(deep=True))
# Column dtypes and non-null counts
df.info(verbose=True)

Index                     80
id                    680000
source                854529
brief_title          1447701
condition             766621
full_description    15993235
summary              7424117
full_date             670000
year                   80000
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
id                  10000 non-null object
source              10000 non-null object
brief_title         10000 non-null object
condition           10000 non-null object
full_description    10000 non-null object
summary             10000 non-null object
full_date           10000 non-null object
year                10000 non-null int64
dtypes: int64(1), object(7)
memory usage: 625.1+ KB


In [10]:
# dtype of every column before conversion
df.dtypes

id                  object
source              object
brief_title         object
condition           object
full_description    object
summary             object
full_date           object
year                 int64
dtype: object

In [11]:
# Cast column dtypes in one pass: 'id' as string, 'source' and 'condition' as categoricals
df = df.astype({'id': 'str', 'source': 'category', 'condition': 'category'})

In [12]:
# Parse the 'full_date' column into pandas datetimes
df = df.assign(full_date=pd.to_datetime(df['full_date']))

In [13]:
# Per-column memory footprint in bytes after the dtype changes
print(df.memory_usage(deep=True))
# Column dtypes and non-null counts after the dtype changes
df.info(verbose=True)

Index                     80
id                    680000
source                468036
brief_title          1447701
condition             537645
full_description    15993235
summary              7424117
full_date              80000
year                   80000
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
id                  10000 non-null object
source              10000 non-null category
brief_title         10000 non-null object
condition           10000 non-null category
full_description    10000 non-null object
summary             10000 non-null object
full_date           10000 non-null datetime64[ns]
year                10000 non-null int64
dtypes: category(2), datetime64[ns](1), int64(1), object(4)
memory usage: 887.2+ KB


In [14]:
# Frame dimensions followed by the number of distinct values in each column
print(df.shape)
print(df.nunique())

(10000, 8)
id                  10000
source               3222
brief_title          9980
condition            4371
full_description     6469
summary              9974
full_date            3675
year                   20
dtype: int64


## Test: Classification through a list of terms
Flag records whose summary contains any term from a list of words related to cancer research.

In [15]:
# 1st approach: flag records using a reference list of cancer-related terms
cancer_words = ['cancer', 'oncology', 'melanoma', 'sarcoma']
# Alternation regex in which every term is wrapped in word boundaries (\b)
pat = '|'.join(map(r'\b{}\b'.format, cancer_words))

In [16]:
# 1 when the summary matches any term in `pat` (case-insensitive), otherwise 0
has_cancer_term = df['summary'].str.contains(pat, case=False)
df['cancer'] = has_cancer_term.astype(int)

In [17]:
# Subset holding only the records whose 'cancer' flag equals 1
df_cancer = df.loc[df['cancer'].eq(1)]

In [18]:
# Number of distinct values per column within the cancer subset
df_cancer.nunique()

id                  1273
source               555
brief_title         1273
condition            538
full_description     900
summary             1273
full_date            989
year                  20
cancer                 1
dtype: int64

In [19]:
# Distinct conditions that appear among the records flagged as cancer
df_cancer['condition'].unique()

[Cardiovascular Diseases, Breast Cancer, Colorectal Cancer, Drug/Agent Toxicity by Tissue/Organ, Brain and Central Nervous System Tumors, ..., CRE-induced SBO, Clinical Stage III Gastric Cancer AJCC v8, Hematologic Malignancy, Shoulder Dysfunction, Chronic Phase Chronic Myelogenous Leukemia, BC...]
Length: 538
Categories (538, object): [Cardiovascular Diseases, Breast Cancer, Colorectal Cancer, Drug/Agent Toxicity by Tissue/Organ, ..., Clinical Stage III Gastric Cancer AJCC v8, Hematologic Malignancy, Shoulder Dysfunction, Chronic Phase Chronic Myelogenous Leukemia, BC...]

In [20]:
# Registries across diseases: records whose condition is 'Cardiovascular Diseases'
# and that also carry the cancer flag
is_cardiovascular = df_cancer['condition'] == 'Cardiovascular Diseases'
is_flagged = df_cancer['cancer'] == 1
df_cancer.loc[is_cardiovascular & is_flagged].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 8 to 8
Data columns (total 9 columns):
id                  1 non-null object
source              1 non-null category
brief_title         1 non-null object
condition           1 non-null category
full_description    1 non-null object
summary             1 non-null object
full_date           1 non-null datetime64[ns]
year                1 non-null int64
cancer              1 non-null int64
dtypes: category(2), datetime64[ns](1), int64(2), object(4)
memory usage: 379.4+ KB


## ML Pipeline

### DF transformations
- Check which ones can be added to the preprocessor
- Info: http://www.ultravioletanalytics.com/blog/tf-idf-basics-with-pandas-scikit-learn

In [21]:
# Create a column combining title + summary + full description.
# The fields are joined with a single space so the last word of one field does
# not fuse with the first word of the next (which would create bogus tokens).
df['text'] = df['brief_title'] + ' ' + df['summary'] + ' ' + df['full_description']

In [22]:
# Collapse every run of whitespace into a single space.
# The pattern is a raw string: a backslash-s inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
# Bracket access is used for the assignment so the column is always targeted explicitly.
df['text'] = df['text'].replace(r'\s+', ' ', regex=True)

### Create a vocabulary

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
import re

In [146]:
def pat_numbers(x):
    """Return ``x`` lower-cased with every run of digits removed."""
    lowered = x.lower()
    return re.sub(r'(\d)+', '', lowered)

# Count vectorizer used to build the vocabulary
cv = CountVectorizer(
    preprocessor=pat_numbers,  # custom clean-up: lower-cases and strips digits
    # NOTE(review): per the scikit-learn docs a custom preprocessor overrides the
    # built-in lowercase step; pat_numbers lower-cases on its own — confirm
    lowercase=True,
    stop_words='english',
    ngram_range=(2, 2),        # two-word terms only
    max_df=0.7,
    max_features=10000,
)

In [147]:
# Fit the vectorizer on the full descriptions and build the document-term matrix.
# NOTE(review): the combined df['text'] column built earlier is not used here — confirm this is intended
docs = df['full_description'].tolist()
X = cv.fit_transform(docs)

In [148]:
# Number of terms kept in the fitted vocabulary
len(cv.vocabulary_)

10000

In [149]:
# Preview a handful of (term -> column index) entries; displaying the whole
# vocabulary floods the notebook output
dict(list(cv.vocabulary_.items())[:20])

{'medication treatment': 4862,
 'determine effects': 2246,
 'physiological effects': 6397,
 'evaluate clinical': 2880,
 'clinical safety': 1402,
 'safety issues': 7584,
 'pharmacological treatments': 6335,
 'withdrawal symptoms': 9903,
 'smoking cessation': 8008,
 'study investigated': 8355,
 'mg kg': 4931,
 'reaction time': 7096,
 'subjective measures': 8515,
 'measures included': 4822,
 'heart rate': 3669,
 'rate blood': 7067,
 'blood pressure': 876,
 'signs symptoms': 7918,
 'kg dose': 4387,
 'protocol outline': 6857,
 'outline randomized': 5602,
 'randomized study': 7048,
 'study patients': 8396,
 'patients randomized': 6101,
 'randomized treatment': 7050,
 'treatment arms': 9118,
 'arms patients': 490,
 'patients arms': 5903,
 'receive doses': 7119,
 'day days': 2042,
 'days patients': 2114,
 'patients arm': 5902,
 'arm iii': 471,
 'patients begin': 5914,
 'standard therapy': 8107,
 'outline patients': 5600,
 'patients undergo': 6177,
 'undergo clinical': 9392,
 'patients studied'

In [152]:
# Check whether a word occurs in the vocabulary.
# The vectorizer is configured with ngram_range=(2, 2), so every vocabulary key is a
# two-word term and an exact lookup of a single word can never succeed;
# look for the word inside the terms instead.
word = 'sarcoma'
if any(word in term.split() for term in cv.vocabulary_):
    print('yes')
else:
    print('no')

no


In [151]:
# Dense preview of the first five rows of the document-term matrix.
# Only the displayed rows are densified; calling toarray() on the full sparse
# matrix would allocate every cell just to show five rows.
# get_feature_names() was removed in scikit-learn 1.2, so prefer the newer accessor when present.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
pd.DataFrame(X[:5].toarray(), columns=feature_names)

Unnamed: 0,abdomen pelvis,abdominal adipose,abdominal cavity,abdominal pain,abdominal pelvic,abdominal pressure,abdominal surgery,abdominal wall,abiraterone acetate,ablation procedure,...,young children,young patients,young women,youth court,ziv aflibercept,µg day,µg dose,µg kg,µg ml,μg kg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF Model

In [153]:
from sklearn.feature_extraction.text import TfidfTransformer

In [162]:
# Re-weight the term counts in X with tf-idf
transformer = TfidfTransformer()
tweights = transformer.fit_transform(X)
# Show the sparse matrix summary
tweights

<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 188006 stored elements in Compressed Sparse Row format>

In [178]:
# TODO: reduce the dimensionality of the tf-idf matrix



In [173]:
# Dense frame of tf-idf weights, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so prefer the newer accessor when present.
# NOTE(review): toarray() densifies the whole sparse matrix — memory-heavy for a large vocabulary.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
tf = pd.DataFrame(tweights.toarray(), columns=feature_names)
# Average weight of one term across all documents
tf['patients receive'].mean()

0.0068650750900604495

In [174]:
# Top terms by average tf-idf weight.
# get_feature_names() was removed in scikit-learn 1.2, so prefer the newer accessor when present.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
# Column-wise mean of the sparse matrix, flattened to a plain list
weights = np.asarray(tweights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': feature_names, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(5)

Unnamed: 0,term,weight
6103,patients receive,0.006865
6933,quality life,0.006292
4619,long term,0.005165
5489,open label,0.004824
2537,double blind,0.004796


### Topic Modeling
- https://nlpforhackers.io/topic-modeling/

### Test: Using TensorFlow
- https://github.com/open-source-for-science/TensorFlow-Course?utm_campaign=explore-email&utm_medium=email&utm_source=newsletter&utm_term=weekly