# Clinical trials: ML Classification


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np

In [2]:
base_color = "#3298D0"
plot_size = (14, 10)

# sys.stdout = open('/dev/stdout', 'w')

In [3]:
# Path for csv folder & file
path_to_csv_file = os.path.abspath('../data/csv/')

# json file
csv_file = '/clean_data'

## Import csv file

In [4]:
# Import json into a dataframe
clean_csv_file = '{}{}.csv'.format(path_to_csv_file, csv_file)

In [5]:
'''Use dask to improve data loading
https://www.kaggle.com/shikhar1/yet-another-pandas-tutorial'''

# breaks with large json file
df = pd.read_csv(clean_csv_file)

In [6]:
df.shape

(1000, 10)

In [7]:
# remove Unnamed column
df = df.drop(columns = ['Unnamed: 0'])
df = df.drop(columns = ['original_date'])

In [8]:
df.columns

Index(['id', 'source', 'brief_title', 'condition', 'full_description',
       'summary', 'full_date', 'year'],
      dtype='object')

## Optimize memory usage

In [9]:
print(df.memory_usage(deep=True))
df.info(verbose=True)

Index                    80
id                    68000
source                85638
brief_title          144719
condition             76732
full_description    1613753
summary              756582
full_date             67000
year                   8000
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
id                  1000 non-null object
source              1000 non-null object
brief_title         1000 non-null object
condition           1000 non-null object
full_description    1000 non-null object
summary             1000 non-null object
full_date           1000 non-null object
year                1000 non-null int64
dtypes: int64(1), object(7)
memory usage: 62.6+ KB


In [10]:
# Change data types
df['id'] = df['id'].astype('str')
for col in ['source', 'condition']:
    df[col] = df[col].astype('category')

In [11]:
df['full_date'] =  pd.to_datetime(df['full_date'])

In [12]:
print(df.memory_usage(deep=True))
df.info(verbose=True)

Index                    80
id                    68000
source                80312
brief_title          144719
condition             77657
full_description    1613753
summary              756582
full_date              8000
year                   8000
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
id                  1000 non-null object
source              1000 non-null category
brief_title         1000 non-null object
condition           1000 non-null category
full_description    1000 non-null object
summary             1000 non-null object
full_date           1000 non-null datetime64[ns]
year                1000 non-null int64
dtypes: category(2), datetime64[ns](1), int64(1), object(4)
memory usage: 101.5+ KB


In [13]:
print(df.shape)
print(df.nunique())

(1000, 8)
id                  1000
source               662
brief_title          999
condition            703
full_description     665
summary              999
full_date            862
year                  20
dtype: int64


## List of conditions
- ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10CM/2019/

In [14]:
import xml.etree.ElementTree as ET

In [15]:
list_conditions = '../data/icd10/'

conditions = []

for xml in os.listdir(list_conditions):
    conditions.append(xml)

print(conditions)

['icd10cm_tabular_2019.xml']


In [16]:
icd = list_conditions + conditions[0] #full path
tree = ET.parse(icd).getroot()
print('Parsed: {}\n'.format(conditions[0]))

Parsed: icd10cm_tabular_2019.xml



In [17]:
#Save titles from xml file in a list
all_conditions = []
    
for i in tree.iter('desc'):
    all_conditions.append(i.text)
    
len(all_conditions)

44803

In [18]:
# Remove long names
result = []

def count_words_list(words,size):
    for w in words:
        if len(w.split())<=size:
            result.append(w)
    return result

count_words_list(all_conditions, 3)
len(result)

6163

In [19]:
# clean conditions dataset
all_conditions = [w.replace('unspecified', '') for w in all_conditions]
all_conditions = [w.replace(',', '') for w in all_conditions]

In [20]:
# Eliminate duplicated records
all_conditions = list(set(result))
len(all_conditions)

6088

## Classify docs using a list of conditions

In [21]:
# Create a column with all text
df['text'] = df['brief_title'] + df['summary'] + df['full_description']

In [22]:
# Clean extra blank spaces
df.text = df.text.replace('\s+', ' ', regex=True)

### Check for values in conditions list

In [23]:
# Create dataframe to store vocabulary
vocabulary_conditions = pd.DataFrame()

In [None]:
# PENDING: faster function
# https://stackoverflow.com/questions/44960614/whats-the-fastest-way-to-acces-a-pandas-dataframe
for term in all_conditions:
    try:
        vocabulary_conditions[term] = df['text'].str.contains(term, case = False).astype(int)
        print(term)
    except:
        pass
              

In [None]:
vocabulary_conditions.info()
#463 Mb file

In [None]:
# Discard columns above a treshold
vocabulary_conditions.describe()

## ML Pipeline

### DF transformations
- Check which ones can be added to the preprocessor
- Info: http://www.ultravioletanalytics.com/blog/tf-idf-basics-with-pandas-scikit-learn

In [None]:
# Create a column combining title + full description
df['text'] = df['brief_title'] + df['summary'] + df['full_description']

In [None]:
# Clean extra blank spaces
df.text = df.text.replace('\s+', ' ', regex=True)

### CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re

In [None]:
pat_numbers = lambda x: re.sub(r'(\d)+', '', x.lower())

cv = CountVectorizer(stop_words='english',
                     preprocessor = pat_numbers,
                     max_features = 3000,
                     lowercase = True,
                     max_df = 0.5,
                     ngram_range = (1, 3))

In [None]:
docs = list(df['full_description'])

X = cv.fit_transform(docs)

In [None]:
len(cv.vocabulary_)

In [None]:
cv.vocabulary_

In [None]:
# Check for values in dictionary
if 'breast cancer' in list(cv.vocabulary_):
  print('yes')
else:
  print('no')

In [None]:
pd.DataFrame(X.toarray(), columns=cv.get_feature_names()).head()

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
transformer = TfidfTransformer()
tweights = transformer.fit_transform(X)
tweights

In [None]:
# Pending: Reduced dimensionality



In [None]:
# turn weights data into a dataframe
tf = pd.DataFrame(tweights.toarray(), columns=cv.get_feature_names())

In [None]:
# Top terms by average tf-idf weight
weights = np.asarray(tweights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cv.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(5)

In [None]:
# Check word
def check_weight(word):
    for w in word:
        try:
            print('{}: {}'.format(w, tf[w].mean()))
        except KeyError:
            print('{}: None'.format(w))

search_terms = ['cancer', 'breast cancer', 'migraine', 'sarcoma']
check_weight(search_terms)

In [None]:
# Merge df y tf-idf data by index
pd.set_option('display.max_columns', None) 
tf.sample(5)
# pd.merge(df, tf, left_index=True, right_index=True).sample(1)

### Topic Modeling
- https://nlpforhackers.io/topic-modeling/

### Test: Using Tensorflow 
- https://github.com/open-source-for-science/TensorFlow-Course?utm_campaign=explore-email&utm_medium=email&utm_source=newsletter&utm_term=weekly