# Clinical trials: Data exploration


In [1]:
import time
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import sys
import logging

In [2]:
# Plot defaults. NOTE(review): neither name is used in the visible cells;
# plot_size is presumably a (width, height) figure size -- confirm.
base_color = "#3298D0"
plot_size = (14, 10)

# Rebind sys.stdout to a new write handle on /dev/stdout, so every later
# print() (and anything else that writes to sys.stdout) goes through it.
# NOTE(review): inside a Jupyter kernel this presumably diverts printed text
# away from the cell output area -- confirm that this is intended.
# /dev/stdout only exists on POSIX-like systems and cannot always be reopened,
# so keep the current stream in that case instead of aborting the notebook.
try:
    sys.stdout = open('/dev/stdout', 'w')
except OSError:
    pass  # deliberate fallback: leave sys.stdout untouched

In [3]:
# Stem of the cleaned CSV file (no extension). The leading '/' serves as the
# path separator when the stem is appended to the folder path.
csv_file = '/clean_data'

# Absolute path of the folder that holds the CSV data
path_to_csv_file = os.path.abspath('../data/csv/')

## Import csv file

In [4]:
# Build the full path of the cleaned CSV: folder path + file stem + '.csv'
clean_csv_file = path_to_csv_file + csv_file + '.csv'

In [5]:
'''Use dask to improve data loading
https://www.kaggle.com/shikhar1/yet-another-pandas-tutorial'''

# Load the cleaned CSV into a DataFrame.
# Original author's note: loading breaks with a large json file.
df = pd.read_csv(filepath_or_buffer=clean_csv_file)

In [6]:
# Preview the first rows to sanity-check the columns that were loaded
df.head()

Unnamed: 0.1,Unnamed: 0,id,original_date,source,brief_title,condition,full_description,summary,full_date,year
0,46038,NCT00004640,"September 17, 1999",University of Washington,"""Clinical Trials to Enhance Elders' Oral Healt...",Tooth Loss,"""TEETH"" is a double-blinded, randomized...",The purpose of this study is to determi...,1999-09-17,1999
1,150757,NCT00004639,"September 17, 1999",University of Florida,Cleft Palate Surgery and Speech Development,Cleft Lip,This study is conducted with patients w...,Compare the outcome of two primary surg...,1999-09-17,1999
2,43264,NCT00000341,"September 20, 1999",National Institute on Drug Abuse (NIDA),Evaluation of Liquid vs. Tablet Buprenorphine - 6,Opioid-Related Disorders,,The purpose of this study is to evaluat...,1999-09-20,1999
3,195656,NCT00000289,"September 20, 1999",National Institute on Drug Abuse (NIDA),Role of Metabolites in Nicotine Dependence (3)...,Tobacco Use Disorder,Previous studies have shown that cotini...,The purpose of this study is to determi...,1999-09-20,1999
4,528,NCT00000227,"September 20, 1999",University of Vermont,Alternate-Day Buprenorphine Administration. Ph...,Opioid-Related Disorders,,The purpose of this study is to evaluat...,1999-09-20,1999


In [7]:
# Drop 'Unnamed: 0' (presumably a leftover index column from the CSV export)
# and the raw 'original_date' string; the 'full_date' column is kept.
df = df.drop(columns=['Unnamed: 0', 'original_date'])

In [8]:
# Check which columns remain after the drop
df.columns

Index(['id', 'source', 'brief_title', 'condition', 'full_description',
       'summary', 'full_date', 'year'],
      dtype='object')

## Optimize memory usage

In [9]:
# Baseline before any dtype changes: per-column memory usage (index included),
# with deep=True requesting a deep measurement of object columns
print(df.memory_usage(index=True, deep=True))
# Full per-column summary of the frame
df.info(verbose=True)

In [10]:
# Inspect the dtype of every column before converting any of them
df.dtypes

id                  object
source              object
brief_title         object
condition           object
full_description    object
summary             object
full_date           object
year                 int64
dtype: object

In [11]:
# Convert columns to more suitable dtypes: ids as strings, and the
# 'source' / 'condition' columns as categoricals.
dtype_by_column = {'id': 'str', 'source': 'category', 'condition': 'category'}
for column_name, target_dtype in dtype_by_column.items():
    df[column_name] = df[column_name].astype(target_dtype)

In [12]:
# Parse the 'full_date' column into pandas datetimes
df['full_date'] = df['full_date'].pipe(pd.to_datetime)

In [13]:
# Same report as before the dtype conversion, to compare memory usage
print(df.memory_usage(index=True, deep=True))
df.info(verbose=True)

In [14]:
# Frame dimensions, then the number of distinct values per column
print(df.shape, df.nunique(), sep='\n')

## Method 1: List of terms
Test: filter the trial summaries using a list of words related to cancer research (whole-word, case-insensitive match).

In [15]:
# First approach: match records against a fixed list of reference terms.
cancer_words = ['cancer', 'oncology', 'melanoma', 'sarcoma']
# One regex alternation; \b on both sides restricts each term to whole words.
pat = '|'.join(map(r'\b{}\b'.format, cancer_words))

In [16]:
# Flag each record whose summary matches the regex in `pat`
# (1 = match, 0 = no match); the match is case-insensitive.
# na=False counts a missing summary as "no match"; without it str.contains
# yields NaN for those rows and the cast to int raises.
df['cancer'] = df['summary'].str.contains(pat, case=False, na=False).astype(int)

In [17]:
# Subset of the records that were flagged as mentioning a cancer term
df_cancer = df.loc[df['cancer'].eq(1)]

In [18]:
# Number of distinct values per column within the cancer-flagged subset
df_cancer.nunique()

id                  36846
source               3688
brief_title         36689
condition            6284
full_description    26521
summary             36765
full_date            4752
year                   20
cancer                  1
dtype: int64

In [19]:
# Distinct condition labels that occur among the cancer-flagged records
df_cancer['condition'].unique()

[Choroid Neoplasms, Cardiovascular Diseases, Bone Diseases, Colorectal Cancer, Carcinoma of Unknown Primary, ..., OVARIAN CANCER, Vulvar Cancer, G9a Protein, Locally Advanced Thymic Carcinoma, Metastatic Hepatocellular Carcinoma, Aromatase Inhibitor]
Length: 6284
Categories (6284, object): [Choroid Neoplasms, Cardiovascular Diseases, Bone Diseases, Colorectal Cancer, ..., Vulvar Cancer, G9a Protein, Locally Advanced Thymic Carcinoma, Metastatic Hepatocellular Carcinoma, Aromatase Inhibitor]

In [20]:
# Registries across diseases: cancer-flagged records whose condition is
# 'Cardiovascular Diseases'. Both masks are built on df_cancer itself.
is_cardiovascular = df_cancer['condition'].eq('Cardiovascular Diseases')
is_cancer_flagged = df_cancer['cancer'].eq(1)
df_cancer.loc[is_cardiovascular & is_cancer_flagged].info()

## Method 2: TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# vectorize words from full_description
