# Prepare working db
- Load data
- df cleaning
- Create label for recruiting status
- Export full dataset to sqlite

In [1]:
import json, os
import numpy as np
import pandas as pd
import sqlite3
import shutil
import matplotlib.pyplot as plt
# import nltk

## Load json file with parsed information

In [2]:
# paths
path_to_json_file = os.path.abspath('../data/json/')

### json file
# json_file = '/all_parsed_data_json' #name json file  
json_file = '/sample_json' #sample json file for testing  

file = '{}{}.json'.format(path_to_json_file, json_file)

In [3]:
# load json file
df = pd.read_json(file)

In [4]:
# Display full dataframe
# pd.set_option('display.max_rows', 50)
# pd.set_option('display.max_columns', 50)
# pd.set_option('display.width', 1000)

In [5]:
# Check data
df.shape

(5984, 24)

In [6]:
# Remove all_files folder
path_to_all_files = os.path.abspath('../data/all_trials//')

def remove_extra_folders(folder):
    """Delete the directory tree at `folder`.

    A confirmation message is printed when the removal succeeds. If the
    removal raises an IOError (for instance because the folder does not
    exist), the error is printed and nothing is raised.
    """
    try:
        shutil.rmtree(folder)
        print("\nunzip folder deleted")
    except IOError as error:
        print(error)

In [7]:
%time remove_extra_folders(path_to_all_files)

[Errno 2] No such file or directory: '/Users/cmserna/Sites/clinical trials/mvp/data/all_trials'
CPU times: user 514 µs, sys: 654 µs, total: 1.17 ms
Wall time: 836 µs


## Format dataset

In [8]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,overall_status,verification_date,study_type,study_first_posted,last_update_submitted,last_update_posted,...,detailed_description/textblock,brief_summary/textblock,location/facility/address/city,location/facility/address/country,location/facility/address/zip,sponsors/lead_sponsor/agency,sponsors/lead_sponsor/agency_class,study_design_info/allocation,study_design_info/intervention_model,study_design_info/primary_purpose
0,NCT01236404,"November 5, 2010",PhaseBio Pharmaceuticals Inc.,"Phase 1/2a, Randomized, Double-Blind, Placebo-...",Completed,May 2013,Interventional,"November 8, 2010","May 13, 2013","May 21, 2013",...,,\n Primary objective:\n\n To evaluat...,Walnut Creek,United States,94598,PhaseBio Pharmaceuticals Inc.,Industry,Randomized,Parallel Assignment,Treatment
1,NCT00275600,"January 10, 2006",Mayo Clinic,Randomized Clinical Trial of Vitamin E and Eve...,Completed,November 2009,Interventional,"January 12, 2006","November 25, 2009","November 26, 2009",...,\n Cyclical mastalgia is a common complai...,\n This study is being done to find out w...,Rochester,United States,55905,Mayo Clinic,Other,Randomized,Parallel Assignment,Treatment
10,NCT01810211,"March 8, 2013",Nova Southeastern University,The Most Effective Intervention for Reducing P...,Completed,March 2014,Interventional,"March 13, 2013","September 24, 2014","September 26, 2014",...,,"\n This study will look at which, if any,...",Raleigh,United States,27609,Nova Southeastern University,Other,Randomized,Parallel Assignment,Treatment
100,NCT00826111,"January 19, 2009",Steward St. Elizabeth's Medical Center of Bost...,The Effects of Eszopiclone and Lexapro on Pref...,Completed,June 2012,Interventional,"January 21, 2009","June 28, 2012","June 29, 2012",...,,\n The study examined the effects of addi...,Boston,United States,2135,Steward St. Elizabeth's Medical Center of Bost...,Other,Randomized,Parallel Assignment,Treatment
1000,NCT02813850,"June 23, 2016",Assistance Publique - Hôpitaux de Paris,Oxygen Therapy and Pregnancy in Sickle Cell Di...,Recruiting,March 2017,Interventional,"June 27, 2016","December 20, 2017","December 21, 2017",...,\n Sickle cell disease (SCD) corresponds ...,\n The purpose of this study is to assess...,Paris,France,75015,Assistance Publique - Hôpitaux de Paris,Other,Randomized,Parallel Assignment,Prevention


In [9]:
# function to change column type

def data_types(dataframe, cols=(), to_type=''):
    """Cast the given columns of `dataframe` to `to_type`, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are converted; it is modified in place.
    cols : iterable of str
        Names of the columns to convert. Nothing happens when it is empty.
    to_type : str or dtype
        Target dtype handed to ``Series.astype``.
    """
    for col in cols:
        # Read from the frame that was passed in, not from the global `df`.
        dataframe[col] = dataframe[col].astype(to_type)
        

In [11]:
df['verification_date'].sample(10)

904         July 2018
4523       April 2016
1455     October 2012
3481       March 2016
3106       March 2018
2783    December 2017
5164      August 2012
5579     October 2018
3566       March 2018
1361    December 2012
Name: verification_date, dtype: object

## Change date types, extract years

In [None]:
columns_dates = ['study_first_submitted', 'last_update_submitted', 'verification_date']
# Name the unit explicitly: recent pandas versions reject the unit-less
# 'datetime64' dtype in astype().
data_types(df, columns_dates, 'datetime64[ns]')

In [None]:
# Extract year 
df['year_submitted'] = df['study_first_submitted'].dt.year
df['year_last_updated'] = df['last_update_submitted'].dt.year
df['verification_year'] = df['verification_date'].dt.year

In [None]:
# Remove unnecessary columns
# df.drop(columns_dates, axis = 1, inplace=True)

In [None]:
# columns_to_drop = ['study_first_posted', 'last_update_posted', 'verification_date']
# df.drop(columns_to_drop, axis = 1, inplace=True)

In [None]:
# Remove \n (newline characters) in all df
df = df.replace(r'\n',' ', regex=True)

In [None]:
# Create new column for all text
text_columns = ['source', 'brief_title', 'condition',
                'condition_browse/mesh_term', 'intervention_browse/mesh_term',
                'detailed_description/textblock', 'brief_summary/textblock']

# Treat missing values as empty text; otherwise a single missing field
# turns the whole concatenated row into NaN.
all_text = df[text_columns[0]].fillna('')
for column in text_columns[1:]:
    all_text = all_text + ' ' + df[column].fillna('')

# All_text in lowercase
all_text = all_text.str.lower()

# remove extra whitespace (raw string so that \s reaches the regex engine as written)
df['all_text'] = all_text.replace(r'\s+', ' ', regex=True)


In [None]:
# Remove extra white space in summary
# (raw string: '\s' in a plain string literal is an invalid escape sequence)
df['brief_summary/textblock'] = df['brief_summary/textblock'].replace(r'\s+', ' ', regex=True)
df['brief_summary/textblock'][0]

In [None]:
# # add url columns
url_string = 'https://clinicaltrials.gov/ct2/show/'
df['url'] = url_string + df['nct_id'].astype(str)

## Recruiting status
- Create criteria for filtering recruiting status
- Create new column with labels

In [None]:
''' Recruiting status 
0 - Not recruting, including all records not updated 
1 - Possibily recruiting. Define time range for category 
2 - Recruiting

'''

df.overall_status.unique()

In [None]:
# Keep only the trials whose status counts as open
# https://clinicaltrials.gov/ct2/help/glossary/recruitment-status

open_statuses = ["Recruiting", "Not yet recruiting", "Available for expanded access"]
recruiting = df[df['overall_status'].isin(open_statuses)]

# Report how many open trials there are ...
print('Total of trials classified as recruiting: {}'.format(len(recruiting)))

# ... and how many of them were last verified before 2017
verified_before_2017 = recruiting['verification_year'] < 2017
print('Trials not verified in the last two years 2017-2019: {}'.format(
    len(recruiting[verified_before_2017])))

In [None]:
recruiting.overall_status.unique()

In [None]:
# Trials by verification date

'''Unknown: A study with a status of Recruiting, Not yet recruiting, 
or Active, not recruiting and whose status has not been verified within the past 2 years. 

** Studies with an Unknown recruitment status are considered open studies or closed studies, 
depending on their last known recruitment status**

'''
recruiting.boxplot(column='verification_year', by='overall_status', \
                   figsize=(14,9), showfliers=False, meanline=True,\
                   whiskerprops = dict(linestyle='-.', linewidth=2))

# recruiting.boxplot(column='year_last_updated')
# recruiting['verification_year'].plot(kind='box', figsize=(10,7))
# recruiting['year_last_updated'].plot(kind='box', figsize=(10,7))

In [None]:
# Create new column with recruiting status based on verification date

In [None]:
# Conditions used to assign recruiting labels
open_status = df['overall_status'].isin(
    ["Recruiting", "Not yet recruiting", "Available for expanded access"])

options = [
    # Verified before 2017
    (df['verification_year'] < 2017),
    # Verified and updated after 2017 AND currently in an open status.
    # The status test is grouped explicitly because `&` binds tighter than `|`;
    # ungrouped, the last two statuses bypass the year checks.
    # NOTE(review): `> 2017` leaves out trials verified/updated in 2017 itself
    # - confirm whether `>= 2017` was intended.
    (df['verification_year'] > 2017) &
    (df['year_last_updated'] > 2017) &
    open_status]


In [None]:
labels = [0, 1]
df['recruiting_labels'] = np.select(options, labels, default=0)
df['recruiting_labels'].value_counts()


In [None]:
all_recruiting = df[df['recruiting_labels'] == 1].sort_values('study_first_submitted')
print('Verification years: {}'.format(all_recruiting['verification_year'].unique()))
print('Verification dates: {}'.format(all_recruiting['verification_date'].unique()))
print('Year last udpated: {}'.format(all_recruiting['year_last_updated'].unique()))

In [None]:
all_recruiting['verification_date'].unique()

In [None]:
# all_recruiting.boxplot(column='recruiting_labels', by='sverification_date', \
#                    figsize=(14,9), showfliers=False, meanline=True,\
#                    whiskerprops = dict(linestyle='-.', linewidth=2))


# Create sqlite db

In [None]:
# Create folder

path_to_working_datasets = os.path.abspath('../data/working_data')

# exist_ok=True makes re-running the cell a no-op when the folder is already
# there; any other failure (e.g. permissions) is reported with its reason
# instead of a bare 'Error'.
try:
    os.makedirs(path_to_working_datasets, exist_ok=True)
except OSError as error:
    print('Error: {}'.format(error))

In [None]:
# Create and connect to db
conn = sqlite3.connect('../data/working_data/working-database.db')
conn


In [None]:
# upload df to database: define schema
%time df.to_sql('all_trials', con=conn)

In [None]:
# get list of indexes
pd.read_sql_query("PRAGMA index_list(all_trials);", conn)

In [None]:
# test 1. Filter by condition
%time search_query = pd.read_sql_query("SELECT * from all_trials WHERE all_text LIKE '%breast cancer%';", conn)
search_query.head()

In [None]:
# test 2. Filter by recruiting status = 1
%time search_query = pd.read_sql_query("SELECT * from all_trials WHERE recruiting_labels == 1;", conn)
len(search_query)

In [None]:
# Create index on all_text column
c = conn.cursor()
%time c.execute("CREATE INDEX idx1 ON all_trials(all_text)")
conn.commit()

In [None]:
# get list of indexes
pd.read_sql_query("PRAGMA index_list(all_trials);", conn)

In [None]:
# test new query
%time search_query = pd.read_sql_query("SELECT * from all_trials WHERE all_text LIKE '%breast cancer%';", conn)
search_query.head()

In [None]:
%time new_query = pd.read_sql_query("SELECT nct_id from all_trials WHERE all_text LIKE '%breast cancer%';", conn).head()
new_query.head()

In [None]:
'''
Optimize performance
https://medium.com/@JasonWyatt/squeezing-performance-from-sqlite-indexes-indexes-c4e175f3c346
'''

_____

## Test queries speed

In [None]:
# Check data types
# pd.read_sql_query("PRAGMA table_info(all_trials);", conn)

In [None]:
# Time query with WHERE condition
%time search_query = pd.read_sql_query("SELECT * from all_trials WHERE all_text LIKE '%breast cancer%';", conn)
search_query.head()

In [None]:
# Time query with WHERE condition, return nct_id
%time filtered_query = pd.read_sql_query("SELECT nct_id from all_trials WHERE all_text LIKE '%breast cancer%';", conn)
filtered_query.head()

In [None]:
# drop original index
%time c.execute("DROP INDEX ix_all_trials_index")
conn.commit()


## Test queries after creating index

In [None]:
# Time query with WHERE condition after creating index
%time optimized_query = pd.read_sql_query("SELECT * from all_trials WHERE all_text LIKE '%breast cancer%';", conn)
optimized_query.head()

In [None]:
# Return only nct_id
%time new_query = pd.read_sql_query("SELECT nct_id from all_trials WHERE all_text LIKE '%breast cancer%';", conn).head()
new_query.head()


## Pending: Test search pattern to optimize queries, return results

## Preprocess dataset
- Tokenize, Lemmatize / Stem
- Remove stopwords

In [None]:
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer

# porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

In [None]:
# Stemming
# def stem_sentences(text):
#     tokens = text.split()
#     stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
#     return ' '.join(stemmed_tokens)

# df['stems'] = df['all_text'].apply(stem_sentences)

In [None]:
# Lemmatizing
def lemm_sentences(text):
    """Lemmatize every whitespace-separated token of `text`.

    Parameters
    ----------
    text : str
        Text to process. Anything that is not a string (for example the
        NaN of a missing value) is treated as empty text.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # Missing values reach this function as non-strings and have no .split()
    if not isinstance(text, str):
        return ''
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

df['lemmas'] = df['all_text'].apply(lemm_sentences)

In [None]:
# Remove stop words [not needed with countvectorizer]
# stop_words = stopwords.words('english')
# df['tokens'] = df['lemmas'].apply(lambda x: [item for item in x.split() if item not in stop_words])

In [None]:
df.lemmas[0:5]

## Bags of words

In [None]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re

In [None]:
def pat_numbers(x):
    """Lowercase `x` and strip out every run of digits."""
    lowered = x.lower()
    return re.sub(r'(\d)+', '', lowered)

# Bag-of-words vectorizer: English stop words, pat_numbers as the text
# preprocessor, n-grams of 1 to 3 words, at most 1800 features, max_df of 0.8.
# NOTE(review): scikit-learn documents `preprocessor` as overriding the
# built-in lowercasing step, so `lowercase = True` probably has no effect
# here (pat_numbers lowercases on its own) - confirm.
cv = CountVectorizer(stop_words='english',
                     preprocessor = pat_numbers,
                     max_features = 1800,
                     lowercase = True,
                     max_df = 0.8,
                     ngram_range = (1, 3))

In [None]:
docs = list(df['lemmas'])

X = cv.fit_transform(docs)

In [None]:
len(cv.vocabulary_)

In [None]:
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
word_counts = pd.DataFrame(X.toarray(), columns=feature_names)

In [None]:
# add nct_id to word_counts df
nct_id = df['nct_id']

# Row i of word_counts describes the i-th row of df (by position), but df keeps
# the index labels it was loaded with, so an index-based merge can pair an
# nct_id with another trial's counts. Align the two by position instead.
df_word_counts = pd.concat([nct_id.reset_index(drop=True), word_counts], axis=1)

In [None]:
df_word_counts.head()

_______

In [None]:
# Check all dataframes in space
%who DataFrame

In [None]:
# upload count_vectorizer to database
# sqlite limit = 2000
df_word_counts.to_sql('word_counts', con=conn)

In [None]:
df_counts = pd.read_sql_query("SELECT * from word_counts;", conn)
df_counts.head()

In [None]:
# List tables in the database
pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)

In [None]:
conn.close()

--------

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
transformer = TfidfTransformer()
tweights = transformer.fit_transform(X)
tweights

In [None]:
# turn weights data into a dataframe
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
tf = pd.DataFrame(tweights.toarray(), columns=feature_names)

In [None]:
# Top terms by average tf-idf weight
weights = np.asarray(tweights.mean(axis=0)).ravel().tolist()
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
weights_df = pd.DataFrame({'term': feature_names, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(10)

In [None]:
# Create new dataframe with nct-id and merge with tf by index
pd.set_option('display.max_columns', None) 

df_tf = df['nct_id']
df_tf.head()

In [None]:
# Align by position: tf is built from a plain array and so has a fresh
# 0..n-1 index, while df_tf still carries the index labels of df; merging on
# the index can therefore pair an nct_id with another trial's weights.
df_tf = pd.concat([df_tf.reset_index(drop=True), tf], axis=1)

In [None]:
df_tf.head()

## Test merged dataframes with tf-idf results

In [None]:
# import random

# def compare_results():
#     row = random.randint(0,3000)
#     df_tf.drop('nct_id', axis = 1)
#     print('Random row: {}'.format(row))
#     return df_tf.loc[row], tf.loc[row]

In [None]:
# compare_results()

## Calculate similarity between docs

In [None]:
# calculate similarity
similarity = tweights * tweights.T

In [None]:
# Save all data in a dataframe
df_docs_similarity = pd.DataFrame(similarity.toarray())

<b>To-do: Given an NCT ID record, find similar documents
& return a dataframe with the results</b>

In [None]:
# Find the most similar documents of a given record

def find_similar_docs(record, rate):
    """Return the rows of `df` whose similarity to one document exceeds `rate`.

    Parameters
    ----------
    record : int
        Position of the reference document in `df_docs_similarity`.
    rate : float
        Threshold; only scores strictly greater than it are kept.

    Returns
    -------
    The rows of `df` selected by position. The number of matches is printed
    as a side effect.
    """
    treshold = rate
    scores = df_docs_similarity.iloc[record]
    trials_id = list(scores[scores > treshold].index.values)

    print('{} similar trials with treshold {}'.format(len(trials_id), treshold))
    return df.iloc[trials_id]


In [None]:
find_similar_docs(0, 0.50)