# Prepare working db

In [1]:
import json, os
import numpy as np
import pandas as pd
import sqlite3
# import nltk

## Load json file with parsed information

In [2]:
# Folder holding the JSON export, resolved against the current working directory
path_to_json_file = os.path.abspath('../data/json/')

# File name without extension; its leading '/' glues it onto the folder above
json_file = '/all_parsed_data_json'
# json_file = '/sample_json'  # smaller sample file for testing

# Full path: <folder><name>.json
file = path_to_json_file + json_file + '.json'

In [3]:
# Load the JSON file at `file` into a dataframe
df = pd.read_json(file)

In [None]:
# Display full dataframe
# pd.set_option('display.max_rows', 50)
# pd.set_option('display.max_columns', 50)
# pd.set_option('display.width', 1000)

In [5]:
# (rows, columns) of the loaded dataframe
df.shape

(311536, 24)

## Clean dataset

In [17]:
# change column type
def data_types(dataframe, cols=None, to_type=''):
    """Cast selected columns of ``dataframe`` to ``to_type``, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are converted; it is modified in place.
    cols : iterable of str, optional
        Names of the columns to convert. Nothing is converted when omitted.
    to_type : str or dtype
        Target dtype, handed unchanged to ``Series.astype``.

    Returns
    -------
    None
    """
    if cols is None:
        cols = []
    for col in cols:
        # Read from the frame that was passed in, not from a notebook-level global.
        dataframe[col] = dataframe[col].astype(to_type)

In [18]:
# Columns to convert to datetimes
columns_dates = ['study_first_submitted', 'last_update_submitted', 'verification_date']
# Spell out the unit: pandas 2.x rejects the unit-less 'datetime64' in astype.
data_types(df, columns_dates, 'datetime64[ns]')

In [19]:
# Derive one year column from each datetime column
year_sources = {
    'year_submitted': 'study_first_submitted',
    'year_last_updated': 'last_update_submitted',
    'year_verification': 'verification_date',
}
for year_column, date_column in year_sources.items():
    df[year_column] = df[date_column].dt.year

In [20]:
# Remove the columns listed in columns_dates from df
df.drop(columns=columns_dates, inplace=True)

In [21]:
# Drop the two "posted" date columns
columns_to_drop = ['study_first_posted', 'last_update_posted']
df.drop(columns=columns_to_drop, inplace=True)

In [22]:
# Replace every newline inside string cells with a single space
df = df.replace(to_replace=r'\n', value=' ', regex=True)

In [23]:
# Create new column for all text: the free-text columns joined with single spaces
text_columns = [
    'source',
    'brief_title',
    'condition',
    'condition_browse/mesh_term',
    'intervention_browse/mesh_term',
    'detailed_description/textblock',
    'brief_summary/textblock',
]
all_text = df[text_columns[0]]
for column in text_columns[1:]:
    all_text = all_text + ' ' + df[column]

# Lowercase, then collapse runs of whitespace to one space.
# The pattern is a raw string: '\s' in a normal string literal is an invalid escape sequence.
df['all_text'] = all_text.str.lower().replace(r'\s+', ' ', regex=True)


In [24]:
# Remove extra white space in summary
# (raw-string pattern: '\s' in a normal string literal is an invalid escape sequence)
df['brief_summary/textblock'] = df['brief_summary/textblock'].replace(r'\s+', ' ', regex=True)
# Spot-check the summary stored under index label 0
df['brief_summary/textblock'][0]

' This study was conducted to compare the activities of erlotinib to that of intravenous, platinum-based therapy in the treatment of non-small cell lung cancer (NSCLC). The goal of this trial was to demonstrate clinical equivalence of erlotinib to platinum-based frontline therapy, compared to historical controls. '

In [33]:
# Add a url column: the base address followed by the trial's nct_id
url_string = 'https://clinicaltrials.gov/ct2/show/'
nct_ids_as_text = df['nct_id'].astype(str)
df['url'] = url_string + nct_ids_as_text

In [37]:
# Preview the first rows of df
df.head()

Unnamed: 0,nct_id,source,brief_title,overall_status,study_type,phase,condition,condition_browse/mesh_term,intervention_browse/mesh_term,detailed_description/textblock,...,sponsors/lead_sponsor/agency,sponsors/lead_sponsor/agency_class,study_design_info/allocation,study_design_info/intervention_model,study_design_info/primary_purpose,year_submitted,year_last_updated,year_verification,all_text,url
0,NCT00391586,New Mexico Cancer Care Alliance,Erlotinib and Standard Platinum-Based Chemothe...,Terminated,Interventional,Phase 2,"Carcinoma, Non-Small-Cell Lung",Carcinoma,Paclitaxel,To compare the activities (the progress...,...,New Mexico Cancer Care Alliance,Other,,Single Group Assignment,Treatment,2006,2015,2015.0,new mexico cancer care alliance erlotinib and ...,https://clinicaltrials.gov/ct2/show/NCT00391586
1,NCT03472664,Wake Forest University Health Sciences,Brain Energy for Amyloid Transformation in Alz...,Recruiting,Interventional,,Alzheimer Disease,Alzheimer Disease,,This study will examine the effects of ...,...,Wake Forest University Health Sciences,Other,Randomized,Parallel Assignment,Treatment,2018,2018,2018.0,wake forest university health sciences brain e...,https://clinicaltrials.gov/ct2/show/NCT03472664
10,NCT01009658,Gunma University,MSG and Gastrointestinal Motility,Completed,Interventional,Phase 3,Gastroesophageal Reflux,Gastroesophageal Reflux,,Amino acids such as monosodium glutamat...,...,Gunma University,Other,Randomized,Crossover Assignment,Basic Science,2009,2015,2015.0,gunma university msg and gastrointestinal moti...,https://clinicaltrials.gov/ct2/show/NCT01009658
100,NCT03651011,Odense University Hospital,Navigated Laser In Branch Retinal Vein Occlusi...,Recruiting,Interventional,Phase 1,Branch Retinal Vein Occlusion,Macular Edema,,Purpose of the study In a 12-mon...,...,Odense University Hospital,Other,Randomized,Parallel Assignment,Treatment,2018,2019,2019.0,odense university hospital navigated laser in ...,https://clinicaltrials.gov/ct2/show/NCT03651011
1000,NCT02424045,Samsung Medical Center,"Bendamustine, Carboplatin and Dexamethasone (B...",Completed,Interventional,Phase 1,T-cell Lymphoma,Lymphoma,Dexamethasone,Peripheral T-cell lymphoma (PTCL) repre...,...,Samsung Medical Center,Other,,Single Group Assignment,Treatment,2015,2018,2007.0,"samsung medical center bendamustine, carboplat...",https://clinicaltrials.gov/ct2/show/NCT02424045


## Preprocess dataset
- Tokenize, Lemmatize / Stem
- Remove stopwords

In [None]:
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer

# porter_stemmer = PorterStemmer()
# Shared lemmatizer instance used by lemm_sentences
# NOTE(review): presumably needs the NLTK WordNet corpus to be available locally — confirm
lemmatizer = WordNetLemmatizer() 

In [None]:
# Stemming
# def stem_sentences(text):
#     tokens = text.split()
#     stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
#     return ' '.join(stemmed_tokens)

# df['stems'] = df['all_text'].apply(stem_sentences)

In [None]:
# Lemmatizing
def lemm_sentences(text):
    """Lemmatize each whitespace-separated token of ``text``.

    The tokens are passed one by one to the module-level ``lemmatizer`` and
    the results are joined back together with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

# Lemmatized version of all_text, one string per row
df['lemmas'] = df['all_text'].map(lemm_sentences)

In [None]:
# Remove stop words [not needed with countvectorizer]
# stop_words = stopwords.words('english')
# df['tokens'] = df['lemmas'].apply(lambda x: [item for item in x.split() if item not in stop_words])

In [None]:
# Inspect the first five lemmatized documents
df.lemmas[0:5]

## Bags of words

In [None]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re

In [None]:
def pat_numbers(x):
    """Lower-case ``x`` and remove every run of digits from it."""
    return re.sub(r'(\d)+', '', x.lower())


# Bag-of-words vectorizer: English stop words removed, digits stripped by the
# preprocessor, 1- to 3-grams, at most 1800 features, and terms that occur in
# more than 80% of the documents ignored.
cv = CountVectorizer(
    stop_words='english',
    preprocessor=pat_numbers,
    max_features=1800,
    lowercase=True,
    max_df=0.8,
    ngram_range=(1, 3),
)

In [None]:
# One lemmatized document per row, as a plain Python list
docs = df['lemmas'].tolist()

# Learn the vocabulary and build the document-term count matrix
X = cv.fit_transform(docs)

In [None]:
# Number of terms in the fitted vocabulary
len(cv.vocabulary_)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old method on older releases.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
# Dense term-count table: one row per document, one column per vocabulary term
word_counts = pd.DataFrame(X.toarray(), columns=feature_names)

In [None]:
# add nct_id to word_counts df
nct_id = df['nct_id']

# word_counts rows are in the same order as df but carry a fresh 0..n-1 index, while df
# keeps the labels it was loaded with (0, 1, 10, 100, 1000, ...). Joining those two indexes
# as they are would pair most ids with another trial's counts, so word_counts is given
# df's labels before the merge.
df_word_counts = pd.merge(nct_id, word_counts.set_index(nct_id.index),
                          left_index=True, right_index=True)

In [None]:
# Preview the id + term-count table
df_word_counts.head()

_______

## Export working dataset

In [None]:
# List the DataFrame variables currently defined in the notebook namespace
%who DataFrame

In [None]:
# Export datasets: df, df_word_counts (key: nct_id)

path_to_working_datasets = os.path.abspath('../data/working_data')

# Create the output folder, including any missing parents. An existing folder is fine;
# real failures (e.g. missing permissions) are raised instead of being swallowed.
os.makedirs(path_to_working_datasets, exist_ok=True)

### Upload data to sqlite db

In [None]:
import sqlite3

# SQLite file the tables are written to; sqlite3 creates the file if it is missing
database_file = '../data/working_data/database.db'
conn = sqlite3.connect(database_file)

conn

In [None]:
# upload df to database
# if_exists='replace' overwrites a table left by an earlier run; the default ('fail')
# raises ValueError as soon as the notebook is re-run against an existing database file.
df.to_sql('all_trials', con=conn, if_exists='replace')

In [None]:
# Read the all_trials table back to check the upload
all_trials_query = "SELECT * from all_trials;"
df_results = pd.read_sql_query(all_trials_query, conn)
df_results.head()

In [None]:
# Try a substring search on the combined text column and count the matches
breast_cancer_query = "SELECT * from all_trials WHERE all_text LIKE '%breast cancer%';"
search_query = pd.read_sql_query(breast_cancer_query, conn)
len(search_query)

In [None]:
# upload count_vectorizer to database
# sqlite limit = 2000 (SQLite's default maximum number of columns per table)
# if_exists='replace' overwrites a table left by an earlier run; the default ('fail')
# raises ValueError as soon as the notebook is re-run against an existing database file.
df_word_counts.to_sql('word_counts', con=conn, if_exists='replace')

In [None]:
# Read the word_counts table back to check the upload
word_counts_query = "SELECT * from word_counts;"
df_counts = pd.read_sql_query(word_counts_query, conn)
df_counts.head()

In [None]:
# List the tables stored in the database
pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)

In [None]:
# Close the database connection opened for the export
conn.close()

--------

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Re-weight the term counts in X with tf-idf
transformer = TfidfTransformer()
tweights = transformer.fit_transform(X)
# Display the resulting matrix
tweights

In [None]:
# turn weights data into a dataframe
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old method on older releases.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
tf = pd.DataFrame(tweights.toarray(), columns=feature_names)

In [None]:
# Top terms by average tf-idf weight
# Column-wise mean of the tf-idf matrix, flattened to one plain float per term
weights = np.asarray(tweights.mean(axis=0)).ravel().tolist()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old method on older releases.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
weights_df = pd.DataFrame({'term': feature_names, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(10)

In [None]:
# Create new dataframe with nct_id and merge with tf by index
# Show every column when a frame is displayed
pd.set_option('display.max_columns', None) 

# Start from the nct_id column of df (a single column, so a Series at this point)
df_tf = df['nct_id']
df_tf.head()

In [None]:
# tf rows are in the same order as df but carry a fresh 0..n-1 index, while df_tf keeps
# the labels df was loaded with (0, 1, 10, 100, 1000, ...). Joining those two indexes as
# they are would pair most ids with another trial's weights, so tf is given df_tf's
# labels before the merge.
df_tf = pd.merge(df_tf, tf.set_index(df_tf.index), left_index=True, right_index=True)

In [None]:
# Preview the id + tf-idf weight table
df_tf.head()

## Test merged dataframes with tf-idf results

In [None]:
# import random

# def compare_results():
#     row = random.randint(0,3000)
#     df_tf.drop('nct_id', axis = 1)
#     print('Random row: {}'.format(row))
#     return df_tf.loc[row], tf.loc[row]

In [None]:
# compare_results()

## Calculate similarity between docs

In [None]:
# Calculate similarity: dot product of every tf-idf row with every other tf-idf row
similarity = tweights.dot(tweights.T)

In [None]:
# Save all data in a dataframe
# NOTE(review): similarity has one row and one column per document, so toarray() builds a
# dense n x n array. With the 311,536 rows reported by df.shape earlier in this notebook
# that is about 9.7e10 cells, which will not fit in ordinary memory. Consider running
# this on a sample of the documents — TODO confirm.
df_docs_similarity = pd.DataFrame(similarity.toarray())

<b>To-do: Given an NCT ID, find similar documents
& return a dataframe with the results</b>

In [None]:
# Find more similar documents of a given record

def find_similar_docs(record, rate):
    """Return the trials whose similarity to one trial exceeds a threshold.

    Parameters
    ----------
    record : int
        Positional (0-based) row of the reference trial in ``tweights`` / ``df``.
    rate : float
        Similarity threshold; only trials scoring strictly above it are kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` at the matching positions, in ascending position order.

    Notes
    -----
    Prints the number of matches before returning.
    """
    treshold = rate
    # Compute only the requested row of the similarity matrix from the tf-idf weights,
    # so the function does not need the dense documents x documents table in memory.
    scores = tweights[record].dot(tweights.T).toarray().ravel()
    trials_id = np.flatnonzero(scores > treshold)

    print('{} similar trials with treshold {}'.format(len(trials_id), treshold))
    return df.iloc[trials_id]


In [None]:
# Example: trials scoring above 0.50 against the trial at position 0
find_similar_docs(0, 0.50)