# Pre-defined searches

In [4]:
from sqlalchemy import create_engine
import numpy as np
import pandas as pd
import psycopg2
import os

In [12]:
# Change data types
#
# ALTER TABLE trials_12_1_19
# ALTER COLUMN nct_id TYPE varchar(11),
# ALTER COLUMN study_first_submitted TYPE date USING study_first_submitted::date,
# ALTER COLUMN verification_date TYPE date USING verification_date::date,
# ALTER COLUMN study_first_posted TYPE date USING study_first_posted::date,
# ALTER COLUMN last_update_submitted TYPE date USING last_update_submitted::date,
# ALTER COLUMN last_update_posted TYPE date USING last_update_posted::date,
# ALTER COLUMN recruiting_labels TYPE smallint;


# Drop columns
#
# ALTER TABLE table_name 
# DROP COLUMN column_name;

# Get size of table, number of rows
#
# SELECT pg_size_pretty( pg_total_relation_size('trials_12_1_19') ),
# count(*) from trials_12_1_19; 

# Create table from select statement
# CREATE TABLE new_table
# as 
# SELECT *
# from trials_12_1_19
# WHERE all_text like '%term%'

### Sqlite connection

In [10]:
# Open a connection to the working SQLite database (relative path) and
# display the connection object as the cell output.
import sqlite3
sqlite_path = '../data/working_data/working-database.db'
connsql = sqlite3.connect(sqlite_path)
connsql

<sqlite3.Connection at 0x10e4cb1f0>

In [11]:
# Show the indexes defined on the all_trials table of the SQLite database
pd.read_sql_query(sql="PRAGMA index_list(all_trials);", con=connsql)

Unnamed: 0,seq,name,unique,origin,partial
0,0,idx1,0,c,0
1,1,ix_all_trials_index,0,c,0


In [13]:
# Read every row of the SQLite all_trials table into the DataFrame `sq`;
# the %time line magic reports how long the read takes.
%time sq = pd.read_sql_query("SELECT * from all_trials", connsql)

CPU times: user 144 ms, sys: 115 ms, total: 259 ms
Wall time: 1.97 s


## Connect to postgres

In [21]:
# Local postgres
#
# Builds the SQLAlchemy engine used by every postgres query below.
# The server/user prefix defaults to the "cpj laptop" account; set the
# CLINICALTRIALS_PG_HOST environment variable (same format as the strings
# below, ending in '/') to use a different server or user without editing
# this cell.

# macbook air, main postgres
host = 'postgresql://cms@localhost:5432/'

# cpj laptop
host_cpj = 'postgresql://cmserna@localhost:5432/'

db = 'clinicaltrials'
# Environment override first, otherwise the original hard-coded default
connection = os.environ.get('CLINICALTRIALS_PG_HOST', host_cpj) + db

engine = create_engine(connection)

In [22]:
# Copy the DataFrame read from SQLite into the postgres table trials_12_1_19.
# NOTE(review): no if_exists is passed, so pandas' default ('fail') applies and
# re-running this cell should raise once the table exists; confirm before Run All.
sq.to_sql(name='trials_12_1_19', con=engine)

In [24]:
# Read the full trials_12_1_19 table back from postgres into `df` (timed with
# the %time line magic), then show its (rows, columns) shape.
%time df = pd.read_sql_query('select * from "trials_12_1_19"',con=engine)
df.shape

CPU times: user 116 ms, sys: 35.8 ms, total: 152 ms
Wall time: 283 ms


(5984, 32)

## Query db over ssh tunnel

In [None]:
# https://gist.github.com/amirziai/9cc792e41d3241a17a18

## Create df with pre-defined search

### Celiac

In [None]:
### Run in pgAdmin4
# CREATE TABLE celiac AS
# SELECT *
# FROM trials_12_1_19
# WHERE all_text LIKE '%celiac%'

In [None]:
# Load the celiac table from postgres into a DataFrame (timed with %time)
# and show its (rows, columns) shape.
%time celiac = pd.read_sql_query('select * from "celiac"',con=engine)
celiac.shape

In [None]:
# Preview the first rows of the celiac DataFrame
celiac.head()

### Breast Cancer

In [None]:
## Run in pgAdmin4 (creates the breastcancer table read in the next cell)
# CREATE TABLE breastcancer AS
# SELECT * FROM trials_12_1_19
# WHERE all_text LIKE '%breast%'
# AND
# all_text LIKE '%cancer%';

In [None]:
# Load the breastcancer table from postgres into a DataFrame (timed with %time)
# and show its (rows, columns) shape.
%time breast_cancer = pd.read_sql_query('select * from "breastcancer"',con=engine)
breast_cancer.shape

In [None]:
# Preview the first rows of the breast_cancer DataFrame
breast_cancer.head()

In [None]:
# Compare dataframes with main one (main search term) and remove duplicates
# https://stackoverflow.com/questions/20225110/comparing-two-dataframes-and-getting-the-differences

## Function to search in a list of terms

In [None]:
# Postgres table to store pre-defined searches
# CREATE TABLE search_terms(
#    search_term TEXT UNIQUE NOT NULL,
#    return_table VARCHAR (50) NOT NULL
# );

## INSERT DATA MANUALLY
# INSERT INTO search_terms (search_term, return_table)
# VALUES ( 'breast cancer', 'breastcancer');

# ALTER TABLE <table_name> ADD PRIMARY KEY (id);

In [None]:
# Basic search logic: pull the whole search_terms table into a DataFrame so
# that later cells can look up which result table belongs to a search term.
search_table = "SELECT * FROM search_terms"
partial_query = pd.read_sql_query(sql=search_table, con=engine)

In [None]:
# Display the rows of the search_terms table held in partial_query
partial_query

In [None]:
# Find match in table
# https://davidhamann.de/2017/06/26/pandas-select-elements-by-string/

# Term to look up in search_terms
search_value = 'breast cancer'
# String formatting step (candidate for a helper function): lower-case the term
result_value = str.lower(search_value)
print(result_value)


In [None]:
# Select the search_terms rows whose search_term matches the normalised term.
# NOTE: Series.str.match treats result_value as a regular expression anchored
# at the start of the string, so this is a prefix match rather than equality.
matching_rows = partial_query[partial_query['search_term'].str.match(result_value)]
# Take return_table from the first matching row (IndexError if nothing matched)
table_value = matching_rows['return_table'].values[0]
print(table_value)

In [None]:
# Build the query for the table mapped to the search term and load it into df.
# table_value is read from the search_terms table in the previous cell.
full_query = "SELECT * FROM {}".format(table_value)

df = pd.read_sql_query(full_query, con=engine)

In [None]:
# (rows, columns) of the DataFrame currently held in df
df.shape

In [None]:
# Search the search_terms table for a term and load the table it maps to.
# If anything in the lookup fails (most commonly: no row matches the term),
# fall back to an empty DataFrame that still has the trial columns.

search_value = 'bananas'
# LIMIT 0 returns no rows but keeps the column layout of the celiac table
empty_query = "SELECT * FROM celiac limit 0"

# Normalise the term inside this cell. Relying on result_value left over from
# an earlier cell would silently search for the previous term instead of
# search_value.
result_value = search_value.lower()

try:
    matches = partial_query[partial_query['search_term'].str.match(result_value)]
    # .values[0] raises IndexError when no search term matches
    table_value = matches.return_table.values[0]
    full_query = "SELECT * FROM " + str(table_value)
    df = pd.read_sql_query(full_query, con=engine)
except Exception:
    # Best-effort fallback kept from the original cell; `Exception` (rather
    # than a bare except) lets KeyboardInterrupt / SystemExit propagate.
    df = pd.read_sql_query(empty_query, con=engine)

df.head()

In [None]:
# Pending: Refine logic for string matching
# https://stackabuse.com/python-check-if-string-contains-substring/

## Data annotation
- Extract entities and cross-references
- Determine which external results to add: pubmed, fda...

In [None]:
# Compare the number of distinct nct_id values with the row count of df and
# report whether any nct_id occurs more than once.
uniques = df['nct_id'].nunique()
totals = len(df)

print('No duplicates' if uniques == totals else 'duplicates')

In [None]:
# Check for trials with published results
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5961767/

## Baseline model: fastText, PubMed 200k RCT

In [None]:
# https://github.com/jakelever/kindred