## Script to create a working AACT database

In [None]:
import pandas as pd
import psycopg2
%load_ext sql

In [None]:
# connecting to aact database
# Opens the ipython-sql connection: PostgreSQL on localhost:5432, database "aact",
# user "cms".
# NOTE(review): the password is embedded in this URL and is saved with the notebook —
# consider supplying the connection string from outside the file.
%sql postgresql://cms:postgres@localhost:5432/aact

#### Test queries

In [None]:
# Scratch queries kept for reference. Every statement in this cell is commented out,
# so running the cell does nothing.

# List the tables in the ctgov schema, alphabetically
# SELECT table_name FROM information_schema.tables \
# WHERE table_schema = 'ctgov'\
# order by table_name asc;

# Keyword search example: rows of ctgov.all_keywords whose names contain 'celiac'
# select * from ctgov.all_keywords
# where all_keywords.names like '%celiac%';

# Save query in a dataframe
# %time df = %sql select * from ctgov.studies
# %time df = df.DataFrame()

# Row counts of two tables returned as a single row
# SELECT
# (select count(*) as count_mesh_terms from ctgov.mesh_terms), 
# (select count(*) as count_conditions from ctgov.conditions);

## Create new trials db
- Info on AACT schema: https://aact.ctti-clinicaltrials.org/data_dictionary
- Main tables: ctgov.studies, ctgov.brief_summaries, ctgov.interventions, ctgov.conditions
- all_conditions: mesh_terms & conditions

In [None]:
# Check number of records in different tables

%time counts = %sql SELECT \
(select count(*) as all_trials from trials), \
(select count(*) as conditions from ctgov.conditions),\
(select count(*) as browse_conditions from ctgov.browse_conditions),\
(select count(*) as all_conditions from ctgov.all_conditions),\
(select count(*) as browse_interventions from ctgov.browse_interventions),\
(select count(*) as brief_summaries from ctgov.brief_summaries)

counts

### Create trials table

In [1]:
# Disabled: this cell is fully commented out, so nothing is created when it runs.
# Other cells in this notebook query and alter `trials`, so the table has to exist
# before they are executed.
# The statement builds `trials` from ctgov.studies, keeping a subset of columns and
# deriving three extra ones:
#   - submitted_to_qc: QC date minus first-submitted date
#   - results: true when results_first_submitted_date is set
#   - stopped: true when why_stopped is set
# NOTE(review): the statement spans several lines; to re-enable it use the `%%sql`
# cell magic (or `%sql` with backslash line continuations, as elsewhere in this notebook).
# %sql 
# CREATE TABLE trials as
# SELECT 
#     nct_id, 
#     phase, 
#     study_first_submitted_date, 
#     study_first_submitted_qc_date,
#     "study_first_submitted_qc_date"::date - "study_first_submitted_date"::date AS submitted_to_qc,
#     study_first_posted_date,
#     results_first_submitted_date is not null as results,
#     study_type,
#     overall_status,
#     why_stopped is not null as stopped,
#     why_stopped,
#     has_expanded_access is true as has_expanded_access,
#     is_fda_regulated_drug,
#     is_fda_regulated_device,
#     is_unapproved_device,
#     official_title,
#     acronym,
#     source
#     FROM ctgov.studies;

### Add column with brief_description

In [None]:
# Find trials with null values
%sql SELECT \
   nct_id, \
   'not in summaries' as note \
FROM \
   trials \
EXCEPT \
   SELECT \
    nct_id, \
    'not in summaries' as note \
  FROM \
    ctgov.brief_summaries

In [None]:
# Create column for description
ALTER TABLE trials
ADD COLUMN description text;

In [None]:
# Select brief_summaries to trials table
UPDATE trials AS t1 
SET description = t2.brief_summaries
FROM brief_summaries AS t2
WHERE t1.nct_id = t2.nct_id

# JOIN
# %sql select * from trials as t1 \
# left join ctgov.brief_summaries as t2 \
# on t1.nct_id = t2.nct_id

In [None]:
# Check
select count(*) from trials
where description is null

### Add column with interventions

In [None]:
# Find trials with null values
%sql SELECT \
   nct_id, \
   'not in summaries' as note \
FROM \
   trials \
EXCEPT \
   SELECT \
    nct_id, \
    'not in summaries' as note \
  FROM \
    ctgov.interventions

In [None]:
UPDATE trials AS t1 
SET description = t2.brief_summaries
FROM brief_summaries AS t2
WHERE t1.nct_id = t2.nct_id

In [None]:
# Table with mesh_terms and conditions
# Info: there are several rows per NCT_ID in mesh terms and conditions
CREATE TABLE all_conditions as
SELECT 
    mesh_terms.downcase_mesh_term, conditions.downcase_name, conditions.nct_id
FROM 
    ctgov.mesh_terms
INNER JOIN 
    ctgov.conditions 
ON 
    mesh_terms.id = conditions.id;


### Add column with conditions [search]

In [None]:
# Find trials with null values
%sql SELECT \
   nct_id, \
   'not in summaries' as note \
FROM \
   trials \
EXCEPT \
   SELECT \
    nct_id, \
    'not in summaries' as note \
  FROM \
    ctgov.conditions