## SQL Queries to Create Tables Used for Project

> The tables were created in AWS Athena. In some cases, CSV files were first produced in AWS SageMaker
> and saved to an S3 bucket, then loaded back into Athena as external tables.

In [None]:
!pip install PyAthena
from pyathena import connect
from pyathena.pandas.util import as_pandas


# Import libraries
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import boto3
from botocore.client import ClientError
from IPython.display import display, HTML
import pickle
%matplotlib inline


# Resolve the account- and region-specific S3 bucket used for Athena query
# results, create it if it cannot be reached, and open a PyAthena cursor that
# stages its results under that bucket.
s3 = boto3.resource('s3')
client = boto3.client("sts")
account_id = client.get_caller_identity()["Account"]
my_session = boto3.session.Session()
region = my_session.region_name
if region is None:
    # Without this check the string concatenation below fails with an opaque TypeError.
    raise RuntimeError('No AWS region is configured for this session; '
                       'set AWS_DEFAULT_REGION or configure a default region.')
athena_query_results_bucket = 'aws-athena-query-results-'+account_id+'-'+region

try:
    # head_bucket raises ClientError when the bucket is missing or inaccessible.
    s3.meta.client.head_bucket(Bucket=athena_query_results_bucket)
except ClientError:
    print('Creating bucket '+athena_query_results_bucket)
    create_bucket_kwargs = {'Bucket': athena_query_results_bucket}
    if region != 'us-east-1':
        # S3 rejects bucket creation outside us-east-1 unless the target
        # region is passed explicitly as a LocationConstraint.
        create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}
    bucket = s3.create_bucket(**create_bucket_kwargs)
cursor = connect(s3_staging_dir='s3://'+athena_query_results_bucket+'/athena/temp').cursor()

## Diabetic Patients Cohort Table

In [None]:
# Create default.diabetic_patients_cohort (stored as Parquet) with one row per subject_id.
#   * Subjects are kept when they have at least one diagnosis whose ICD-9 long_title matches
#     '%iabetes%', excluding codes 3572, V771, V180 and V1221
#     (presumably codes that mention diabetes without indicating the patient has it -- confirm).
#   * Only admissions lasting more than 48 hours (DATE_DIFF in hours > 48) are considered;
#     the aggregates below are computed over those qualifying admissions only.
#   * admit_time     = earliest admittime among the qualifying admissions
#   * discharge_time = latest dischtime among the qualifying admissions
#   * mortality_flag = 1 if any qualifying admission has a non-null deathtime, else 0
# NOTE(review): the statement has no IF NOT EXISTS guard, so re-running this cell
# presumably fails once the table already exists -- confirm before a Restart & Run All.
cursor.execute(""" CREATE TABLE default.diabetic_patients_cohort
WITH (
  format='PARQUET'
) AS
SELECT admissions.subject_id, (min(admissions.admittime)) as admit_time, max(admissions.dischtime) as discharge_time, if(max(admissions.deathtime) is null,0,1) as mortality_flag
FROM mimiciii.admissions admissions 
WHERE admissions.subject_id in (SELECT DISTINCT diag.subject_id
                                FROM mimiciii.diagnoses_icd diag 
                                WHERE diag.icd9_code in (SELECT diags.icd9_code 
                                                         FROM mimiciii.d_icd_diagnoses diags 
                                                         WHERE diags.long_title LIKE '%iabetes%' AND diags.icd9_code not in ('3572', 'V771','V180', 'V1221')
                                                        )
                               ) 
AND DATE_DIFF('hour',admissions.admittime,admissions.dischtime)>48
GROUP BY admissions.subject_id
             """ )

### Retrieve Diabetic Patients Table from Athena to Split and Resample

In [None]:
# Read the whole cohort table back from Athena and materialise the cursor's
# result set as a pandas DataFrame for splitting and resampling.
query = "select * from default.diabetic_patients_cohort"
cursor.execute(
    query
)
cohort_patients_df = as_pandas(
    cursor
)

### Python Code to Split Data and Resample Training Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cohort as the test set; rows are shuffled with a fixed
# seed so the same split is produced on every run.
SPLIT_SEED = 230729
train_cohort_df, test_cohort_df = train_test_split(
    cohort_patients_df,
    train_size=0.8,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [None]:
# Inspect the class balance of the training split: rows with
# mortality_flag == 0 versus rows with mortality_flag == 1.
mortality_flag_values = train_cohort_df['mortality_flag']
class_0 = train_cohort_df[mortality_flag_values == 0]
class_1 = train_cohort_df[mortality_flag_values == 1]

# Print the (rows, columns) shape of each class, class 0 first.
for class_rows in (class_0, class_1):
    print(class_rows.shape)

### Resample Training Set
> To resolve the problem of class imbalance, we resampled the training data using the `resample`
> function from the `sklearn.utils` package.


In [None]:
# Upsample the training data to balance the two mortality classes.
from sklearn.utils import resample

# Separate the training rows into the two classes. The names follow the
# notebook's assumption that mortality_flag == 0 is the majority class.
df_majority = train_cohort_df[train_cohort_df.mortality_flag==0]
df_minority = train_cohort_df[train_cohort_df.mortality_flag==1]

# Sample the minority class with replacement until it has as many rows as the
# majority class. The target size is read from df_majority itself, so this cell
# does not depend on variables left behind by the class-count inspection cell.
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority),
                                random_state=230729)

# Combine the majority dataframe with the upsampled minority dataframe.
df_upsampled = pd.concat([df_majority,df_minority_upsampled])

# Display the new class counts (both classes should now have the same count).
df_upsampled.mortality_flag.value_counts()

In [None]:
# Add a new_subject_id column (0..n-1) as the first column so that every row of
# the upsampled training set is uniquely identified; subject_id alone repeats
# because the minority class was sampled with replacement.
df_upsampled = (
    df_upsampled
    # Dropping a previously added key makes re-running this cell safe instead
    # of silently producing a duplicate new_subject_id column.
    .drop(columns=['new_subject_id'], errors='ignore')
    .reset_index(drop=True)          # discard the pre-resampling row labels
    .rename_axis('new_subject_id')   # name the fresh 0..n-1 index
    .reset_index()                   # promote it to the first column
)

### Save train and test dataframes as csv files to s3 bucket

In [None]:
# Write the balanced training cohort to the train2/ prefix of the Athena
# results bucket as a CSV without the pandas index column.
train_cohort_csv_uri = f's3://{athena_query_results_bucket}/athena/temp/train2/train_cohort.csv'
df_upsampled.to_csv(train_cohort_csv_uri, index=False)

In [None]:
# Write the held-out test cohort to the test/ prefix of the Athena results
# bucket as a CSV without the pandas index column.
test_cohort_csv_uri = f's3://{athena_query_results_bucket}/athena/temp/test/test_cohort.csv'
test_cohort_df.to_csv(test_cohort_csv_uri, index=False)

## Balanced Training Set Cohort Table
- This table is built from the resampled (upsampled) training data, which was created to resolve the class imbalance.

In [None]:
# Register the balanced training CSV as the external Athena table train_cohort2.
# The location is derived from athena_query_results_bucket, the same bucket the
# CSV was written to, instead of a hard-coded account/region-specific bucket name.
train_cohort_location = 's3://' + athena_query_results_bucket + '/athena/temp/train2/'

# Raw f-string: Athena receives the '\\' and '\n' escape sequences exactly as
# written in the DDL, rather than Python collapsing them before the query is sent.
# skip.header.line.count=1 skips the header row written by DataFrame.to_csv.
cursor.execute(rf""" create external table if not exists train_cohort2 (
  new_subject_id int,
  subject_id int,
  admit_time timestamp,
  discharge_time timestamp,
  mortality_flag int
  )row format delimited
   fields terminated by ','
   escaped by '\\'
   lines terminated by '\n'
  location '{train_cohort_location}'
  tblproperties("skip.header.line.count"="1");
  """)

## Test Set Table

In [None]:
# Register the held-out test CSV as the external Athena table test_cohort.
# The location is derived from athena_query_results_bucket, the same bucket the
# CSV was written to, instead of a hard-coded account/region-specific bucket name.
test_cohort_location = 's3://' + athena_query_results_bucket + '/athena/temp/test/'

# Raw f-string: Athena receives the '\\' and '\n' escape sequences exactly as
# written in the DDL, rather than Python collapsing them before the query is sent.
# skip.header.line.count=1 skips the header row written by DataFrame.to_csv.
cursor.execute(rf""" create external table if not exists test_cohort (
  subject_id int,
  admit_time timestamp,
  discharge_time timestamp,
  mortality_flag int
  )row format delimited
   fields terminated by ','
   escaped by '\\'
   lines terminated by '\n'
  location '{test_cohort_location}'
  tblproperties("skip.header.line.count"="1");
  """)