In [2]:
import pandas as pd
import argparse
from datetime import datetime
import dateutil.relativedelta
import os

### Database Functions
`database_functions` is a module that we use to connect to the OMOP database. It manages the server configurations and creates the connection object for querying. We use the `dbf.query()` and `dbf.chunked_query()` functions to manage our SQL queries. 

In [3]:
import database_functions as dbf

# Training Data

### Defining the Cutoff Date
We define the final date in the training dataset as the date that is 6 months prior to the last recorded death date.
Using the death table, we find the maximum value of `death_datetime` and subtract 6 months from that date.

In [4]:
def get_cutoff_date():
    """Return the training cutoff date: six months before the latest death.

    Runs ``SELECT MAX(death_datetime)`` against ``amalga.omop.death`` through
    ``dbf.query``, truncates the first value of the "date" column to a
    calendar date and subtracts six months from it.

    Returns:
        ``<latest death date> - relativedelta(months=6)``.
        Assumes ``dbf.query`` returns a DataFrame-like object whose "date"
        column holds datetime values exposing ``.date()`` -- TODO confirm.
    """
    latest_death_sql = """
        SELECT MAX(death_datetime) as date
        FROM amalga.omop.death
    """
    latest_death = dbf.query(latest_death_sql)["date"][0]

    six_months = dateutil.relativedelta.relativedelta(months=6)
    return latest_death.date() - six_months

### Creating Dataset with cut off
Using the cutoff function above, we pull in the cutoff date. We first filter the dataset down to include only people who have at least one visit prior to the cutoff. All other persons and their records are ignored.
We then ignore all records that have an associated date that is greater than or equal to our cutoff date.

In [8]:
def building_training_sets(output_dir="data/latest", chunksize=8000000):
    """Export every pre-cutoff OMOP record to chunked CSV files.

    For each table listed in ``tables_dates`` a row is kept only when

    * the row's person has at least one visit that started before the
      cutoff date, and
    * the row's own date column is strictly earlier than the cutoff date.

    Each result set is read in chunks and every chunk is written to
    ``{output_dir}/{table}_{chunk}.csv`` (chunk numbers start at 1).

    Args:
        output_dir: Directory that receives the CSV files. It is created
            when missing. Defaults to the previously hard-coded
            "data/latest".
        chunksize: Maximum number of rows fetched per chunk, and therefore
            per CSV file.

    Returns:
        None. The function is run for its side effect of writing files.
    """
    # ISO date string, e.g. 2018-08-24
    cutoff_date = get_cutoff_date().strftime("%Y-%m-%d")

    # OMOP table -> date column used to decide whether a row is pre-cutoff.
    tables_dates = {
        'person': 'birth_datetime',
        'death': 'death_datetime',
        'drug_exposure': 'drug_exposure_start_date',
        'observation': 'observation_date',
        'observation_period': 'observation_period_start_date',
        'visit_occurrence': 'visit_start_date',
        'condition_occurrence': 'condition_start_date',
        'measurement': 'measurement_date',
        'procedure_occurrence': 'procedure_date',
    }

    # DataFrame.to_csv does not create missing directories, so make sure the
    # target exists before the first (potentially very slow) query runs.
    os.makedirs(output_dir, exist_ok=True)

    for table, date_column in tables_dates.items():
        training_query = f"""
            SELECT * 
            FROM amalga.omop.{table} tab
            WHERE EXISTS
                (
                    SELECT vis.person_id
                    FROM amalga.omop.visit_occurrence vis
                    WHERE 
                        vis.visit_start_date < '{cutoff_date}' AND
                        vis.person_id=tab.person_id
                ) AND
                tab.{date_column} < '{cutoff_date}';
        """
        # NOTE(review): a connection object is requested per table and is not
        # released here; confirm whether dbf.database_connection() returns
        # something that needs an explicit close/dispose.
        chunks = pd.read_sql_query(
            training_query,
            con=dbf.database_connection(),
            chunksize=chunksize,
        )

        for chunk_number, frame in enumerate(chunks, start=1):
            print(table, chunk_number)
            frame.to_csv(f"{output_dir}/{table}_{chunk_number}.csv", index=False)


### Merging the Data
After this code runs, we merge the files into single tables using bash concatenate.

# Creating the Benchmark

### Defining the benchmark
For the mortality prediction challenge, we will ask participants to predict the mortality status of all patients who have had at least one visit in the month preceding the cutoff date. 

We will ask participants for predictions of patient mortality status for 7 days, 1 month, 3 months, and 6 months from the cutoff date. Each participant model will output a two-column file containing the patient ID of every patient who fits the above criteria and a score between 0 and 1 that represents the likelihood that the patient will have passed away within the target timeframe.

To create the benchmark prediction file, we need to find all patients who have at least one visit in the month preceding the cutoff date and to pull their death status from the death table for the specified timeframe.

In [10]:
def finding_benchmark_patients():
    """Fetch the benchmark cohort together with any recorded death date.

    The cohort is every person with at least one visit whose start date lies
    in the month before the cutoff, i.e. in ``[cutoff - 1 month, cutoff)``.

    Returns:
        The result of ``dbf.query`` for the cohort query, with the columns
        ``person_id`` and ``death_date``. ``death_date`` is NULL for people
        who have no row in the death table.
    """
    # Fetch the cutoff once so both window bounds derive from the same value
    # (and the database is only asked once).
    cutoff = get_cutoff_date()
    cutoff_date = cutoff.strftime("%Y-%m-%d")

    # Start of the one-month window that ends at the cutoff date.
    prev_month_start = (
        cutoff - dateutil.relativedelta.relativedelta(months=1)
    ).strftime("%Y-%m-%d")

    # LEFT JOIN keeps people without a death record (death_date is NULL for
    # them). The EXISTS subquery has to be correlated on person_id: without
    # that predicate it is true for *every* person as soon as any visit at
    # all falls inside the window.
    query = f"""
        SELECT p.person_id, d.death_date
        FROM amalga.omop.person p
        LEFT JOIN amalga.omop.death d ON p.person_id = d.person_id
        WHERE
            EXISTS (
                SELECT 1
                FROM amalga.omop.visit_occurrence v
                WHERE
                    v.person_id = p.person_id AND
                    v.visit_start_date >= '{prev_month_start}' AND
                    v.visit_start_date < '{cutoff_date}'
            )
    """
    return dbf.query(query)

Now that we've found all the patients that fit our criteria, we need to create a status file with the mortality status of patients for the specified timeframe. The function below allows us to create mortality status files.

In [12]:
def building_goldstandard_files(timeframe):
    """Build the gold-standard mortality labels for one prediction window.

    Args:
        timeframe: Length of the prediction window in days, counted from the
            cutoff date returned by ``get_cutoff_date``.

    Returns:
        DataFrame with the columns ``person_id`` and ``status``. ``status``
        is 1 when the person's ``death_date`` is strictly earlier than
        ``cutoff + timeframe`` days (deaths before the cutoff included) and
        0 otherwise, including when no death date is recorded.
        Assumes ``finding_benchmark_patients`` returns a pandas DataFrame.
    """
    # Exclusive upper bound of the prediction window.
    window_end = pd.Timestamp(get_cutoff_date()) + pd.Timedelta(days=timeframe)

    benchmarks = finding_benchmark_patients()

    # Normalise to datetime64 so the comparison behaves the same whether the
    # driver handed back date objects, Timestamps or strings, and so that
    # every flavour of "missing" (None, NaN, NaT) becomes NaT.
    death_dates = pd.to_datetime(benchmarks["death_date"])

    # Any comparison against NaT is False, so people without a recorded
    # death are labelled 0.
    benchmarks["status"] = (death_dates < window_end).astype("int64")
    return benchmarks[["person_id", "status"]]