In [None]:
import datetime
import os

import numpy as np
import pandas as pd
import psycopg2
from scipy import stats
from scipy.stats import mannwhitneyu

In [None]:
# Open the PostgreSQL connection used by every query in this notebook.
# Connection settings are read from the standard libpq environment variables
# so that no credential is stored in the notebook. Host, port, user and
# database fall back to the values this notebook has always used; the password
# has no fallback, so an unset PGPASSWORD fails immediately with a KeyError.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ["PGPASSWORD"],
    database=os.environ.get("PGDATABASE", "mimic"),
    # Resolve unqualified table names against the mimiciii schema first.
    options="-c search_path=mimiciii,public",
)

In [None]:

# SQL text for every table pull in this notebook. Adjacent string literals are
# concatenated by Python, so each constant is a single-line statement.

# Lab events with a numeric value, joined to their item label.
lab_query = (
    "SELECT l.subject_id,l.hadm_id, d.label, l.valuenum, l.valueuom, l.charttime "
    "from labevents l inner join d_labitems d on l.itemid=d.itemid "
    "where l.valuenum is NOT NULL;"
)

# Chart events with a numeric value, joined to their item label and category.
# NOTE(review): not referenced by any other cell visible in this notebook.
vitals_query = (
    "SELECT c.subject_id,c.hadm_id,d.label,  c.valuenum, c.valueuom ,d.category, c.charttime "
    "from chartevents_1 c inner join d_items d on c.itemid=d.itemid "
    "where c.valuenum is NOT NULL;"
)

# Patient / admission identifier pairs.
admission_query = "SELECT subject_id,hadm_id from admissions;"

# The 100 drug names with the most prescription rows.
top_meds_query = (
    "SELECT drug,  COUNT(drug) AS value_occurrence "
    "FROM  prescriptions "
    "GROUP BY drug "
    "ORDER BY value_occurrence DESC "
    "LIMIT 100;"
)

# One row per prescription with its start date, drug name and dose.
patient_presc_query = "SELECT subject_id,hadm_id,startdate,drug,dose_val_rx from prescriptions;"

In [None]:
def remove_multiple_admissions(df):
    """Restrict a dataframe to a single hospital admission per patient

    The admission list is loaded with the module-level ``admission_query`` over
    ``conn``. For every ``subject_id`` only the first row returned by that query
    is retained, and ``df`` is filtered to the ``hadm_id`` values of those rows.

    Parameters:
    df (DataFrame): Dataframe with a ``hadm_id`` column, possibly covering several admissions per patient

    Returns:
    Dataframe: The rows of ``df`` whose ``hadm_id`` is the retained admission of a patient

    """
    admissions = pd.read_sql_query(admission_query, conn)
    # "first" is the first row in the order the query returned them.
    # NOTE(review): the query has no ORDER BY, so this is presumably not
    # guaranteed to be the chronologically earliest admission - confirm.
    is_repeat = admissions.duplicated(subset=['subject_id'], keep='first')
    retained_hadm_ids = admissions.loc[~is_repeat, 'hadm_id']
    return df[df['hadm_id'].isin(retained_hadm_ids)]



In [None]:
# Load every lab event selected by lab_query into memory.
lab_measurements_df = pd.read_sql_query(sql=lab_query, con=conn)

In [None]:
# The 100 most frequently prescribed drug names.
top100_drugs_df = pd.read_sql_query(sql=top_meds_query, con=conn)
# All prescriptions, with every start timestamp moved forward by 12 hours.
# NOTE(review): presumably startdate carries no time of day, so this places
# the prescription at midday - confirm against the schema.
patient_presc_df = pd.read_sql_query(sql=patient_presc_query, con=conn).assign(
    startdate=lambda presc: presc['startdate'] + datetime.timedelta(hours=12)
)

In [None]:
# Keep one admission per patient, then only prescriptions of the top 100 drugs.
patient_presc_df = (
    patient_presc_df
    .pipe(remove_multiple_admissions)
    .loc[lambda presc: presc['drug'].isin(top100_drugs_df['drug'])]
)

In [None]:
# To reduce size of the dataset:
#   1. keep lab rows whose (patient, test) combination occurs more than once,
#   2. keep lab rows belonging to an admission that is still in the prescriptions.
lab_measurements_df = (
    lab_measurements_df
    .loc[lambda labs: labs.duplicated(subset=['subject_id','label'],keep=False)]
    .loc[lambda labs: labs['hadm_id'].isin(patient_presc_df['hadm_id'])]
)

In [None]:
def labpairing(drugname,prescdf,labdf,labname):
    """Builds the before/after measurement table for one drug and one lab test

    For each patient prescribed the drug, the lab measurement closest before and
    the one closest after the prescription start are paired, considering only
    measurements strictly less than one day away on either side. Patients
    lacking either a before or an after measurement are left out.

    Parameters:
    drugname (String): Drug Name
    prescdf (DataFrame): Dataframe containing the prescription data
    labdf (DataFrame): Dataframe containing the lab measurement data
    labname (String): Lab Test Name
    Returns:
    DataFrame: One row per patient; columns of the before measurement carry
    the suffix ``_x`` and columns of the after measurement the suffix ``_y``

    """
    zero = datetime.timedelta(days=0)
    window_start = datetime.timedelta(days=-1)
    window_end = datetime.timedelta(days=1)

    # A single prescription row per patient for this drug (first row wins)
    on_drug = prescdf.loc[prescdf['drug']==drugname].drop_duplicates(subset=['subject_id'], keep='first')

    # Lab rows of those admissions, narrowed down to the requested test
    admission_labs = labdf.loc[labdf['hadm_id'].isin(on_drug['hadm_id'])]
    test_labs = admission_labs.loc[admission_labs['label']==labname]

    # Attach the prescription to every measurement and compute the signed offset
    paired = test_labs.merge(on_drug,on=['hadm_id','subject_id'])
    paired["timefromprescription"] = paired["charttime"] - paired["startdate"]
    offset = paired['timefromprescription']
    paired = paired.loc[(offset > window_start) & (offset < window_end)]

    after = paired.loc[paired['timefromprescription'] > zero]
    before = paired.loc[paired['timefromprescription'] < zero]

    # Only admissions that have measurements on both sides survive
    after = after.loc[after['hadm_id'].isin(before['hadm_id'])]
    before = before.loc[before['hadm_id'].isin(after['hadm_id'])]

    # Per patient, pick the measurement nearest to the prescription on each side
    nearest_after = after.loc[after.groupby('subject_id')['timefromprescription'].idxmin()]
    nearest_before = before.loc[before.groupby('subject_id')['timefromprescription'].idxmax()]

    return nearest_before.merge(nearest_after,on=['hadm_id','subject_id'])

In [None]:
def postprocessing(df):
    """Summarises the paired before/after lab values of one drug/lab combination

    Computes mean and standard deviation of the lab values and of the time
    offsets (converted to hours), plus Mann-Whitney U and t-test p values
    comparing the before values with the after values. The input dataframe is
    not modified.

    Parameters:
    df (DataFrame): Dataframe containing before (``valuenum_x``,
        ``timefromprescription_x``) and after (``valuenum_y``,
        ``timefromprescription_y``) lab test values and timedelta offsets
    Returns:
    List: [count, before mean, before std, before time mean, before time std,
        after mean, after std, after time mean, after time std,
        mann whitney p value, t-test p value]

    """
    seconds_per_hour = 3600
    # total_seconds() keeps the sign and the full magnitude of the offset.
    # The .dt.seconds accessor only returns the non-negative seconds component
    # of a timedelta, so an offset of -2h (stored as -1 day + 22h) used to be
    # reported as 22 hours.
    before_hours = df['timefromprescription_x'].dt.total_seconds() / seconds_per_hour
    after_hours = df['timefromprescription_y'].dt.total_seconds() / seconds_per_hour

    before_values = df['valuenum_x']
    after_values = df['valuenum_y']

    # The alternative is spelled out so the p value does not depend on the
    # default of the installed scipy version.
    mannwhitneypvalue = mannwhitneyu(before_values, after_values, alternative='two-sided')[1]
    # NOTE(review): both tests treat the two columns as independent samples
    # although each row is a before/after pair of one patient - confirm that
    # an unpaired test is intended.
    ttestpvalue = stats.ttest_ind(before_values, after_values)[1]

    csvrow = [
        len(df),
        before_values.mean(),
        before_values.std(),
        before_hours.mean(),
        before_hours.std(),
        after_values.mean(),
        after_values.std(),
        after_hours.mean(),
        after_hours.std(),
        mannwhitneypvalue,
        ttestpvalue,
    ]
    return csvrow


 

In [None]:
# Evaluate every (lab test, drug) combination and collect one summary row for
# each combination that has enough paired patients.

# Combinations with this many paired patients or fewer are skipped.
PAIRED_PATIENT_THRESHOLD = 25

RESULT_COLUMNS = ['Medication Name','Feature Name','Number of patients','Feature Before(mean)','Feature Before(std)','Time Before(mean)','Time Before(std)','Feature After(mean)','Feature After(std)','Time After(mean)','Time After(std)','Mannwhitney-pvalue','Ttest-pvalue']

# Rows are gathered in a plain list and turned into a dataframe once at the
# end; enlarging a dataframe one row at a time copies it on every insert.
result_rows = []
for lab_name in lab_measurements_df.label.unique():
    for drug_name in top100_drugs_df['drug']:
        # Progress output: the combination currently being processed
        print(drug_name,lab_name)
        drug_lab_df=labpairing(drug_name,patient_presc_df,lab_measurements_df,lab_name)
        if len(drug_lab_df) > PAIRED_PATIENT_THRESHOLD:
            result_rows.append([drug_name, lab_name] + postprocessing(drug_lab_df))

csvdf = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
Potassium Chloride Alanine Aminotransferase (ALT)
Insulin Alanine Aminotransferase (ALT)


In [None]:
# Persist the summary table (written to the current working directory).
csvdf.to_csv(path_or_buf='Drug_Value_Pairs_extended.csv')