In [None]:
import pandas as pd
import re 
import sqlite3
import numpy as np
import ast
import sklearn 
import math 

# Echo the value of every top-level expression in a cell, not only the last one.
# Cells below that end with e.g. `df.head()` followed by `len(df)` rely on this to show both.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Open the study database (path is relative to the notebook's working directory).
# A forward slash is used as the separator: the former backslash form only worked
# because "\l" is an *invalid* escape that Python currently passes through (with a
# warning), and it resolved to the intended file on Windows only.
conn = sqlite3.connect('lowva/lowva.db')
conn.text_factory = str  # decode TEXT columns with the built-in str type
cur = conn.cursor()

In [None]:
# Cohort table: pat_deid plus its lowvadate (parsed to datetime), read from `outcome`.
dfcohort = pd.read_sql_query(sql='''select pat_deid, lowvadate from outcome''', con=conn)
dfcohort = dfcohort.assign(lowvadate=pd.to_datetime(dfcohort["lowvadate"]))
dfcohort.head()

# Numeric Variables (From Eye Exam)

## Visual Acuity - done

In [None]:
# Load the per-exam acuity columns (bcvalogmarod / bcvalogmaros), lower-case the
# column names and parse the exam date.
dfexam = pd.read_sql_query(
    sql='''select pat_deid, exam_date, bcvalogmarod, bcvalogmaros from examva
''',
    con=conn,
).rename(columns=str.lower)
dfexam = dfexam.assign(exam_date=pd.to_datetime(dfexam["exam_date"]))
dfexam.head()
len(dfexam)

In [None]:
# Target shape: for every (pat_deid, lowvadate) pair, the time-ordered acuity
# readings recorded on or before lowvadate.
dfexam = (
    dfexam.merge(dfcohort, on="pat_deid")
    .loc[lambda merged: merged["exam_date"] <= merged["lowvadate"]]
    .sort_values(["pat_deid", "lowvadate", "exam_date"])
)

# z-score each eye over all retained exam rows (this happens before any per-patient summary)
dfexam["bcvalogmarod"] = dfexam["bcvalogmarod"].sub(dfexam["bcvalogmarod"].mean()).div(dfexam["bcvalogmarod"].std())
dfexam["bcvalogmaros"] = dfexam["bcvalogmaros"].sub(dfexam["bcvalogmaros"].mean()).div(dfexam["bcvalogmaros"].std())

dfexam.head(20)

In [None]:
featurevariable="bcvalogmaros" #default column for the helpers below; it is bound when this cell runs, so pass value=... to summarise any other column

def _preindex_values(pat_deid, lowvadate, value):
    """Return the non-null `value` entries of dfexam for one (pat_deid, lowvadate) pair.

    The row order of dfexam is preserved, so the last element comes from the
    last matching row.
    """
    rows = (dfexam["pat_deid"] == pat_deid) & (dfexam["lowvadate"] == lowvadate)
    return [x for x in dfexam.loc[rows, value].tolist() if pd.notnull(x)]

def gethi(pat_deid, lowvadate, value=featurevariable):
    """Largest non-null `value` for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    try:
        return max(values)
    except (TypeError, ValueError):  # empty list, or entries that cannot be ordered
        return np.nan

def getlo(pat_deid, lowvadate, value=featurevariable):
    """Smallest non-null `value` for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    try:
        return min(values)
    except (TypeError, ValueError):  # empty list, or entries that cannot be ordered
        return np.nan

def getmed(pat_deid, lowvadate, value=featurevariable):
    """Median (50th percentile) of the non-null `value` entries, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    if not values:
        return np.nan
    try:
        return np.percentile(np.array(values), 50)
    except (TypeError, ValueError):  # non-numeric entries
        return np.nan

def getrecent(pat_deid, lowvadate, value=featurevariable):
    """Last non-null `value` in dfexam row order for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    return values[-1] if values else np.nan

In [None]:
# Spot-check the four helpers for patient 3178 (each call uses the helpers' default column).
gethi(3178,pd.to_datetime("2016-11-11"))
getlo(3178,pd.to_datetime("2016-11-11"))
getmed(3178,pd.to_datetime("2016-11-11"))
getrecent(3178,pd.to_datetime("2016-11-11"))


# Earlier ad-hoc checks on other patients, kept for reference.
#getlo(1861,pd.to_datetime("2016-11-30"))
#getmed(1861,pd.to_datetime("2016-12-14"))
#getrecent(4659,pd.to_datetime("2016-12-07"))

In [None]:
# Per-(patient, lowvadate) summaries of the OD acuity column.
# .copy() gives an independent frame, so the column assignments below do not write into a slice of dfcohort.
dfvafeatures=dfcohort[["pat_deid", "lowvadate"]].copy()

# The column is passed explicitly. The helpers' default `value` is frozen at definition
# time ("bcvalogmaros" in the cell above), so relying on it would silently fill these
# OD columns with OS data on a Restart & Run All.
dfvafeatures["bcvalogmarodbest"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x, value="bcvalogmarod"), axis=1)
dfvafeatures["bcvalogmarodworst"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x, value="bcvalogmarod"), axis=1)
dfvafeatures["bcvalogmarodmed"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getmed(*x, value="bcvalogmarod"), axis=1)
dfvafeatures["bcvalogmarodlast"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x, value="bcvalogmarod"), axis=1)


In [None]:
# OS acuity summaries. The column is passed explicitly, so the result does not depend on
# which value featurevariable had when the helper cell was last executed.

dfvafeatures["bcvalogmarosbest"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x, value="bcvalogmaros"), axis=1)
dfvafeatures["bcvalogmarosworst"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x, value="bcvalogmaros"), axis=1)
dfvafeatures["bcvalogmarosmed"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getmed(*x, value="bcvalogmaros"), axis=1)
dfvafeatures["bcvalogmaroslast"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x, value="bcvalogmaros"), axis=1)


In [None]:
# Preview the first 20 rows of the acuity feature table.
dfvafeatures.head(20)

In [None]:
# Cross-check patient 3178: cohort row, retained exam rows, and the derived features side by side.
dfcohort[dfcohort["pat_deid"]==3178]
dfexam[dfexam["pat_deid"]==3178]
dfvafeatures[dfvafeatures["pat_deid"]==3178]

## IOPs - done

In [None]:
# Load every column of `examiop`, lower-case the column names and parse the exam date.
dfexam = pd.read_sql_query(sql='''select * from examiop''', con=conn).rename(columns=str.lower)
dfexam = dfexam.assign(exam_date=pd.to_datetime(dfexam["exam_date"]))
dfexam.head()
len(dfexam)

In [None]:
# Attach lowvadate, keep exams on or before it, order them in time, then drop
# rows whose tmethod is the literal string "null".
dfexam = (
    dfexam.merge(dfcohort, on="pat_deid")
    .loc[lambda merged: merged["exam_date"] <= merged["lowvadate"]]
    .sort_values(["pat_deid", "lowvadate", "exam_date"])
    .loc[lambda ordered: ordered["tmethod"] != "null"]
)
dfexam.head(20)

In [None]:
def getmaxt(stringlist):
    """Return the largest integer reading encoded in `stringlist`, or NaN.

    `stringlist` is the text of a Python literal, normally a list such as
    "['14', '16']". Each element is converted with int(); elements that cannot be
    converted are skipped. A bare scalar ("15") or a single quoted value ("'15'")
    is treated as a one-element list instead of raising or being iterated
    character by character. Text that is not a valid literal, or that yields no
    usable reading, gives NaN.
    """
    try:
        parsed = ast.literal_eval(stringlist)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return np.nan

    # Normalise scalars (and strings, which would otherwise iterate per character) to a list.
    if isinstance(parsed, (str, bytes)) or not hasattr(parsed, "__iter__"):
        parsed = [parsed]

    readings = []
    for item in parsed:
        try:
            readings.append(int(item))
        except (ValueError, TypeError, OverflowError):
            continue  # non-numeric entry: ignore it
    return max(readings) if readings else np.nan

# Reduce the list-valued tod / tos text columns to one number per exam via getmaxt.
dfexam = dfexam.assign(
    todmax=dfexam["tod"].apply(getmaxt),
    tosmax=dfexam["tos"].apply(getmaxt),
)
dfexam.head(20)

In [None]:
# z-score both per-exam maxima over all retained exam rows
dfexam["todmax"] = dfexam["todmax"].sub(dfexam["todmax"].mean()).div(dfexam["todmax"].std())
dfexam["tosmax"] = dfexam["tosmax"].sub(dfexam["tosmax"].mean()).div(dfexam["tosmax"].std())

In [None]:
featurevariable="tosmax" #default column for the helpers below; it is bound when this cell runs, so pass value=... to summarise any other column

def _preindex_values(pat_deid, lowvadate, value):
    """Return the non-null `value` entries of dfexam for one (pat_deid, lowvadate) pair.

    The row order of dfexam is preserved, so the last element comes from the
    last matching row.
    """
    rows = (dfexam["pat_deid"] == pat_deid) & (dfexam["lowvadate"] == lowvadate)
    return [x for x in dfexam.loc[rows, value].tolist() if pd.notnull(x)]

def gethi(pat_deid, lowvadate, value=featurevariable):
    """Largest non-null `value` for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    try:
        return max(values)
    except (TypeError, ValueError):  # empty list, or entries that cannot be ordered
        return np.nan

def getlo(pat_deid, lowvadate, value=featurevariable):
    """Smallest non-null `value` for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    try:
        return min(values)
    except (TypeError, ValueError):  # empty list, or entries that cannot be ordered
        return np.nan

def getmed(pat_deid, lowvadate, value=featurevariable):
    """Median (50th percentile) of the non-null `value` entries, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    if not values:
        return np.nan
    try:
        return np.percentile(np.array(values), 50)
    except (TypeError, ValueError):  # non-numeric entries
        return np.nan

def getrecent(pat_deid, lowvadate, value=featurevariable):
    """Last non-null `value` in dfexam row order for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    return values[-1] if values else np.nan

In [None]:
# Per-(patient, lowvadate) summaries of the OD pressure column (todmax).
# .copy() keeps the new columns off a slice of dfcohort; the column is named explicitly
# because the helpers' default `value` is frozen at "tosmax" when they are defined.
dftfeatures=dfcohort[["pat_deid", "lowvadate"]].copy()
dftfeatures["todlo"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x, value="todmax"), axis=1)
dftfeatures["todhi"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x, value="todmax"), axis=1)
dftfeatures["todmed"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getmed(*x, value="todmax"), axis=1)
dftfeatures["todlast"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x, value="todmax"), axis=1)
dftfeatures.head()

In [None]:
# OS pressure summaries (tosmax); the column is passed explicitly, so no manual reset of featurevariable is needed.
dftfeatures["toslo"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x, value="tosmax"), axis=1)
dftfeatures["toshi"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x, value="tosmax"), axis=1)
dftfeatures["tosmed"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getmed(*x, value="tosmax"), axis=1)
dftfeatures["toslast"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x, value="tosmax"), axis=1)

In [None]:
# Preview the first 20 rows of the IOP feature table.
dftfeatures.head(20)

## CCT - done

In [None]:
# Load `examcct`; its date column is called cctdate, so it is moved into a parsed
# exam_date column to match the other exam tables.
dfexam = pd.read_sql_query(sql='''select * from examcct''', con=conn).rename(columns=str.lower)
dfexam["exam_date"] = pd.to_datetime(dfexam.pop("cctdate"))
len(dfexam)

In [None]:
# Attach lowvadate, keep measurements on or before it, and order them in time.
dfexam = (
    dfexam.merge(dfcohort, on="pat_deid")
    .loc[lambda merged: merged["exam_date"] <= merged["lowvadate"]]
    .sort_values(["pat_deid", "lowvadate", "exam_date"])
)

# Blank out implausible readings: anything under 300 (e.g. 26) is treated as a typo,
# since cct is usually a few hundred microns.
dfexam["cctod"] = np.where(dfexam["cctod"] >= 300, dfexam["cctod"], np.nan)
dfexam["cctos"] = np.where(dfexam["cctos"] >= 300, dfexam["cctos"], np.nan)

# z-score what remains
dfexam["cctod"] = dfexam["cctod"].sub(dfexam["cctod"].mean()).div(dfexam["cctod"].std())
dfexam["cctos"] = dfexam["cctos"].sub(dfexam["cctos"].mean()).div(dfexam["cctos"].std())


dfexam.head(20)

In [None]:
featurevariable="cctos" #default column for the helpers below; it is bound when this cell runs, so pass value=... to summarise any other column

def _preindex_values(pat_deid, lowvadate, value):
    """Return the non-null `value` entries of dfexam for one (pat_deid, lowvadate) pair.

    The row order of dfexam is preserved, so the last element comes from the
    last matching row.
    """
    rows = (dfexam["pat_deid"] == pat_deid) & (dfexam["lowvadate"] == lowvadate)
    return [x for x in dfexam.loc[rows, value].tolist() if pd.notnull(x)]

def gethi(pat_deid, lowvadate, value=featurevariable):
    """Largest non-null `value` for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    try:
        return max(values)
    except (TypeError, ValueError):  # empty list, or entries that cannot be ordered
        return np.nan

def getlo(pat_deid, lowvadate, value=featurevariable):
    """Smallest non-null `value` for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    try:
        return min(values)
    except (TypeError, ValueError):  # empty list, or entries that cannot be ordered
        return np.nan

def getmed(pat_deid, lowvadate, value=featurevariable):
    """Median (50th percentile) of the non-null `value` entries, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    if not values:
        return np.nan
    try:
        return np.percentile(np.array(values), 50)
    except (TypeError, ValueError):  # non-numeric entries
        return np.nan

def getrecent(pat_deid, lowvadate, value=featurevariable):
    """Last non-null `value` in dfexam row order for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    return values[-1] if values else np.nan

In [None]:
# Last available OD thickness per (patient, lowvadate).
# .copy() keeps the new column off a slice of dfcohort; the column is named explicitly
# because getrecent's default `value` is frozen at "cctos" when it is defined.
dfcctfeatures=dfcohort[["pat_deid", "lowvadate"]].copy()
dfcctfeatures["cctodlast"]=dfcctfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x, value="cctod"), axis=1)

In [None]:
# Last available OS thickness; the column is passed explicitly, so the helpers need not be re-run.
dfcctfeatures["cctoslast"]=dfcctfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x, value="cctos"), axis=1)

In [None]:
# Inspect the retained CCT rows of one patient (id 63923).
dfexam[dfexam["pat_deid"]==63923]

In [None]:
# Preview the CCT feature table and its row count.
dfcctfeatures.head(20)
len(dfcctfeatures)
# Many values are missing because CCT simply was not measured for those patients.

## Refraction - done

In [None]:
# Load every column of `examrx`, lower-case the column names and parse the exam date.
dfexam = pd.read_sql_query(sql='''select * from examrx''', con=conn).rename(columns=str.lower)
dfexam = dfexam.assign(exam_date=pd.to_datetime(dfexam["exam_date"]))
dfexam.head()
len(dfexam)

In [None]:
# Refraction is treated a bit differently: per exam and per eye we keep the most myopic
# (i.e. smallest) spherical equivalent among the wrx / mrx / finalrx columns.
dfexam = dfexam.assign(
    spheqvod=dfexam[["wrxodspheqv", "mrxodspheqv", "finalrxodspheqv"]].min(axis=1),
    spheqvos=dfexam[["wrxosspheqv", "mrxosspheqv", "finalrxosspheqv"]].min(axis=1),
)

In [None]:
# Keep only the derived columns, attach lowvadate, keep exams on or before it, order in time.
dfexam = (
    dfexam[["pat_deid", "exam_date", "spheqvod", "spheqvos"]]
    .merge(dfcohort, on="pat_deid")
    .loc[lambda merged: merged["exam_date"] <= merged["lowvadate"]]
    .sort_values(["pat_deid", "lowvadate", "exam_date"])
)
# z-score each eye over all retained exam rows
dfexam["spheqvod"] = dfexam["spheqvod"].sub(dfexam["spheqvod"].mean()).div(dfexam["spheqvod"].std())
dfexam["spheqvos"] = dfexam["spheqvos"].sub(dfexam["spheqvos"].mean()).div(dfexam["spheqvos"].std())

In [None]:
featurevariable="spheqvos" #default column for getlo below; it is bound when this cell runs, so pass value=... for any other column
def getlo(pat_deid, lowvadate, value=featurevariable):
    """Smallest non-null `value` in dfexam for one (pat_deid, lowvadate) pair, or NaN.

    Nulls are removed before taking the minimum: min() over a list that contains
    NaN depends on where the NaN sits (a leading NaN wins every comparison), which
    previously made the result depend on row order.
    """
    rows = (dfexam["pat_deid"] == pat_deid) & (dfexam["lowvadate"] == lowvadate)
    values = [x for x in dfexam.loc[rows, value].tolist() if pd.notnull(x)]
    try:
        return min(values)
    except (TypeError, ValueError):  # empty list, or entries that cannot be ordered
        return np.nan

In [None]:
# Most myopic OD spherical equivalent per (patient, lowvadate).
# .copy() keeps the new column off a slice of dfcohort; the column is named explicitly
# because getlo's default `value` is frozen at "spheqvos" when it is defined.
dfrxfeatures=dfcohort[["pat_deid", "lowvadate"]].copy()

dfrxfeatures["rxodminus"]=dfrxfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x, value="spheqvod"), axis=1)

In [None]:
# Most myopic OS spherical equivalent; the column is passed explicitly, so no manual reset of featurevariable is needed.
dfrxfeatures["rxosminus"]=dfrxfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x, value="spheqvos"), axis=1)


In [None]:
# Preview the refraction feature table and its row count.
dfrxfeatures.head(20)
len(dfrxfeatures)

## CDR - done 

In [None]:
# Load the CDR columns, lower-case the names, parse the exam date, and discard
# exams where both eyes are missing.
dfexam = pd.read_sql_query(
    sql='''select pat_deid, exam_date, feodcdr, feoscdr from examcdr''', con=conn
).rename(columns=str.lower)
dfexam = dfexam.assign(exam_date=pd.to_datetime(dfexam["exam_date"]))
dfexam = dfexam.dropna(subset=["feodcdr", "feoscdr"], how="all")
dfexam.head()
len(dfexam)

In [None]:
# Attach lowvadate, keep exams on or before it, and order them in time.
dfexam = (
    dfexam[["pat_deid", "exam_date", "feodcdr", "feoscdr"]]
    .merge(dfcohort, on="pat_deid")
    .loc[lambda merged: merged["exam_date"] <= merged["lowvadate"]]
    .sort_values(["pat_deid", "lowvadate", "exam_date"])
)

# z-score each eye over all retained exam rows
dfexam["feodcdr"] = dfexam["feodcdr"].sub(dfexam["feodcdr"].mean()).div(dfexam["feodcdr"].std())
dfexam["feoscdr"] = dfexam["feoscdr"].sub(dfexam["feoscdr"].mean()).div(dfexam["feoscdr"].std())


dfexam.head(20)

In [None]:
# Number of distinct patients with at least one retained CDR exam.
dfexam["pat_deid"].nunique(dropna=False)

In [None]:
featurevariable="feoscdr" #default column for the helpers below; it is bound when this cell runs, so pass value=... to summarise any other column

def _preindex_values(pat_deid, lowvadate, value):
    """Return the non-null `value` entries of dfexam for one (pat_deid, lowvadate) pair, in row order."""
    rows = (dfexam["pat_deid"] == pat_deid) & (dfexam["lowvadate"] == lowvadate)
    return [x for x in dfexam.loc[rows, value].tolist() if pd.notnull(x)]

def gethi(pat_deid, lowvadate, value=featurevariable):
    """Largest non-null `value` for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    try:
        return max(values)
    except (TypeError, ValueError):  # empty list, or entries that cannot be ordered
        return np.nan

def getlo(pat_deid, lowvadate, value=featurevariable):
    """Smallest non-null `value` for the pair, or NaN when there is none."""
    values = _preindex_values(pat_deid, lowvadate, value)
    try:
        return min(values)
    except (TypeError, ValueError):  # empty list, or entries that cannot be ordered
        return np.nan

In [None]:
# Lowest ("best") and highest ("worst") OD CDR per (patient, lowvadate).
# .copy() keeps the new columns off a slice of dfcohort; the column is named explicitly
# because the helpers' default `value` is frozen at "feoscdr" when they are defined.
dfcdrfeatures=dfcohort[["pat_deid", "lowvadate"]].copy()
dfcdrfeatures["cdrodbest"]=dfcdrfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x, value="feodcdr"), axis=1)
dfcdrfeatures["cdrodworst"]=dfcdrfeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x, value="feodcdr"), axis=1)



In [None]:
# OS CDR summaries; the column is passed explicitly, so no manual reset of featurevariable is needed.
dfcdrfeatures["cdrosbest"]=dfcdrfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x, value="feoscdr"), axis=1)
dfcdrfeatures["cdrosworst"]=dfcdrfeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x, value="feoscdr"), axis=1)


In [None]:
# Preview the CDR feature table and its row count.
dfcdrfeatures.head(20)
len(dfcdrfeatures)
# This field has a lot of missing data because not all providers use it; many free-text this finding into their notes.

In [None]:
# Count the rows that have at least one of the four CDR features.
len(
    dfcdrfeatures.dropna(
        subset=["cdrodbest", "cdrosbest", "cdrodworst", "cdrosworst"],
        how="all",
    )
)

## Combine the structured exam features into one matrix

In [None]:
# Drop the redundant lowvadate column from every feature table before joining on pat_deid.
# drop(..., errors="ignore") instead of `del` keeps this cell re-runnable: a second
# execution no longer raises KeyError once the column is already gone.
# NOTE(review): the joins below key on pat_deid alone, which assumes one cohort row per
# patient — confirm that `outcome` never holds several lowvadate rows for one pat_deid.
dfvafeatures = dfvafeatures.drop(columns=["lowvadate"], errors="ignore")
dftfeatures = dftfeatures.drop(columns=["lowvadate"], errors="ignore")
dfcctfeatures = dfcctfeatures.drop(columns=["lowvadate"], errors="ignore")
dfrxfeatures = dfrxfeatures.drop(columns=["lowvadate"], errors="ignore")
dfcdrfeatures = dfcdrfeatures.drop(columns=["lowvadate"], errors="ignore")


# Outer-join the five feature tables onto the cohort ids, one after another.
dfexamstructured = (
    dfcohort[["pat_deid"]]
    .merge(dfvafeatures, on=["pat_deid"], how="outer")
    .merge(dftfeatures, on=["pat_deid"], how="outer")
    .merge(dfcctfeatures, on=["pat_deid"], how="outer")
    .merge(dfrxfeatures, on=["pat_deid"], how="outer")
    .merge(dfcdrfeatures, on=["pat_deid"], how="outer")
)

In [None]:
# Preview the combined structured-exam feature matrix.
dfexamstructured.head()

In [None]:
# Show the dtype of every column in the combined matrix.
dfexamstructured.dtypes

# Coded Variables 

## Medications - done

### Turn long dataframe to wide and filter out near zero variance features

In [None]:
# Load the long-format medication table, lower-case the column names and parse rx_date.
dfmeds = pd.read_sql_query(sql='''select * from medslong''', con=conn).rename(columns=str.lower)
dfmeds = dfmeds.assign(rx_date=pd.to_datetime(dfmeds["rx_date"]))
len(dfmeds)

In [None]:
# Attach lowvadate, order by patient and prescription date, keep prescriptions on or before lowvadate.
dfmeds = (
    dfmeds.merge(dfcohort[["pat_deid", "lowvadate"]], on="pat_deid")
    .sort_values(by=["pat_deid", "rx_date"], ascending=True)
    .loc[lambda merged: merged["rx_date"] <= merged["lowvadate"]]
)

In [None]:
# Long -> wide: one row per patient, one 0/1 column per medication_id
# (the constant pivotvalue of 1 marks presence, fill_value=0 marks absence).
dfmeds = dfmeds.assign(
    pivotvalue=1,
    medication_id=dfmeds["medication_id"].astype(int),
)
dfmedswide = dfmeds.pivot_table(
    values="pivotvalue", index=['pat_deid'], columns='medication_id', fill_value=0
)
dfmedswide.columns = ['med_' + str(med_id) for med_id in dfmedswide.columns.values]
dfmedswide.head()

In [None]:
# (patients, medication columns) before variance filtering.
dfmedswide.shape

In [None]:
#let's filter out near zero variance features for the medications
from sklearn.feature_selection import VarianceThreshold

# .99 * (1 - .99) is the variance of a 0/1 column that is 1 (or 0) in 99% of rows;
# columns at or below it are treated as near-constant.
selector=VarianceThreshold(.99 * (1 - .99))

# Every column of dfmedswide is a med_* indicator (pat_deid is the index), so the whole
# frame is filtered. The earlier label slice 'med_1':'med_590201' raised KeyError as soon
# as either medication id was missing from the data.
selector.fit_transform(np.array(dfmedswide)).shape

def variance_threshold_selector(data, threshold=0.5):
    """Return the columns of `data` that VarianceThreshold(threshold) keeps.

    Unlike `fit_transform`, which returns a bare array, the result is a DataFrame
    that keeps the original index and column labels.
    """
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

dfmedsfiltered=variance_threshold_selector(dfmedswide, .99 * (1 - .99))

In [None]:
# Turn the pat_deid index back into an ordinary column so it can serve as a merge key.
dfmedsfiltered = dfmedsfiltered.reset_index()

In [None]:
# Preview the variance-filtered medication indicators.
dfmedsfiltered.head()

## Diagnoses - done

In [None]:
# Load the long-format diagnosis table, lower-case the column names and parse dx_date.
dfdx = pd.read_sql_query(sql='''select * from dxlong''', con=conn).rename(columns=str.lower)
dfdx = dfdx.assign(dx_date=pd.to_datetime(dfdx["dx_date"]))
dfdx.head()
len(dfdx)

In [None]:
# Sometimes the ICD-9 code is missing and sometimes the ICD-10 code is, so build one
# combined column: the prefixed ICD-10 code when present, otherwise the prefixed ICD-9 code.
# (A missing code stringifies to "None", hence the "icd10_None" sentinel.)
dfdx = dfdx.assign(
    icd9_list="icd9_" + dfdx["icd9_list"].astype(str),
    icd10_list="icd10_" + dfdx["icd10_list"].astype(str),
)
dfdx["icd"] = dfdx["icd10_list"].where(dfdx["icd10_list"] != "icd10_None", dfdx["icd9_list"])

In [None]:
# Check the combined icd column.
dfdx.head()

In [None]:
# Long -> wide: one row per patient, one 0/1 column per combined ICD code.
# The pivot needs a constant "pivotvalue" column; nothing in this notebook creates it on
# dfdx (only on dfmeds), so add it here unless the dxlong table already supplied one.
if "pivotvalue" not in dfdx.columns:
    dfdx["pivotvalue"] = 1
# NOTE(review): unlike medications, diagnoses are not restricted to dx_date <= lowvadate
# anywhere in this section — confirm dxlong is already limited to pre-index diagnoses.
dfdxwide=dfdx.pivot_table(values="pivotvalue", index=['pat_deid'], columns='icd', fill_value=0)
dfdxwide.head()

In [None]:
# Near-zero-variance filter for the diagnosis indicators (same 99%/1% threshold as medications).
selector=VarianceThreshold(.99 * (1 - .99))

# Shape that survives the filter, shown as a quick check.
selector.fit_transform(np.array(dfdxwide)).shape

# variance_threshold_selector is defined once, in the medications section; the
# byte-identical copy that used to be redefined here has been dropped.
dfdxfiltered=variance_threshold_selector(dfdxwide, .99 * (1 - .99))

In [None]:
# Turn the pat_deid index back into an ordinary column so it can serve as a merge key.
dfdxfiltered = dfdxfiltered.reset_index()

## Surgeries

In [None]:
# Load the `cpt` table, left-join it onto the cohort ids so every cohort row is present,
# treat absent values as 0, and index by pat_deid.
dfcpt = pd.read_sql_query(sql='''select * from cpt''', con=conn).rename(columns=str.lower)
dfcpt = (
    pd.merge(dfcohort["pat_deid"], dfcpt, on="pat_deid", how="left")
    .fillna(0)
    .set_index("pat_deid")
)
dfcpt.head()
len(dfcpt)

In [None]:
# Near-zero-variance filter for the surgery (CPT) columns (same 99%/1% threshold as medications).
selector=VarianceThreshold(.99 * (1 - .99))

# Shape that survives the filter, shown as a quick check.
selector.fit_transform(np.array(dfcpt)).shape

# variance_threshold_selector is defined once, in the medications section; the
# byte-identical copy that used to be redefined here has been dropped.
dfcptfiltered=variance_threshold_selector(dfcpt, .99 * (1 - .99))

In [None]:
# Turn the pat_deid index back into an ordinary column so it can serve as a merge key.
dfcptfiltered = dfcptfiltered.reset_index()

## Demographics

# Hand-rolled one-hot encoding: the query selects gender / race / ethnicity several times
# under the indicator names, and each copy is then compared with one category label.
# NOTE(review): `age` is computed in SQL as lowvadate - birth_date; what that yields depends
# on how the two dates are stored in SQLite — verify before relying on it.
dfpt=pd.read_sql_query('''select d.pat_deid, o.lowvadate - d.birth_date as age, d.gender as gender_Female, d.race as race_asian, d.race as race_white, d.race as race_black, d.race as race_Pacific_Islander,d.race as race_Native_American, d.race as race_other, d.race as race_unknown, d.ethnicity as Ethnicity_Non_Hispanic,d.ethnicity as Ethnicity_Hispanic from demographics as d, outcome as o where d.pat_deid = o.pat_deid
''', conn)
dfpt.columns = map(str.lower, dfpt.columns)
dfpt['gender_female'] = (dfpt['gender_female'] == 'Female').astype(int)
dfpt['race_asian'] = (dfpt['race_asian'] == 'Asian').astype(int)
dfpt['race_white'] = (dfpt['race_white'] == 'White').astype(int)
dfpt['race_black'] = (dfpt['race_black'] == 'Black').astype(int)
dfpt['race_pacific_islander'] = (dfpt['race_pacific_islander'] == 'Pacific Islander').astype(int)
# The label is "Native American" with a space (the spelling the raceth recoding in this
# notebook matches on); the underscore form never matched, leaving this indicator all 0.
dfpt['race_native_american'] = (dfpt['race_native_american'] == 'Native American').astype(int)
dfpt['race_other'] = (dfpt['race_other'] == 'Other').astype(int)
dfpt['race_unknown'] = (dfpt['race_unknown'] == 'Unknown').astype(int)
dfpt['ethnicity_non_hispanic'] = (dfpt['ethnicity_non_hispanic'] == 'Non-Hispanic').astype(int)
dfpt['ethnicity_hispanic'] = (dfpt['ethnicity_hispanic'] == 'Hispanic/Latino').astype(int)
#dfpt['ethnicity_unkown'] = (dfpt['ethnicity_unknown'] == 'Unknown').astype(int)

dfpt.head()

# NOTE(review): the query that builds dfpt immediately before this point selects no
# birth_date column, so the next line looks like it raises KeyError — confirm whether
# this fragment is still meant to run.
dfpt["birth_date"]=pd.to_datetime(dfpt["birth_date"])
from datetime import timedelta, date
# Flag birth dates after 1 Jan 2010 and move them back by 100 * 365.25 days.
# NOTE(review): presumably this corrects two-digit years parsed into the wrong century — confirm.
future = dfpt['birth_date'] > date(year=2010,month=1,day=1) #specifies the cutoff year
dfpt.loc[future, 'birth_date'] -= timedelta(days=365.25*100)
dfpt.head()

# Summary statistics of the raw age column.
dfpt["age"].mean()
dfpt["age"].std()

# z-score age
dfpt["agestandard"]=(dfpt["age"]-dfpt["age"].mean())/dfpt["age"].std()
# Sanity check: the standardized column should have mean ~0 and std ~1.
dfpt["agestandard"].mean()
dfpt["agestandard"].std()

# Descriptive summary of the whole demographics table: a combined race/ethnicity
# label (Hispanic ethnicity takes precedence over race) and the gender split.
dfpt = pd.read_sql_query(sql='''select demographics.pat_deid, birth_date, gender, race, ethnicity 
from demographics''', con=conn)

dfpt["raceth"] = dfpt["race"].where(dfpt["ethnicity"] != "Hispanic/Latino", "Hispanic")
# Collapse the small categories.
dfpt.loc[dfpt["raceth"] == "Pacific Islander", "raceth"] = "Asian"
dfpt.loc[dfpt["raceth"].isin(["Native American", "Unknown"]), "raceth"] = "Other"

dfpt["raceth"].value_counts()
# NOTE(review): 5612 is a hard-coded denominator — presumably the patient count; verify it still matches the data.
dfpt["raceth"].value_counts()/5612
dfpt["gender"].value_counts()

In [None]:

from datetime import timedelta, date

# Demographics joined to the outcome table so that lowvadate is available for the age calculation.
dfpt = pd.read_sql_query(
    sql='''select demographics.pat_deid, birth_date, gender, race, ethnicity, lowvadate from demographics, outcome where outcome.pat_deid = demographics.pat_deid''',
    con=conn,
).rename(columns=str.lower)
dfpt["lowvadate"] = pd.to_datetime(dfpt["lowvadate"])
dfpt.head()
dfpt["birth_date"] = pd.to_datetime(dfpt["birth_date"])
def fix_date(x):
    """Return `x` as a datetime.date, moving years >= 2000 back by one century.

    NOTE(review): presumably this corrects two-digit birth years that were parsed
    into the 2000s — confirm against the source data.

    Missing values (NaT / None / NaN) are passed through as NaT instead of raising.
    29 February that lands on a non-leap year after the shift (2000 -> 1900) is
    clamped to 28 February.
    """
    if pd.isnull(x):
        return pd.NaT
    year = x.year - 100 if x.year >= 2000 else x.year
    try:
        return date(year, x.month, x.day)
    except ValueError:
        # Only reachable for 29 Feb shifted onto a non-leap year.
        return date(year, x.month, x.day - 1)

# Apply the century correction, then return to a datetime column.
dfpt["birth_date"] = pd.to_datetime(dfpt["birth_date"].apply(fix_date))
dfpt.head()
# Combined race/ethnicity label: Hispanic ethnicity takes precedence over race.
dfpt["raceth"] = dfpt["race"].where(dfpt["ethnicity"] != "Hispanic/Latino", "Hispanic")
dfpt["raceth"].value_counts()
# Collapse the small categories.
dfpt.loc[dfpt["raceth"] == "Pacific Islander", "raceth"] = "Asian"
dfpt.loc[dfpt["raceth"].isin(["Native American", "Unknown"]), "raceth"] = "Other"
# Age is the difference in calendar years (month and day are not taken into account).
dfpt["age"] = dfpt["lowvadate"].dt.year.sub(dfpt["birth_date"].dt.year)
dfpt["raceth"].value_counts()
dfpt.head()
dfpt.dtypes


In [None]:
# z-score age
dfpt["agestandard"] = dfpt["age"].sub(dfpt["age"].mean()).div(dfpt["age"].std())
# sanity check: mean should be ~0 and std ~1
dfpt["agestandard"].mean()
dfpt["agestandard"].std()

# race and ethnicity are superseded by raceth; the remaining text columns are
# one-hot encoded by get_dummies.
dfpt = pd.get_dummies(dfpt.drop(columns=["race", "ethnicity"]))

dfpt.head()

# drop the raw variables that now have a derived counterpart
dfpt = dfpt.drop(columns=["birth_date", "age", "lowvadate"])

# Final merge of standardized structured exam features with demographics, medications, diagnoses, and surgeries (NZV-filtered)

In [None]:
# Label table: one outcome value per pat_deid row of `outcome`.
dfoutcome = pd.read_sql_query(sql='''select pat_deid, outcome from outcome''', con=conn)

In [None]:
# Outer-join everything on pat_deid, in two stages:
#  1. outcome, demographics and the 0/1 indicator tables (medications, diagnoses, surgeries),
#     filling gaps with 0 after each indicator join, since a missing indicator means "absent";
#  2. the numeric exam features, whose gaps are deliberately left as NaN here so that
#     missing-value indicators can be derived below before they are filled.
dfstructured = (
    dfoutcome.merge(dfpt, on="pat_deid", how="outer")
    .merge(dfmedsfiltered, on="pat_deid", how="outer").fillna(0)
    .merge(dfdxfiltered, on="pat_deid", how="outer").fillna(0)
    .merge(dfcptfiltered, on="pat_deid", how="outer").fillna(0)
    .merge(dfvafeatures, on="pat_deid", how="outer")
    .merge(dftfeatures, on="pat_deid", how="outer")
    .merge(dfcctfeatures, on="pat_deid", how="outer")
    .merge(dfrxfeatures, on="pat_deid", how="outer")
    .merge(dfcdrfeatures, on="pat_deid", how="outer")
)

In [None]:
import math
def missingindicator(x):
    """Return 1 when the number `x` is NaN and 0 otherwise."""
    return 1 if math.isnan(x) else 0

In [None]:
# Numeric exam features that receive a "<name>missing" indicator column.
# Built per measurement family; within each family the od columns come before the os columns.
missingcols = (
    ['bcvalogmar' + eye + stat for eye in ('od', 'os') for stat in ('best', 'worst', 'med', 'last')]
    + ['t' + eye + stat for eye in ('od', 'os') for stat in ('lo', 'hi', 'med', 'last')]
    + ['cct' + eye + 'last' for eye in ('od', 'os')]
    + ['rx' + eye + 'minus' for eye in ('od', 'os')]
    + ['cdr' + eye + stat for eye in ('od', 'os') for stat in ('best', 'worst')]
)

In [None]:
# Add one 0/1 "<name>missing" column per numeric exam feature, while the NaNs are still in place.
for col in missingcols:
    dfstructured[col + 'missing'] = dfstructured[col].map(missingindicator)
dfstructured.columns

In [None]:
# Fill the remaining missing values with 0. Because the exam measurements were z-scored
# at exam-row level, 0 is the mean of the underlying measurements.
# NOTE(review): the z-scoring happened before the per-patient min / max / median / last
# summaries were taken, so 0 is not necessarily the mean of each summary column — this is
# only approximately mean imputation.
dfstructured=dfstructured.fillna(0)

In [None]:
# Sanity check after the fill: every column should report False (no missing values left).
dfstructured.isna().any()

In [None]:
# Write the final feature matrix to CSV in the working directory, without the row index.
dfstructured.to_csv('lowva-structured-02.csv', index=False)

In [None]:
# Release the SQLite connection; no database reads happen after this point.
conn.close()

In [None]:
# Full list of the exported column names, for reference.
list(dfstructured.columns)