In [1]:
import pandas as pd
import re 
import sqlite3
import numpy as np
import ast
import sklearn 
import math 

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
conn = sqlite3.connect('lowva\lowva.db')
conn.text_factory = str
cur = conn.cursor()

In [3]:
dfcohort=pd.read_sql_query('''select pat_deid, lowvadate from outcome''', conn)
dfcohort["lowvadate"]=pd.to_datetime(dfcohort["lowvadate"])
dfcohort.head()

Unnamed: 0,pat_deid,lowvadate
0,1174,2009-09-17
1,1790,2012-02-14
2,2262,2012-05-15
3,2610,2009-11-19
4,2736,2016-07-11


# Numeric Variables (From Eye Exam)

## Visual Acuity - done

In [4]:
dfexam=pd.read_sql_query('''select pat_deid, exam_date, bcvalogmarod, bcvalogmaros from examva
''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["exam_date"])
dfexam.head()
len(dfexam)

Unnamed: 0,pat_deid,exam_date,bcvalogmarod,bcvalogmaros
0,1174,2009-09-17,0.39794,0.69897
1,1790,2012-02-14,0.39794,0.39794
2,2262,2012-05-10,0.30103,0.176091
3,2262,2012-05-15,0.477121,0.39794
4,2610,2009-11-19,0.69897,0.544068


26070

In [5]:
#what we want is a dataframe where for every pat_deid, lowvadate
#we have a long list of preindex bcvalogmars for the appropriate eye 
dfexam=pd.merge(dfexam, dfcohort, on="pat_deid")
dfexam=dfexam[dfexam["exam_date"]<=dfexam["lowvadate"]]

dfexam.sort_values(["pat_deid","lowvadate", "exam_date"], inplace=True)
#normalize first 
dfexam["bcvalogmarod"]=(dfexam["bcvalogmarod"]-dfexam["bcvalogmarod"].mean())/dfexam["bcvalogmarod"].std()
dfexam["bcvalogmaros"]=(dfexam["bcvalogmaros"]-dfexam["bcvalogmaros"].mean())/dfexam["bcvalogmaros"].std()

dfexam.head(20)

Unnamed: 0,pat_deid,exam_date,bcvalogmarod,bcvalogmaros,lowvadate
0,1174,2009-09-17,-0.242049,0.137679,2009-09-17
1,1790,2012-02-14,-0.242049,-0.258972,2012-02-14
2,2262,2012-05-10,-0.369599,-0.55129,2012-05-15
3,2262,2012-05-15,-0.137834,-0.258972,2012-05-15
4,2610,2009-11-19,0.154154,-0.066427,2009-11-19
5,2736,2016-07-11,,-0.258972,2016-07-11
6,2736,2016-07-11,3.032712,-0.258972,2016-07-11
7,2736,2016-07-11,3.032712,-0.258972,2016-07-11
8,2920,2010-10-25,2.658927,2.645277,2010-10-25
9,3178,2012-10-24,-0.369599,-0.55129,2016-11-11


In [14]:
featurevariable="bcvalogmaros" #change this depending on which variable we are searching over
def gethi(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: hi=max(valuelistnonan)
    except: hi = np.nan
    return hi 

def getlo(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: lo=min(valuelistnonan) 
    except: lo=np.nan 
    return lo 

def getmed(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: med = np.percentile(np.array(valuelistnonan), 50)
    except: med=np.nan
    return med 

def getrecent(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: recent=valuelistnonan[-1]     
    except: recent=np.nan 
    return recent 

In [11]:
gethi(3178,pd.to_datetime("2016-11-11"))
getlo(3178,pd.to_datetime("2016-11-11"))
getmed(3178,pd.to_datetime("2016-11-11"))
getrecent(3178,pd.to_datetime("2016-11-11"))


#getlo(1861,pd.to_datetime("2016-11-30"))
#getmed(1861,pd.to_datetime("2016-12-14"))
#getrecent(4659,pd.to_datetime("2016-12-07"))

-0.2589718183122771

-0.5512897811984893

-0.46897731147740096

-0.2589718183122771

In [13]:
dfvafeatures=dfcohort[["pat_deid", "lowvadate"]]

dfvafeatures["bcvalogmarodbest"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x), axis=1)
dfvafeatures["bcvalogmarodworst"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x), axis=1)
dfvafeatures["bcvalogmarodmed"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getmed(*x), axis=1)
dfvafeatures["bcvalogmarodlast"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x), axis=1)


In [15]:
#change featurevariable = bcvalogmaros and rerun the function definitions 

dfvafeatures["bcvalogmarosbest"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x), axis=1)
dfvafeatures["bcvalogmarosworst"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x), axis=1)
dfvafeatures["bcvalogmarosmed"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getmed(*x), axis=1)
dfvafeatures["bcvalogmaroslast"]=dfvafeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x), axis=1)


In [16]:
dfvafeatures.head(20)

Unnamed: 0,pat_deid,lowvadate,bcvalogmarodbest,bcvalogmarodworst,bcvalogmarodmed,bcvalogmarodlast,bcvalogmarosbest,bcvalogmarosworst,bcvalogmarosmed,bcvalogmaroslast
0,1174,2009-09-17,-0.242049,-0.242049,-0.242049,-0.242049,0.137679,0.137679,0.137679,0.137679
1,1790,2012-02-14,-0.242049,-0.242049,-0.242049,-0.242049,-0.258972,-0.258972,-0.258972,-0.258972
2,2262,2012-05-15,-0.369599,-0.137834,-0.253716,-0.137834,-0.55129,-0.258972,-0.405131,-0.258972
3,2610,2009-11-19,0.154154,0.154154,0.154154,0.154154,-0.066427,-0.066427,-0.066427,-0.066427
4,2736,2016-07-11,3.032712,3.032712,3.032712,3.032712,-0.258972,-0.258972,-0.258972,-0.258972
5,2920,2010-10-25,2.658927,2.658927,2.658927,2.658927,2.645277,2.645277,2.645277,2.645277
6,3178,2016-11-11,-0.534038,-0.137834,-0.369599,-0.242049,-0.55129,-0.258972,-0.468977,-0.258972
7,4062,2016-07-13,-0.765802,-0.137834,-0.638253,-0.137834,-0.783316,0.662023,-0.55129,-0.066427
8,4690,2017-08-07,0.782123,0.782123,0.782123,0.782123,0.137679,0.137679,0.137679,0.137679
9,5361,2013-01-11,0.026605,0.026605,0.026605,0.026605,0.009986,0.009986,0.009986,0.009986


In [17]:
dfcohort[dfcohort["pat_deid"]==3178]
dfexam[dfexam["pat_deid"]==3178]
dfvafeatures[dfvafeatures["pat_deid"]==3178]

Unnamed: 0,pat_deid,lowvadate
6,3178,2016-11-11


Unnamed: 0,pat_deid,exam_date,bcvalogmarod,bcvalogmaros,lowvadate
9,3178,2012-10-24,-0.369599,-0.55129,2016-11-11
10,3178,2012-11-30,-0.369599,-0.386665,2016-11-11
11,3178,2013-05-20,-0.534038,-0.386665,2016-11-11
12,3178,2013-05-20,-0.534038,-0.55129,2016-11-11
13,3178,2014-04-16,-0.534038,-0.55129,2016-11-11
14,3178,2016-05-16,-0.137834,-0.55129,2016-11-11
15,3178,2016-05-16,-0.242049,-0.386665,2016-11-11
16,3178,2016-11-11,-0.242049,-0.258972,2016-11-11


Unnamed: 0,pat_deid,lowvadate,bcvalogmarodbest,bcvalogmarodworst,bcvalogmarodmed,bcvalogmarodlast,bcvalogmarosbest,bcvalogmarosworst,bcvalogmarosmed,bcvalogmaroslast
6,3178,2016-11-11,-0.534038,-0.137834,-0.369599,-0.242049,-0.55129,-0.258972,-0.468977,-0.258972


## IOPs - done

In [18]:
dfexam=pd.read_sql_query('''select * from examiop''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["exam_date"])
dfexam.head()
len(dfexam)

Unnamed: 0,pat_deid,tod,tos,tmethod,exam_date
0,1174,"[""14""]","[""15""]","[""Tonopen""]",2009-09-17
1,1174,,,,2009-09-17
2,1174,,,,2009-09-17
3,1174,,,,2009-09-17
4,1174,,,,2009-09-17


54880

In [19]:
dfexam=pd.merge(dfexam, dfcohort, on="pat_deid")
dfexam=dfexam[dfexam["exam_date"]<=dfexam["lowvadate"]]

dfexam.sort_values(["pat_deid","lowvadate", "exam_date"], inplace=True   )
dfexam=dfexam[dfexam["tmethod"]!="null"]
dfexam.head(20)

Unnamed: 0,pat_deid,tod,tos,tmethod,exam_date,lowvadate
0,1174,"[""14""]","[""15""]","[""Tonopen""]",2009-09-17,2009-09-17
5,1790,"[""20""]","[""20""]","[""Tonopen""]",2012-02-14,2012-02-14
6,2262,"[""13""]","[""13""]","[""Applanation""]",2012-05-10,2012-05-15
9,2610,"[""17""]",,"[""Tonopen""]",2009-11-19,2009-11-19
13,2736,"[""15""]","[""20""]","[""Tonopen""]",2016-07-11,2016-07-11
17,2920,"[""16""]","[""16""]","[""Applanation""]",2010-10-25,2010-10-25
18,3178,"[""11""]","[""10""]","[""Tonopen""]",2012-10-24,2016-11-11
26,3178,,,"[""Tonopen""]",2013-05-20,2016-11-11
28,3178,"[""14""]","[""13""]","[""Tonopen""]",2014-04-16,2016-11-11
30,3178,"[""17""]","[""17""]","[""Tonopen""]",2016-05-16,2016-11-11


In [20]:
def getmaxt(stringlist): 
    try: 
        tlist=ast.literal_eval(stringlist)
    except: 
        return np.nan 
    numlist=[] 
    for item in tlist: 
        try: 
            itemint=int(item)
            numlist.append(itemint)
        except: continue  
    try: 
        maxt=max(numlist)
    except:
        maxt=np.nan
    return maxt 

dfexam["todmax"]=dfexam["tod"].apply(getmaxt)
dfexam["tosmax"]=dfexam["tos"].apply(getmaxt)
dfexam.head(20)

Unnamed: 0,pat_deid,tod,tos,tmethod,exam_date,lowvadate,todmax,tosmax
0,1174,"[""14""]","[""15""]","[""Tonopen""]",2009-09-17,2009-09-17,14.0,15.0
5,1790,"[""20""]","[""20""]","[""Tonopen""]",2012-02-14,2012-02-14,20.0,20.0
6,2262,"[""13""]","[""13""]","[""Applanation""]",2012-05-10,2012-05-15,13.0,13.0
9,2610,"[""17""]",,"[""Tonopen""]",2009-11-19,2009-11-19,17.0,
13,2736,"[""15""]","[""20""]","[""Tonopen""]",2016-07-11,2016-07-11,15.0,20.0
17,2920,"[""16""]","[""16""]","[""Applanation""]",2010-10-25,2010-10-25,16.0,16.0
18,3178,"[""11""]","[""10""]","[""Tonopen""]",2012-10-24,2016-11-11,11.0,10.0
26,3178,,,"[""Tonopen""]",2013-05-20,2016-11-11,,
28,3178,"[""14""]","[""13""]","[""Tonopen""]",2014-04-16,2016-11-11,14.0,13.0
30,3178,"[""17""]","[""17""]","[""Tonopen""]",2016-05-16,2016-11-11,17.0,17.0


In [21]:
#normalize 
dfexam["todmax"]=(dfexam["todmax"]-dfexam["todmax"].mean())/dfexam["todmax"].std()
dfexam["tosmax"]=(dfexam["tosmax"]-dfexam["tosmax"].mean())/dfexam["tosmax"].std()

In [24]:
featurevariable="tosmax" #change this depending on which variable we are searching over
def gethi(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: hi=max(valuelistnonan)
    except: hi = np.nan
    return hi 

def getlo(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: lo=min(valuelistnonan) 
    except: lo=np.nan 
    return lo 

def getmed(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: med = np.percentile(np.array(valuelistnonan), 50)
    except: med=np.nan
    return med 

def getrecent(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: recent=valuelistnonan[-1]     
    except: recent=np.nan 
    return recent 

In [23]:
dftfeatures=dfcohort[["pat_deid", "lowvadate"]]
dftfeatures["todlo"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x), axis=1)
dftfeatures["todhi"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x), axis=1)
dftfeatures["todmed"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getmed(*x), axis=1)
dftfeatures["todlast"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x), axis=1)
dftfeatures.head()

Unnamed: 0,pat_deid,lowvadate,todlo,todhi,todmed,todlast
0,1174,2009-09-17,-0.26519,-0.26519,-0.26519,-0.26519
1,1790,2012-02-14,0.721405,0.721405,0.721405,0.721405
2,2262,2012-05-15,-0.429622,-0.429622,-0.429622,-0.429622
3,2610,2009-11-19,0.228107,0.228107,0.228107,0.228107
4,2736,2016-07-11,-0.100757,-0.100757,-0.100757,-0.100757


In [25]:
#reset featurevariable 
dftfeatures["toslo"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x), axis=1)
dftfeatures["toshi"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x), axis=1)
dftfeatures["tosmed"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getmed(*x), axis=1)
dftfeatures["toslast"]=dftfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x), axis=1)

In [26]:
dftfeatures.head(20)

Unnamed: 0,pat_deid,lowvadate,todlo,todhi,todmed,todlast,toslo,toshi,tosmed,toslast
0,1174,2009-09-17,-0.26519,-0.26519,-0.26519,-0.26519,-0.079814,-0.079814,-0.079814,-0.079814
1,1790,2012-02-14,0.721405,0.721405,0.721405,0.721405,0.552099,0.552099,0.552099,0.552099
2,2262,2012-05-15,-0.429622,-0.429622,-0.429622,-0.429622,-0.332579,-0.332579,-0.332579,-0.332579
3,2610,2009-11-19,0.228107,0.228107,0.228107,0.228107,,,,
4,2736,2016-07-11,-0.100757,-0.100757,-0.100757,-0.100757,0.552099,0.552099,0.552099,0.552099
5,2920,2010-10-25,0.063675,0.063675,0.063675,0.063675,0.046569,0.046569,0.046569,0.046569
6,3178,2016-11-11,-0.758487,0.39254,-0.018541,0.39254,-0.711727,0.299334,-0.079814,0.299334
7,4062,2016-07-13,-0.594054,0.556972,-0.26519,-0.26519,-0.711727,0.046569,-0.206197,-0.711727
8,4690,2017-08-07,-0.922919,-0.922919,-0.922919,-0.922919,-1.217258,-1.217258,-1.217258,-1.217258
9,5361,2013-01-11,,,,,,,,


## CCT - done

In [27]:
dfexam=pd.read_sql_query('''select * from examcct''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["cctdate"])
del dfexam["cctdate"]
#dfexam.head()
len(dfexam)

1177

In [28]:
dfexam=pd.merge(dfexam, dfcohort, on="pat_deid")
dfexam=dfexam[dfexam["exam_date"]<=dfexam["lowvadate"]]

dfexam.sort_values(["pat_deid","lowvadate", "exam_date"], inplace=True)

#remove outliers - e.g., 26 is clearly a typo as cct is usually a few hundred microns 
dfexam["cctod"]=np.where(dfexam["cctod"]<300, np.nan, dfexam["cctod"])
dfexam["cctos"]=np.where(dfexam["cctos"]<300, np.nan, dfexam["cctos"])

#normalize 
dfexam["cctod"]=(dfexam["cctod"]-dfexam["cctod"].mean())/dfexam["cctod"].std()
dfexam["cctos"]=(dfexam["cctos"]-dfexam["cctos"].mean())/dfexam["cctos"].std()


dfexam.head(20)

Unnamed: 0,pat_deid,cctod,cctos,exam_date,lowvadate
0,4062,-0.484409,-0.841648,2016-07-12,2016-07-13
1,4690,0.084234,0.127421,2017-08-07,2017-08-07
2,11196,2.039814,0.441714,2015-05-29,2015-05-29
3,18210,0.680617,0.153612,2015-08-28,2015-12-15
4,48249,1.26313,0.035752,2015-11-19,2016-01-20
5,56175,-0.317977,-0.055916,2016-05-13,2017-01-13
6,59935,-1.316571,0.009561,2011-03-31,2011-06-02
7,63923,1.498909,1.188158,2016-12-02,2017-05-05
8,63923,1.498909,1.895317,2017-02-02,2017-05-05
9,63923,1.457301,1.489355,2017-02-14,2017-05-05


In [32]:
featurevariable="cctos" #change this depending on which variable we are searching over
def gethi(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: hi=max(valuelistnonan)
    except: hi = np.nan
    return hi 

def getlo(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: lo=min(valuelistnonan) 
    except: lo=np.nan 
    return lo 

def getmed(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: med = np.percentile(np.array(valuelistnonan), 50)
    except: med=np.nan
    return med 

def getrecent(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: recent=valuelistnonan[-1]     
    except: recent=np.nan 
    return recent 

In [31]:
dfcctfeatures=dfcohort[["pat_deid", "lowvadate"]]
dfcctfeatures["cctodlast"]=dfcctfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x), axis=1)

In [33]:
#reset feature variable and rerun the functions 
dfcctfeatures["cctoslast"]=dfcctfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getrecent(*x), axis=1)

In [34]:
dfexam[dfexam["pat_deid"]==63923]

Unnamed: 0,pat_deid,cctod,cctos,exam_date,lowvadate
7,63923,1.498909,1.188158,2016-12-02,2017-05-05
8,63923,1.498909,1.895317,2017-02-02,2017-05-05
9,63923,1.457301,1.489355,2017-02-14,2017-05-05
10,63923,1.346346,1.842935,2017-02-28,2017-05-05
11,63923,1.26313,,2017-03-21,2017-05-05
12,63923,1.290868,4.802523,2017-03-28,2017-05-05
13,63923,1.318607,6.47875,2017-03-31,2017-05-05
14,63923,0.125842,1.567929,2017-04-18,2017-05-05
15,63923,1.207652,,2017-04-21,2017-05-05


In [35]:
dfcctfeatures.head(20)
len(dfcctfeatures)
#many are missing because this just wasnt measured 

Unnamed: 0,pat_deid,lowvadate,cctodlast,cctoslast
0,1174,2009-09-17,,
1,1790,2012-02-14,,
2,2262,2012-05-15,,
3,2610,2009-11-19,,
4,2736,2016-07-11,,
5,2920,2010-10-25,,
6,3178,2016-11-11,,
7,4062,2016-07-13,-0.484409,-0.841648
8,4690,2017-08-07,0.084234,0.127421
9,5361,2013-01-11,,


5612

## Refraction - done

In [36]:
dfexam=pd.read_sql_query('''select * from examrx''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["exam_date"])
dfexam.head()
len(dfexam)

Unnamed: 0,pat_deid,exam_date,wrxodspheqv,wrxosspheqv,mrxodspheqv,mrxosspheqv,finalrxodspheqv,finalrxosspheqv
0,1174,2009-09-17,,,0.5,,,
1,1174,2009-09-17,,,-6.0025,,,
2,1174,2009-09-17,,,,-3.5,,
3,1790,2012-02-14,,,-0.125,0.125,,
4,2262,2012-05-10,1.5,1.125,1.375,2.0,0.25,0.125


11395

In [37]:
#this needs to be treated a bit diferently. We want the most myopic spherical equivalent per eye 
dfexam["spheqvod"]=dfexam[["wrxodspheqv", "mrxodspheqv", "finalrxodspheqv"]].min(axis=1)
dfexam["spheqvos"]=dfexam[["wrxosspheqv", "mrxosspheqv", "finalrxosspheqv"]].min(axis=1)
#dfexam.head()

In [38]:
dfexam=pd.merge(dfexam[["pat_deid", "exam_date", "spheqvod", "spheqvos"]], dfcohort, on="pat_deid")
dfexam=dfexam[dfexam["exam_date"]<=dfexam["lowvadate"]]

dfexam.sort_values(["pat_deid","lowvadate", "exam_date"], inplace=True)
dfexam["spheqvod"]=(dfexam["spheqvod"]-dfexam["spheqvod"].mean())/dfexam["spheqvod"].std()
dfexam["spheqvos"]=(dfexam["spheqvos"]-dfexam["spheqvos"].mean())/dfexam["spheqvos"].std()

#dfexam.head(20)

In [41]:
featurevariable="spheqvos"
def getlo(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    try: lo=min(valuelist) 
    except: lo=np.nan 
    return lo 

In [40]:
dfrxfeatures=dfcohort[["pat_deid", "lowvadate"]]

dfrxfeatures["rxodminus"]=dfrxfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x), axis=1)

In [42]:
#reset featurevariable 
dfrxfeatures["rxosminus"]=dfrxfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x), axis=1)


In [43]:
dfrxfeatures.head(20)
len(dfrxfeatures)

Unnamed: 0,pat_deid,lowvadate,rxodminus,rxosminus
0,1174,2009-09-17,-0.623728,
1,1790,2012-02-14,0.194392,0.245315
2,2262,2012-05-15,0.24659,0.245315
3,2610,2009-11-19,,
4,2736,2016-07-11,,
5,2920,2010-10-25,,
6,3178,2016-11-11,,-0.055286
7,4062,2016-07-13,-0.066599,0.033126
8,4690,2017-08-07,,
9,5361,2013-01-11,-0.379788,-0.214427


5612

## CDR - done 

In [44]:
dfexam=pd.read_sql_query('''select pat_deid, exam_date, feodcdr, feoscdr from examcdr''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["exam_date"])
dfexam=dfexam[~((dfexam["feodcdr"].isnull()) & (dfexam["feoscdr"].isnull()))]
dfexam.head()
len(dfexam)

Unnamed: 0,pat_deid,exam_date,feodcdr,feoscdr
0,2262,2012-05-10,0.3,0.3
1,3178,2016-11-11,0.4,
2,4062,2010-05-19,,0.2
3,4062,2011-05-05,,0.2
4,10342,2010-06-29,0.2,0.2


3196

In [45]:
dfexam=pd.merge(dfexam[["pat_deid", "exam_date", "feodcdr", "feoscdr"]], dfcohort, on="pat_deid")
dfexam=dfexam[dfexam["exam_date"]<=dfexam["lowvadate"]]

dfexam.sort_values(["pat_deid","lowvadate", "exam_date"], inplace=True)

dfexam["feodcdr"]=(dfexam["feodcdr"]-dfexam["feodcdr"].mean())/dfexam["feodcdr"].std()
dfexam["feoscdr"]=(dfexam["feoscdr"]-dfexam["feoscdr"].mean())/dfexam["feoscdr"].std()


dfexam.head(20)

Unnamed: 0,pat_deid,exam_date,feodcdr,feoscdr,lowvadate
0,2262,2012-05-10,-0.497893,-0.493139,2012-05-15
1,3178,2016-11-11,-0.034513,,2016-11-11
2,4062,2010-05-19,,-0.957073,2016-07-13
3,4062,2011-05-05,,-0.957073,2016-07-13
4,10342,2010-06-29,-0.961273,-0.957073,2010-06-29
5,23699,2010-10-15,-0.497893,-0.029204,2016-02-19
6,23699,2012-02-28,-0.497893,-0.029204,2016-02-19
7,23699,2013-02-11,-0.497893,-0.029204,2016-02-19
8,23699,2013-09-24,-0.497893,-0.029204,2016-02-19
9,23699,2014-11-18,-0.497893,-0.029204,2016-02-19


In [46]:
len(dfexam["pat_deid"].unique())

1504

In [50]:
featurevariable="feoscdr" #change this depending on which variable we are searching over
def gethi(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: hi=max(valuelistnonan)
    except: hi = np.nan
    return hi 

def getlo(pat_deid, lowvadate, value=featurevariable): 
    valuelist=dfexam[(dfexam["pat_deid"]==pat_deid) & (dfexam["lowvadate"]==lowvadate)][value].tolist() 
    valuelistnonan=[x for x in valuelist if pd.notnull(x)]
    try: lo=min(valuelistnonan) 
    except: lo=np.nan 
    return lo 

In [49]:
dfcdrfeatures=dfcohort[["pat_deid", "lowvadate"]]
dfcdrfeatures["cdrodbest"]=dfcdrfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x), axis=1)
dfcdrfeatures["cdrodworst"]=dfcdrfeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x), axis=1)



In [51]:
#reset featurevariable 
dfcdrfeatures["cdrosbest"]=dfcdrfeatures[["pat_deid", "lowvadate"]].apply(lambda x: getlo(*x), axis=1)
dfcdrfeatures["cdrosworst"]=dfcdrfeatures[["pat_deid", "lowvadate"]].apply(lambda x: gethi(*x), axis=1)


In [52]:
dfcdrfeatures.head(20)
len(dfcdrfeatures)
#this field has a lot of missing data because not all providers use it - many free-text this finding into their notes 

Unnamed: 0,pat_deid,lowvadate,cdrodbest,cdrodworst,cdrosbest,cdrosworst
0,1174,2009-09-17,,,,
1,1790,2012-02-14,,,,
2,2262,2012-05-15,-0.497893,-0.497893,-0.493139,-0.493139
3,2610,2009-11-19,,,,
4,2736,2016-07-11,,,,
5,2920,2010-10-25,,,,
6,3178,2016-11-11,-0.034513,-0.034513,,
7,4062,2016-07-13,,,-0.957073,-0.957073
8,4690,2017-08-07,,,,
9,5361,2013-01-11,,,,


5612

In [53]:
len(dfcdrfeatures[~((dfcdrfeatures["cdrodbest"].isnull()) & 
              (dfcdrfeatures["cdrosbest"].isnull()) & 
              (dfcdrfeatures["cdrodworst"].isnull()) & 
              (dfcdrfeatures["cdrosworst"].isnull()))])

1504

## Combine the structured exam features into one matrix

In [44]:
#first we are going to get rid of all the redundant lowvadates 
del dfvafeatures["lowvadate"]
del dftfeatures["lowvadate"]
del dfcctfeatures["lowvadate"]
del dfrxfeatures["lowvadate"]
del dfcdrfeatures["lowvadate"]


dfexamstructured=pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(dfcohort[["pat_deid"]], dfvafeatures, on=["pat_deid"], how="outer"), 
         dftfeatures, on=["pat_deid"], how="outer"), 
        dfcctfeatures, on=["pat_deid"], how="outer"), 
                  dfrxfeatures, on=["pat_deid"], how="outer"),
         dfcdrfeatures, on=["pat_deid"], how="outer")

In [45]:
dfexamstructured.head()

Unnamed: 0,pat_deid,bcvalogmarodbest,bcvalogmarodworst,bcvalogmarodmed,bcvalogmarodlast,bcvalogmarosbest,bcvalogmarosworst,bcvalogmarosmed,bcvalogmaroslast,todlo,...,tosmed,toslast,cctodlast,cctoslast,rxodminus,rxosminus,cdrodbest,cdrodworst,cdrosbest,cdrosworst
0,1174,0.137679,0.137679,0.137679,0.137679,0.137679,0.137679,0.137679,0.137679,-0.079814,...,-0.079814,-0.079814,,,,,,,,
1,1790,-0.258972,-0.258972,-0.258972,-0.258972,-0.258972,-0.258972,-0.258972,-0.258972,0.552099,...,0.552099,0.552099,,,0.245315,0.245315,,,,
2,2262,-0.55129,-0.258972,-0.405131,-0.258972,-0.55129,-0.258972,-0.405131,-0.258972,-0.332579,...,-0.332579,-0.332579,,,0.245315,0.245315,-0.493139,-0.493139,-0.493139,-0.493139
3,2610,-0.066427,-0.066427,-0.066427,-0.066427,-0.066427,-0.066427,-0.066427,-0.066427,,...,,,,,,,,,,
4,2736,-0.258972,-0.258972,-0.258972,-0.258972,-0.258972,-0.258972,-0.258972,-0.258972,0.552099,...,0.552099,0.552099,,,,,,,,


In [46]:
dfexamstructured.dtypes

pat_deid               int64
bcvalogmarodbest     float64
bcvalogmarodworst    float64
bcvalogmarodmed      float64
bcvalogmarodlast     float64
bcvalogmarosbest     float64
bcvalogmarosworst    float64
bcvalogmarosmed      float64
bcvalogmaroslast     float64
todlo                float64
todhi                float64
todmed               float64
todlast              float64
toslo                float64
toshi                float64
tosmed               float64
toslast              float64
cctodlast            float64
cctoslast            float64
rxodminus            float64
rxosminus            float64
cdrodbest            float64
cdrodworst           float64
cdrosbest            float64
cdrosworst           float64
dtype: object

# Coded Variables 

## Medications - done

### Turn long dataframe to wide and filter out near zero variance features

In [54]:
dfmeds=pd.read_sql_query('''select * from medslong''', conn) 
dfmeds.columns = map(str.lower, dfmeds.columns)
dfmeds["rx_date"]=pd.to_datetime(dfmeds["rx_date"])
#dfmeds.head()
len(dfmeds)

138929

In [55]:
dfmeds=pd.merge(dfmeds,dfcohort[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid")
dfmeds.sort_values(by=["pat_deid", "rx_date"], ascending=True, inplace=True)
dfmeds=dfmeds[dfmeds["rx_date"]<=dfmeds["lowvadate"]]
#dfmeds.head()

In [56]:
dfmeds["pivotvalue"]=1
dfmeds["medication_id"]=dfmeds["medication_id"].astype(int)
dfmedswide=dfmeds.pivot_table(values="pivotvalue", index=['pat_deid'], columns='medication_id', fill_value=0)
dfmedswide.columns = ['med_'+str(col) for col in dfmedswide.columns.values]
dfmedswide.head()

Unnamed: 0_level_0,med_1,med_2,med_51,med_62,med_84,med_85,med_87,med_98,med_100,med_101,...,med_542002,med_550003,med_550007,med_550008,med_550009,med_550011,med_550012,med_550013,med_575029,med_590201
pat_deid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2262,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2736,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
dfmedswide.shape

(5132, 7785)

In [58]:
#let's filter out near zero variance features for the medications
from sklearn.feature_selection import VarianceThreshold
selector=VarianceThreshold(.99 * (1 - .99))

selector.fit_transform(np.array(dfmedswide.loc[:, 'med_1':'med_590201'])).shape

def variance_threshold_selector(data, threshold=0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

dfmedsfiltered=variance_threshold_selector(dfmedswide.loc[:, 'med_1':'med_590201'], .99 * (1 - .99))

(5132, 361)

In [59]:
dfmedsfiltered.reset_index(inplace=True)

In [60]:
dfmedsfiltered.head()

Unnamed: 0,pat_deid,med_98,med_101,med_102,med_113,med_310,med_367,med_368,med_435,med_451,...,med_225803,med_232553,med_540147,med_540151,med_540507,med_540523,med_540557,med_540574,med_540619,med_540894
0,1174,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1790,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2262,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2610,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2736,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Diagnoses - done

In [61]:
dfdx=pd.read_sql_query('''select * from dxlong''', conn) 
dfdx.columns = map(str.lower, dfdx.columns)
dfdx["dx_date"]=pd.to_datetime(dfdx["dx_date"])
dfdx.head()
len(dfdx)

Unnamed: 0,pat_deid,dx_date,icd9_list,icd10_list,pivotvalue
0,1174,2009-09-17,362.56,H35.379,1
1,1174,2009-09-17,379.21,H43.819,1
2,1174,2009-09-17,"250.50, 362.02",E11.3599,1
3,1174,2009-09-17,V43.1,Z96.1,1
4,1174,2009-09-17,371.50,H18.50,1


180095

In [62]:
#one of the issues is that sometimes icd9 is missing, and sometimes icd10 is missing. Let's create a combined column 
dfdx["icd9_list"]="icd9_"+dfdx["icd9_list"].astype(str)
dfdx["icd10_list"]="icd10_"+dfdx["icd10_list"].astype(str)
dfdx["icd"]=np.where(dfdx["icd10_list"]=="icd10_None", dfdx['icd9_list'], dfdx["icd10_list"])

In [63]:
dfdx.head()

Unnamed: 0,pat_deid,dx_date,icd9_list,icd10_list,pivotvalue,icd
0,1174,2009-09-17,icd9_362.56,icd10_H35.379,1,icd10_H35.379
1,1174,2009-09-17,icd9_379.21,icd10_H43.819,1,icd10_H43.819
2,1174,2009-09-17,"icd9_250.50, 362.02",icd10_E11.3599,1,icd10_E11.3599
3,1174,2009-09-17,icd9_V43.1,icd10_Z96.1,1,icd10_Z96.1
4,1174,2009-09-17,icd9_371.50,icd10_H18.50,1,icd10_H18.50


In [64]:
dfdxwide=dfdx.pivot_table(values="pivotvalue", index=['pat_deid'], columns='icd', fill_value=0)
dfdxwide.head()

icd,icd10_A04.72,icd10_A15.0,icd10_A15.8,icd10_A15.9,icd10_A31.8,icd10_A41.9,icd10_A49.9,icd10_A53.0,icd10_A60.00,icd10_A65,...,icd9_995.3,icd9_996.52,icd9_E947.9,icd9_IMO0001,icd9_V19.1,icd9_V42.5,icd9_V42.81,icd9_V58.32,icd9_V68.89,icd9_V76.12
pat_deid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2262,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2736,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
selector=VarianceThreshold(.99 * (1 - .99))

selector.fit_transform(np.array(dfdxwide)).shape

def variance_threshold_selector(data, threshold=0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

dfdxfiltered=variance_threshold_selector(dfdxwide, .99 * (1 - .99))

(5547, 129)

In [66]:
dfdxfiltered.reset_index(inplace=True)

## Surgeries

In [67]:
dfcpt=pd.read_sql_query('''select * from cpt''', conn) 
dfcpt.columns = map(str.lower, dfcpt.columns)
dfcpt=pd.merge(dfcohort["pat_deid"], dfcpt, on="pat_deid", how="left").fillna(0)
dfcpt.set_index("pat_deid", inplace=True)
dfcpt.head()
len(dfcpt)

Unnamed: 0_level_0,cpt_65091,cpt_65093,cpt_65105,cpt_65175,cpt_65205,cpt_65210,cpt_65220,cpt_65222,cpt_65235,cpt_65265,...,cpt_68760,cpt_68761,cpt_68801,cpt_68810,cpt_68815,cpt_68840,cpt_0191t,cpt_0192t,cpt_0449t,cpt_0474t
pat_deid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


5612

In [68]:
selector=VarianceThreshold(.99 * (1 - .99))

selector.fit_transform(np.array(dfcpt)).shape

def variance_threshold_selector(data, threshold=0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

dfcptfiltered=variance_threshold_selector(dfcpt, .99 * (1 - .99))

(5612, 10)

In [69]:
dfcptfiltered.reset_index(inplace=True)

## Demographics

dfpt=pd.read_sql_query('''select d.pat_deid, o.lowvadate - d.birth_date as age, d.gender as gender_Female, d.race as race_asian, d.race as race_white, d.race as race_black, d.race as race_Pacific_Islander,d.race as race_Native_American, d.race as race_other, d.race as race_unknown, d.ethnicity as Ethnicity_Non_Hispanic,d.ethnicity as Ethnicity_Hispanic from demographics as d, outcome as o where d.pat_deid = o.pat_deid
''', conn)
dfpt.columns = map(str.lower, dfpt.columns)
dfpt['gender_female'] = (dfpt['gender_female'] == 'Female').astype(int)
dfpt['race_asian'] = (dfpt['race_asian'] == 'Asian').astype(int)
dfpt['race_white'] = (dfpt['race_white'] == 'White').astype(int)
dfpt['race_black'] = (dfpt['race_black'] == 'Black').astype(int)
dfpt['race_pacific_islander'] = (dfpt['race_pacific_islander'] == 'Pacific Islander').astype(int)
dfpt['race_native_american'] = (dfpt['race_native_american'] == 'Native_American').astype(int)
dfpt['race_other'] = (dfpt['race_other'] == 'Other').astype(int)
dfpt['race_unknown'] = (dfpt['race_unknown'] == 'Unknown').astype(int)
dfpt['ethnicity_non_hispanic'] = (dfpt['ethnicity_non_hispanic'] == 'Non-Hispanic').astype(int)
dfpt['ethnicity_hispanic'] = (dfpt['ethnicity_hispanic'] == 'Hispanic/Latino').astype(int)
#dfpt['ethnicity_unkown'] = (dfpt['ethnicity_unknown'] == 'Unknown').astype(int)

dfpt.head()

dfpt["birth_date"]=pd.to_datetime(dfpt["birth_date"])
from datetime import timedelta, date
future = dfpt['birth_date'] > date(year=2010,month=1,day=1) #specifies the cutoff year
dfpt.loc[future, 'birth_date'] -= timedelta(days=365.25*100)
dfpt.head()

dfpt["age"].mean()
dfpt["age"].std()

#normalize age
dfpt["agestandard"]=(dfpt["age"]-dfpt["age"].mean())/dfpt["age"].std()
#check and make sure it worked 
dfpt["agestandard"].mean()
dfpt["agestandard"].std()

dfpt=pd.read_sql_query('''select demographics.pat_deid, birth_date, gender, race, ethnicity 
from demographics''',conn)

dfpt["raceth"]=np.where(dfpt["ethnicity"]=="Hispanic/Latino", "Hispanic", dfpt["race"])
dfpt.loc[dfpt.raceth == "Pacific Islander", 'raceth'] = "Asian"
dfpt.loc[dfpt.raceth == "Native American", 'raceth'] = "Other"
dfpt.loc[dfpt.raceth == "Unknown", 'raceth'] = "Other"

dfpt["raceth"].value_counts()
dfpt["raceth"].value_counts()/5612
dfpt["gender"].value_counts()

In [82]:

dfpt=pd.read_sql_query('''select demographics.pat_deid, birth_date, gender, race, ethnicity, lowvadate from demographics, outcome where outcome.pat_deid = demographics.pat_deid''',conn)
dfpt.columns = map(str.lower, dfpt.columns)
dfpt["lowvadate"]=pd.to_datetime(dfpt["lowvadate"])
dfpt.head()
dfpt["birth_date"]=pd.to_datetime(dfpt["birth_date"])
from datetime import timedelta, date
def fix_date(x):
    if x.year >=2000:
        year = x.year - 100
    else:
        year = x.year
    return date(year,x.month,x.day)

dfpt['birth_date'] = dfpt['birth_date'].apply(fix_date)
dfpt["birth_date"]=pd.to_datetime(dfpt["birth_date"])
dfpt.head()
dfpt["raceth"]=np.where(dfpt["ethnicity"]=="Hispanic/Latino", "Hispanic", dfpt["race"])
dfpt["raceth"].value_counts()
dfpt.loc[dfpt.raceth == "Pacific Islander", 'raceth'] = "Asian"
dfpt.loc[dfpt.raceth == "Native American", 'raceth'] = "Other"
dfpt.loc[dfpt.raceth == "Unknown", 'raceth'] = "Other"
dfpt['age']=dfpt["lowvadate"].dt.year-dfpt["birth_date"].dt.year
dfpt["raceth"].value_counts()
dfpt.head()
dfpt.dtypes


Unnamed: 0,pat_deid,birth_date,gender,race,ethnicity,lowvadate
0,1174,1934-04-26 00:00:00,Male,Asian,Non-Hispanic,2009-09-17
1,1790,1935-07-01 00:00:00,Male,White,Non-Hispanic,2012-02-14
2,2262,1955-07-10 00:00:00,Male,Other,Hispanic/Latino,2012-05-15
3,2610,1972-05-13 00:00:00,Female,Asian,Non-Hispanic,2009-11-19
4,2736,1986-09-29 00:00:00,Male,Other,Non-Hispanic,2016-07-11


Unnamed: 0,pat_deid,birth_date,gender,race,ethnicity,lowvadate
0,1174,1934-04-26,Male,Asian,Non-Hispanic,2009-09-17
1,1790,1935-07-01,Male,White,Non-Hispanic,2012-02-14
2,2262,1955-07-10,Male,Other,Hispanic/Latino,2012-05-15
3,2610,1972-05-13,Female,Asian,Non-Hispanic,2009-11-19
4,2736,1986-09-29,Male,Other,Non-Hispanic,2016-07-11


White               2335
Asian               1207
Hispanic             988
Other                579
Black                222
Unknown              214
Pacific Islander      63
Native American        4
Name: raceth, dtype: int64

White       2335
Asian       1270
Hispanic     988
Other        797
Black        222
Name: raceth, dtype: int64

Unnamed: 0,pat_deid,birth_date,gender,race,ethnicity,lowvadate,raceth,age
0,1174,1934-04-26,Male,Asian,Non-Hispanic,2009-09-17,Asian,75
1,1790,1935-07-01,Male,White,Non-Hispanic,2012-02-14,White,77
2,2262,1955-07-10,Male,Other,Hispanic/Latino,2012-05-15,Hispanic,57
3,2610,1972-05-13,Female,Asian,Non-Hispanic,2009-11-19,Asian,37
4,2736,1986-09-29,Male,Other,Non-Hispanic,2016-07-11,Other,30


pat_deid               int64
birth_date    datetime64[ns]
gender                object
race                  object
ethnicity             object
lowvadate     datetime64[ns]
raceth                object
age                    int64
dtype: object

In [83]:
#standardize age
dfpt["agestandard"]=(dfpt["age"]-dfpt["age"].mean())/dfpt["age"].std()
#check and make sure it worked 
dfpt["agestandard"].mean()
dfpt["agestandard"].std()

del dfpt["race"]
del dfpt["ethnicity"]

dfpt=pd.get_dummies(dfpt)

dfpt.head()

#clean up original variables by deleting them 
del dfpt["birth_date"]
del dfpt["age"]
del dfpt["lowvadate"]

5.907209464595006e-17

0.999999999999997

Unnamed: 0,pat_deid,birth_date,lowvadate,age,agestandard,gender_Female,gender_Male,raceth_Asian,raceth_Black,raceth_Hispanic,raceth_Other,raceth_White
0,1174,1934-04-26,2009-09-17,75,0.382059,0,1,1,0,0,0,0
1,1790,1935-07-01,2012-02-14,77,0.479912,0,1,0,0,0,0,1
2,2262,1955-07-10,2012-05-15,57,-0.498621,0,1,0,0,1,0,0
3,2610,1972-05-13,2009-11-19,37,-1.477155,1,0,1,0,0,0,0
4,2736,1986-09-29,2016-07-11,30,-1.819641,0,1,0,0,0,1,0


# Final merge of standardized structured exam features with demographics, and medications,  diagnoses,and surgeries (nzv filtered)

In [84]:
dfoutcome=pd.read_sql_query('''select pat_deid, outcome from outcome''', conn)

In [85]:
dfstructured=pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(dfoutcome, dfpt, on="pat_deid", how="outer"),
         dfmedsfiltered, on="pat_deid", how="outer").fillna(0),
        dfdxfiltered,on="pat_deid", how="outer").fillna(0),
        dfcptfiltered,on="pat_deid", how="outer").fillna(0),
        dfvafeatures, on="pat_deid", how="outer"),
         dftfeatures, on="pat_deid", how="outer"), 
        dfcctfeatures, on="pat_deid", how="outer"), 
        dfrxfeatures, on="pat_deid", how="outer"),
         dfcdrfeatures, on="pat_deid", how="outer")

#joined the boolean variables first, as they can have fillna(0) without changing the meaning
#we will fill/impute the numeric variables but also create a missing values indicator below for those 

In [86]:
import math 
def missingindicator(x): 
    if math.isnan(x): 
        return 1 
    else: 
        return 0 

In [87]:
missingcols=[ 'bcvalogmarodbest',
 'bcvalogmarodworst',
 'bcvalogmarodmed',
 'bcvalogmarodlast',
 'bcvalogmarosbest',
 'bcvalogmarosworst',
 'bcvalogmarosmed',
 'bcvalogmaroslast',
 'todlo',
 'todhi',
 'todmed',
 'todlast',
 'toslo',
 'toshi',
 'tosmed',
 'toslast',
 'cctodlast',
 'cctoslast',
 'rxodminus',
 'rxosminus',
 'cdrodbest',
 'cdrodworst',
 'cdrosbest',
 'cdrosworst']

In [88]:
for col in missingcols: 
    dfstructured[col+'missing']=dfstructured[col].apply(missingindicator)
dfstructured.columns 

Index(['pat_deid', 'outcome', 'agestandard', 'gender_Female', 'gender_Male',
       'raceth_Asian', 'raceth_Black', 'raceth_Hispanic', 'raceth_Other',
       'raceth_White',
       ...
       'tosmedmissing', 'toslastmissing', 'cctodlastmissing',
       'cctoslastmissing', 'rxodminusmissing', 'rxosminusmissing',
       'cdrodbestmissing', 'cdrodworstmissing', 'cdrosbestmissing',
       'cdrosworstmissing'],
      dtype='object', length=563)

In [89]:
#now fill the rest of the missing values, equivalent to mean imputation 
dfstructured=dfstructured.fillna(0)

In [90]:
dfstructured.isna().any()

pat_deid             False
outcome              False
agestandard          False
gender_Female        False
gender_Male          False
                     ...  
rxosminusmissing     False
cdrodbestmissing     False
cdrodworstmissing    False
cdrosbestmissing     False
cdrosworstmissing    False
Length: 563, dtype: bool

In [91]:
dfstructured.to_csv('lowva-structured-02.csv', index=False)

In [86]:
conn.close()

In [92]:
list(dfstructured.columns)

['pat_deid',
 'outcome',
 'agestandard',
 'gender_Female',
 'gender_Male',
 'raceth_Asian',
 'raceth_Black',
 'raceth_Hispanic',
 'raceth_Other',
 'raceth_White',
 'med_98',
 'med_101',
 'med_102',
 'med_113',
 'med_310',
 'med_367',
 'med_368',
 'med_435',
 'med_451',
 'med_680',
 'med_681',
 'med_717',
 'med_718',
 'med_736',
 'med_856',
 'med_860',
 'med_988',
 'med_1080',
 'med_1300',
 'med_1755',
 'med_1767',
 'med_1821',
 'med_2007',
 'med_2017',
 'med_2291',
 'med_2405',
 'med_2444',
 'med_2566',
 'med_2567',
 'med_2623',
 'med_2888',
 'med_3074',
 'med_3189',
 'med_3208',
 'med_3233',
 'med_3294',
 'med_3295',
 'med_3489',
 'med_3700',
 'med_3720',
 'med_3772',
 'med_3774',
 'med_3841',
 'med_3844',
 'med_3845',
 'med_4206',
 'med_4363',
 'med_4364',
 'med_4420',
 'med_4421',
 'med_4422',
 'med_4423',
 'med_4448',
 'med_4526',
 'med_4572',
 'med_4573',
 'med_4973',
 'med_5005',
 'med_5009',
 'med_5016',
 'med_5393',
 'med_5604',
 'med_5678',
 'med_5680',
 'med_5751',
 'med_5938