In [1]:
import pandas as pd
import re 
import sqlite3
import numpy as np
import ast
import sklearn 
import math 

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
conn = sqlite3.connect('dbstridefull.db')
conn.text_factory = str
cur = conn.cursor()

# Find Low Vision Patients, and Define Outcome

In [3]:
dfexam=pd.read_sql_query('''select pat_deid, DATE_OF_SERVICE, 
vaoddistcc, vaoddistsc, vaoddistccph, vaoddistscph, vaosdistcc, vaosdistsc, vaosdistccph, vaosdistscph 
from examfield, examparsed 
where examfield.smartformid = examparsed.smartformid 
and not (vaoddistcc is null and vaoddistsc is null and vaosdistcc is null and vaosdistsc is null)
            order by pat_deid 
''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["date_of_service"])
del dfexam["date_of_service"]
len(dfexam)
dfexam.head()

552184

Unnamed: 0,pat_deid,vaoddistcc,vaoddistsc,vaoddistccph,vaoddistscph,vaosdistcc,vaosdistsc,vaosdistccph,vaosdistscph,exam_date
0,63,20/40,,NI,,20/60,,20/50,,2018-09-05
1,63,20/60,,20/50,,20/70,,20/60,,2018-10-17
2,200,20/25,,,,20/20,,,,2013-11-04
3,200,20/20,,,,20/20,,,,2013-08-26
4,200,20/50,,20/30,,20/20,,,,2013-09-06


In [4]:
def logmarconversion(va): 
    #takes a string input in the form of "20/20", or "cf" "hm" etc and spits out a logmar 
    if len(re.findall('(?i)cf', va))>0: 
        logmarva=-np.log10(.0025)
    elif len(re.findall('(?i)hm', va))>0: 
        logmarva=-np.log10(.002)
    elif len(re.findall('(?i)nlp', va))>0:
        logmarva=-np.log10(0.0013)
    elif len(re.findall('(?i)lp', va))>0: 
        logmarva=-np.log10(0.0016)
    elif len(re.findall('(?i)20/1600', va))>0: 
        logmarva=-np.log10(20/1600)
    elif len(re.findall('(?i)20/1250', va))>0: 
        logmarva=-np.log10(20/1250)
    elif len(re.findall('(?i)20/1000', va))>0: 
        logmarva=-np.log10(20/1000)
    elif len(re.findall('(?i)20/800', va))>0: 
        logmarva=-np.log10(20/800)
    elif len(re.findall('(?i)20/650', va))>0: 
        logmarva=-np.log10(20/650)
    elif len(re.findall('(?i)20/500', va))>0: 
        logmarva=-np.log10(20/500)
    elif len(re.findall('(?i)20/400', va))>0: 
        logmarva=-np.log10(20/400)
    elif len(re.findall('(?i)20/350', va))>0: 
        logmarva=-np.log10(20/350)
    elif len(re.findall('(?i)20/300', va))>0: 
        logmarva=-np.log10(20/300)
    elif len(re.findall('(?i)20/250', va))>0: 
        logmarva=-np.log10(20/250)
    elif len(re.findall('(?i)20/225', va))>0: 
        logmarva=-np.log10(20/225)
    elif len(re.findall('(?i)20/200', va))>0: 
        logmarva=-np.log10(20/200)
    elif len(re.findall('(?i)20/160', va))>0: 
        logmarva=-np.log10(20/160)
    elif len(re.findall('(?i)20/150', va))>0: 
        logmarva=-np.log10(20/150)
    elif len(re.findall('(?i)20/125', va))>0: 
        logmarva=-np.log10(20/125)
    elif len(re.findall('(?i)20/120', va))>0: 
        logmarva=-np.log10(20/120)
    elif len(re.findall('(?i)20/100', va))>0: 
        logmarva=-np.log10(20/100)
    elif len(re.findall('(?i)20/80', va))>0: 
        logmarva=-np.log10(20/80)
    elif len(re.findall('(?i)20/70', va))>0: 
        logmarva=-np.log10(20/70)
    elif len(re.findall('(?i)20/63', va))>0: 
        logmarva=-np.log10(20/63)
    elif len(re.findall('(?i)20/60', va))>0: 
        logmarva=-np.log10(20/60)
    elif len(re.findall('(?i)20/50', va))>0: 
        logmarva=-np.log10(20/50)
    elif len(re.findall('(?i)20/40', va))>0: 
        logmarva=-np.log10(20/40)
    elif len(re.findall('(?i)20/32', va))>0: 
        logmarva=-np.log10(20/32)
    elif len(re.findall('(?i)20/30', va))>0: 
        logmarva=-np.log10(20/30)
    elif len(re.findall('(?i)20/25', va))>0: 
        logmarva=-np.log10(20/25)
    elif len(re.findall('(?i)20/20', va))>0: 
        logmarva=-np.log10(20/20)
    elif len(re.findall('(?i)20/16', va))>0: 
        logmarva=-np.log10(20/16)
    elif len(re.findall('(?i)20/15', va))>0: 
        logmarva=-np.log10(20/15)
    elif len(re.findall('(?i)20/10', va))>0: 
        logmarva=-np.log10(20/10)
        
    else: logmarva=np.nan 
    return logmarva 

#now write a function which will take several va inputs and output the bcva logmar 
def bcvalogmar(vadistsc, vadistcc, vadistscph, vadistccph): 
    valist=list(filter(None.__ne__, [vadistsc, vadistcc, vadistscph, vadistccph])) #filter's out whichever are None
    logmarlist=[] 
    for va in valist: 
        if np.isnan(logmarconversion(va)) == False: 
            logmarlist.append(logmarconversion(va)) 
    try: bcvalogmar=min(logmarlist)
    except ValueError: bcvalogmar=np.nan #if no va's were recorded and all None input then this is an empty list with no minum
    return bcvalogmar 

In [5]:
dfva=dfexam
dfva["bcvalogmarod"]=dfva[["vaoddistsc", "vaoddistcc", "vaoddistscph", "vaoddistccph"]].apply(lambda x: bcvalogmar(*x), axis=1)
dfva["bcvalogmaros"]=dfva[["vaosdistsc", "vaosdistcc", "vaosdistscph", "vaosdistccph"]].apply(lambda x: bcvalogmar(*x), axis=1)

In [6]:
#there are some cases wehre vision was measured only in one eye. In this case we should do a last values carried forward for the patient 
dfva.sort_values(["pat_deid", "exam_date"], inplace=True)

In [7]:
dfva["bcvalogmarod"]=dfva[["pat_deid","bcvalogmarod"]].groupby(["pat_deid"]).ffill()["bcvalogmarod"]
dfva["bcvalogmaros"]=dfva[["pat_deid","bcvalogmaros"]].groupby(["pat_deid"]).ffill()["bcvalogmaros"]

In [8]:
dfva.head(20)

Unnamed: 0,pat_deid,vaoddistcc,vaoddistsc,vaoddistccph,vaoddistscph,vaosdistcc,vaosdistsc,vaosdistccph,vaosdistscph,exam_date,bcvalogmarod,bcvalogmaros
0,63,20/40,,NI,,20/60,,20/50,,2018-09-05,0.30103,0.39794
1,63,20/60,,20/50,,20/70,,20/60,,2018-10-17,0.39794,0.477121
3,200,20/20,,,,20/20,,,,2013-08-26,-0.0,-0.0
4,200,20/50,,20/30,,20/20,,,,2013-09-06,0.176091,-0.0
5,200,20/20,,,,20/20,,,,2013-09-19,-0.0,-0.0
6,200,20/25,,,,20/20,,,,2013-10-04,0.09691,-0.0
2,200,20/25,,,,20/20,,,,2013-11-04,0.09691,-0.0
12,215,20/20,,,,20/20,,,,2012-01-06,-0.0,-0.0
8,215,20/15,,,,20/15,,,,2013-02-04,-0.124939,-0.124939
9,215,20/20,,,,20/15,,,,2013-08-26,-0.0,-0.124939


In [9]:
def bcva(bcvalogmarod, bcvalogmaros): 
    if np.isnan(bcvalogmaros) == True: 
        return bcvalogmarod
    if np.isnan(bcvalogmarod) == True: 
        return bcvalogmaros
    if bcvalogmarod <= bcvalogmaros: 
            return bcvalogmarod 
    else: 
        return bcvalogmaros 
bcva(0.1, 0.2)

0.1

In [10]:
dfva["bcvalogmar"]=dfva[["bcvalogmarod", "bcvalogmaros"]].apply(lambda x: bcva(*x), axis=1)

In [11]:
dfva.head()

Unnamed: 0,pat_deid,vaoddistcc,vaoddistsc,vaoddistccph,vaoddistscph,vaosdistcc,vaosdistsc,vaosdistccph,vaosdistscph,exam_date,bcvalogmarod,bcvalogmaros,bcvalogmar
0,63,20/40,,NI,,20/60,,20/50,,2018-09-05,0.30103,0.39794,0.30103
1,63,20/60,,20/50,,20/70,,20/60,,2018-10-17,0.39794,0.477121,0.39794
3,200,20/20,,,,20/20,,,,2013-08-26,-0.0,-0.0,-0.0
4,200,20/50,,20/30,,20/20,,,,2013-09-06,0.176091,-0.0,-0.0
5,200,20/20,,,,20/20,,,,2013-09-19,-0.0,-0.0,-0.0


In [12]:
dfva[dfva["bcvalogmar"].isnull()]
#these guys with the missing visual acuities will end up getting dropped from our lowva cohort 

Unnamed: 0,pat_deid,vaoddistcc,vaoddistsc,vaoddistccph,vaoddistscph,vaosdistcc,vaosdistsc,vaosdistccph,vaosdistscph,exam_date,bcvalogmarod,bcvalogmaros,bcvalogmar
130,1282,,Fix and follow,,,,Fix and follow,,,2017-05-01,,,
651,4487,,BTL,,,,BTL,,,2015-09-21,,,
958,6240,,Fix and follow,,,,Fix and follow,,,2016-12-07,,,
959,6240,,Fix and follow,,,,Fix and follow,,,2017-01-03,,,
960,6240,,Fix and follow,,,,Fix and follow,,,2018-01-24,,,
1326,8851,,,,,,,,,2014-09-19,,,
2102,13481,,RTL,,,,RTL,,,2014-04-08,,,
3755,23877,,BTL,,,,BTL,,,2018-11-02,,,
4107,26476,,"cannot evaluate, pt does not respond appropria...",,,,"cannot evaluate, pt does not respond appropria...",,,2010-11-12,,,
4130,27071,,Follows,,,,?,,,2009-06-25,,,


In [13]:
dfva["bcvalt40"]=np.where(dfva["bcvalogmar"]>0.30103,1,0)
dfva["bcvalt200"]=np.where(dfva["bcvalogmar"]>1,1,0)

In [14]:
dfva.head()

Unnamed: 0,pat_deid,vaoddistcc,vaoddistsc,vaoddistccph,vaoddistscph,vaosdistcc,vaosdistsc,vaosdistccph,vaosdistscph,exam_date,bcvalogmarod,bcvalogmaros,bcvalogmar,bcvalt40,bcvalt200
0,63,20/40,,NI,,20/60,,20/50,,2018-09-05,0.30103,0.39794,0.30103,0,0
1,63,20/60,,20/50,,20/70,,20/60,,2018-10-17,0.39794,0.477121,0.39794,1,0
3,200,20/20,,,,20/20,,,,2013-08-26,-0.0,-0.0,-0.0,0,0
4,200,20/50,,20/30,,20/20,,,,2013-09-06,0.176091,-0.0,-0.0,0,0
5,200,20/20,,,,20/20,,,,2013-09-19,-0.0,-0.0,-0.0,0,0


In [66]:
dfva["bcvalt40"].mean()
dfva["bcvalt200"].mean()

0.14125545108152354

0.025147414629905974

In [67]:
len(dfva["pat_deid"].drop_duplicates())

86169

In [68]:
len(dfva[dfva["bcvalt40"]==1]["pat_deid"].drop_duplicates())
len(dfva[dfva["bcvalt200"]==1]["pat_deid"].drop_duplicates())

13847

3203

In [126]:
len(dfva[["pat_deid", "exam_date", "bcvalogmar"]].drop_duplicates())

502526

In [131]:
#we have one more problem which is that patients could have more than one smartform for vision per day (i.e., hvf etc.) 
#and they might have slightly different bcva's depending on how it was measured 
#so we need to take the best one of the bcvalogmars (lowest number) and only keep that one 
dfva=dfva.sort_values(["pat_deid", "exam_date", "bcvalogmar"])
dfva=dfva.drop_duplicates(["pat_deid", "exam_date"])
len(dfva)

489077

In [132]:
lowvafirstdate=dfva[dfva["bcvalt40"]==1].sort_values(["pat_deid", "exam_date"])[["pat_deid", "exam_date", "bcvalt40"]].drop_duplicates(["pat_deid"])


In [133]:
lowvafirstdate = lowvafirstdate.rename(columns = {
    'exam_date':'lowvadate'
})


In [134]:
lowvafirstdate.head()

Unnamed: 0,pat_deid,lowvadate,bcvalt40
1,63,2018-10-17,1
29,310,2017-03-23,1
112,1174,2009-09-17,1
133,1440,2011-06-16,1
161,1790,2012-02-14,1


In [135]:
lowvacohort=pd.merge(dfva[["pat_deid","exam_date","bcvalogmar", "bcvalt40"]], lowvafirstdate[["pat_deid", "lowvadate"]], on="pat_deid", how="right")
len(lowvacohort) 

In [136]:
#now we need those with at least one year of follow-up after the low vision date. 
lowvacohort["fudays"]=lowvacohort["exam_date"]-lowvacohort["lowvadate"]
len(lowvacohort[lowvacohort["fudays"]>=pd.Timedelta(days=365)]["pat_deid"].drop_duplicates()) #reduces cohort to 5612 

5612

In [140]:
cohortidlist=list(lowvacohort[lowvacohort["fudays"]>=pd.Timedelta(days=365)]["pat_deid"].drop_duplicates())
len(cohortidlist)

5612

In [141]:
lowvacohort=lowvacohort[lowvacohort["pat_deid"].isin(cohortidlist)]

In [142]:
lowvacohort.head()

Unnamed: 0,pat_deid,exam_date,bcvalogmar,bcvalt40,lowvadate,fudays
8,1174,2009-09-17,0.39794,1,2009-09-17,0 days
9,1174,2009-11-04,0.39794,1,2009-09-17,48 days
10,1174,2010-10-19,0.30103,0,2009-09-17,397 days
11,1174,2011-10-06,0.39794,1,2009-09-17,749 days
12,1174,2012-04-05,0.30103,0,2009-09-17,931 days


In [143]:
#now let's just determine how many folks had the outcome afterwards 
#we can define the outcome as bcvalt40 = 1 for all subsequent visits within 1 year 
#sum bcvalt40 by pat_deid and divide by count. if 1.0 that means all visits had bcvalt40==1 
outcome=lowvacohort[(lowvacohort["fudays"]<=pd.Timedelta(days=365)) & (lowvacohort["fudays"]>=pd.Timedelta(days=0))].groupby(["pat_deid"]).sum()/lowvacohort[(lowvacohort["fudays"]<=pd.Timedelta(days=365)) & (lowvacohort["fudays"]>=pd.Timedelta(days=0))].groupby(["pat_deid"]).count()
outcome.reset_index(inplace=True)
outcome=outcome[["pat_deid","bcvalt40"]]

In [144]:
len(outcome[outcome["bcvalt40"]==1])/len(outcome)

0.4059158945117605

In [145]:
#good! the set is not too imbalanced 
outcomeidlist=outcome[outcome["bcvalt40"]==1]["pat_deid"]
outcomeidlist

0          1174
3          2610
4          2736
5          2920
6          3178
8          4690
10         6895
11         6938
12         7478
13         9973
14        10342
21        15391
23        19167
25        19918
26        21102
30        22365
32        25358
34        28092
36        29059
38        29181
39        31157
40        31517
44        35265
47        39197
51        41747
53        44894
55        46374
58        48250
62        51597
63        53049
         ...   
5544    3804543
5548    3805547
5551    3807164
5553    3807765
5556    3808157
5558    3810091
5560    3811372
5561    3811618
5562    3812206
5563    3812950
5565    3813393
5567    3814217
5571    3816422
5573    3818763
5574    3819245
5578    3821277
5580    3821647
5583    3823881
5586    3825201
5587    3825324
5588    3825384
5592    3827448
5594    3828906
5595    3829355
5596    3830745
5597    3831209
5599    3833500
5602    3835986
5603    3836409
5607    3840336
Name: pat_deid, Length: 

In [104]:
A = np.random.randn(4,3)
B = np.sum(A, axis = 1, keepdims = True)

In [105]:
B.shape

(4, 1)

In [146]:
outcome["outcome"]=np.where(outcome["bcvalt40"]==1, 1, 0)
outcome

Unnamed: 0,pat_deid,bcvalt40,outcome
0,1174,1.000000,1
1,1790,0.333333,0
2,2262,0.250000,0
3,2610,1.000000,1
4,2736,1.000000,1
5,2920,1.000000,1
6,3178,1.000000,1
7,4062,0.500000,0
8,4690,1.000000,1
9,5361,0.500000,0


In [147]:
len(cohortidlist)

5612

In [None]:
outcome[["pat_deid","outcome"]]

In [149]:
lowvacohortsqltable=lowvacohort[lowvacohort["fudays"]==pd.Timedelta(days=0)][["pat_deid", "lowvadate", "bcvalogmar"]].drop_duplicates()

In [150]:
#export back to the full database 
lowvacohortsqltable.to_sql('lowvacohort', conn, if_exists='replace', index=False)

# Get Structured Data For Cohort

## Demographics

In [152]:
dfpt=pd.read_sql_query('''select pat_deid, birth_date, gender, race, ethnicity from patients 
where pat_deid in (select pat_deid
from lowvacohort) 
order by pat_deid''', conn)
dfpt.columns = map(str.lower, dfpt.columns)

dfpt["birth_date"]=pd.to_datetime(dfpt["birth_date"])
from datetime import timedelta, date
future = dfpt['birth_date'] > date(year=2010,month=1,day=1) #specifies the cutoff year
dfpt.loc[future, 'birth_date'] -= timedelta(days=365.25*100)
dfpt.head()
len(dfpt)

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  if __name__ == '__main__':


Unnamed: 0,pat_deid,birth_date,gender,race,ethnicity
0,1174,1934-04-26,Male,Asian,Non-Hispanic
1,1790,1935-07-01,Male,White,Non-Hispanic
2,2262,1955-07-10,Male,Other,Hispanic/Latino
3,2610,1972-05-13,Female,Asian,Non-Hispanic
4,2736,1986-09-29,Male,Other,Non-Hispanic


5612

## Diagnoses

In [3]:
#diagnoses 
dfdx=pd.read_sql_query('''select pat_deid, start_date, icd9_list, icd10_list from diagnoses where pat_deid in (select distinct pat_deid
from lowvacohort) 
order by pat_deid''', conn)
dfdx.columns = map(str.lower, dfdx.columns)

dfdx["dx_date"]=pd.to_datetime(dfdx["start_date"])

del dfdx["start_date"]

len(dfdx)
dfdx.head()

985770

Unnamed: 0,pat_deid,icd9_list,icd10_list,dx_date
0,1174,362.56,H35.379,2009-09-17
1,1174,"250.50, 362.02",E11.3599,2010-10-19
2,1174,362.07,,2013-07-12
3,1174,379.21,H43.819,2009-09-17
4,1174,371.50,H18.50,2014-03-24


In [6]:
lowvacohortsqltable=pd.read_sql_query('''select * from lowvacohort''', conn)
lowvacohortsqltable["lowvadate"]=pd.to_datetime(lowvacohortsqltable["lowvadate"])


In [7]:
dfdx=pd.merge(dfdx,lowvacohortsqltable[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid", how="right")
dfdx.sort_values(by=["pat_deid", "dx_date"], ascending=True, inplace=True)
dfdx=dfdx[(dfdx["dx_date"]<=dfdx["lowvadate"])]
len(dfdx) 
dfdx.head()

180095

Unnamed: 0,pat_deid,icd9_list,icd10_list,dx_date,lowvadate
0,1174,362.56,H35.379,2009-09-17,2009-09-17
3,1174,379.21,H43.819,2009-09-17,2009-09-17
5,1174,"250.50, 362.02",E11.3599,2009-09-17,2009-09-17
9,1174,V43.1,Z96.1,2009-09-17,2009-09-17
19,1174,371.50,H18.50,2009-09-17,2009-09-17


In [10]:
dfdx["pivotvalue"]=1

dfdxwide=dfdx.pivot_table(values="pivotvalue", index='pat_deid', columns='icd10_list', fill_value=0)
dfdxwide.columns = ['icd_'+col for col in dfdxwide.columns.values]
dfdxwide.reset_index(inplace=True)
dfdxwide.head()

len(dfdxwide)

Unnamed: 0,pat_deid,icd_A04.72,icd_A15.0,icd_A15.8,icd_A15.9,icd_A31.8,icd_A41.9,icd_A49.9,icd_A53.0,icd_A60.00,...,icd_Z96.1,icd_Z97.0,icd_Z98.41,"icd_Z98.41, Z98.42",icd_Z98.42,icd_Z98.49,icd_Z98.83,icd_Z98.89,icd_Z98.890,icd_Z99.2
0,1174,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1790,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2262,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2610,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2736,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


5518

## Meds

In [156]:
#meds 
dfmeds=pd.read_sql_query('''select pat_deid, order_time, medication_id from meds where pat_deid in (select distinct pat_deid
from lowvacohort 
			) 
order by pat_deid''', conn)
dfmeds.columns = map(str.lower, dfmeds.columns)

dfmeds["rx_date"]=pd.to_datetime(dfmeds["order_time"])

del dfmeds["order_time"]

len(dfmeds)
dfmeds.head()

451062

Unnamed: 0,pat_deid,medication_id,rx_date
0,1174,181704,2009-09-17
1,1174,540151,2013-06-03
2,1174,180585,2012-08-06
3,1174,192027,2009-09-17
4,1174,10470,2011-01-14


In [157]:
dfmeds=pd.merge(dfmeds,lowvacohortsqltable[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid", how="right")
dfmeds.sort_values(by=["pat_deid", "rx_date"], ascending=True, inplace=True)


#keep meds ordered before low vision date
dfmeds=dfmeds[(dfmeds["rx_date"]<=dfmeds["lowvadate"])]

dfmeds.head()
len(dfmeds)

Unnamed: 0,pat_deid,medication_id,rx_date,lowvadate
14,1174,186948.0,2009-05-18,2009-09-17
0,1174,181704.0,2009-09-17,2009-09-17
3,1174,192027.0,2009-09-17,2009-09-17
5,1174,177567.0,2009-09-17,2009-09-17
7,1174,174308.0,2009-09-17,2009-09-17


138929

In [158]:
dfmeds["pivotvalue"]=1
dfmeds["medication_id"]=dfmeds["medication_id"].astype(int)
dfmedswide=dfmeds.pivot_table(values="pivotvalue", index='pat_deid', columns='medication_id', fill_value=0)
dfmedswide.columns = ['med_'+str(col) for col in dfmedswide.columns.values]
dfmedswide.reset_index(inplace=True)
dfmedswide.head()
len(dfmedswide)

Unnamed: 0,pat_deid,med_1,med_2,med_51,med_62,med_84,med_85,med_87,med_98,med_100,...,med_542002,med_550003,med_550007,med_550008,med_550009,med_550011,med_550012,med_550013,med_575029,med_590201
0,1174,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1790,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2262,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2610,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2736,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


5132

## Prior Eye Procedures

In [167]:
dfproc=pd.read_sql_query('''select pat_deid, proc_date, code from procedure where pat_deid in (select distinct pat_deid
from lowvacohort) 
order by pat_deid''', conn)
dfproc.columns = map(str.lower, dfproc.columns)

dfproc["proc_date"]=pd.to_datetime(dfproc["proc_date"])

len(dfproc)
dfproc.head()

71690

Unnamed: 0,pat_deid,proc_date,code
0,2262,2015-11-03,66984
1,2262,2015-11-03,66984
2,2736,2016-07-20,67145
3,2736,2016-07-20,67145
4,2736,2017-01-25,67036


In [168]:
dfproc=pd.merge(dfproc,lowvacohortsqltable[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid", how="right")
dfproc.sort_values(by=["pat_deid", "proc_date"], ascending=True, inplace=True)

#keep procedures done before low vision date
dfproc=dfproc[(dfproc["proc_date"]<=dfproc["lowvadate"])]

dfproc.head()
len(dfproc)

Unnamed: 0,pat_deid,proc_date,code,lowvadate
70,4062,2013-02-26,66984,2016-07-13
71,4062,2013-02-26,66984,2016-07-13
68,4062,2016-07-12,68761,2016-07-13
69,4062,2016-07-12,68761,2016-07-13
126,12122,2013-06-19,67028,2013-06-19


10132

In [170]:
dfproc["pivotvalue"]=1
dfprocwide=dfproc.pivot_table(values="pivotvalue", index='pat_deid', columns='code', fill_value=0)
dfprocwide.columns = ['cpt_'+str(col) for col in dfprocwide.columns.values]
dfprocwide.reset_index(inplace=True)
dfprocwide.head()
len(dfprocwide)

Unnamed: 0,pat_deid,cpt_65091,cpt_65093,cpt_65105,cpt_65175,cpt_65205,cpt_65210,cpt_65220,cpt_65222,cpt_65235,...,cpt_68760,cpt_68761,cpt_68801,cpt_68810,cpt_68815,cpt_68840,cpt_0191T,cpt_0192T,cpt_0449T,cpt_0474T
0,4062,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,12122,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13969,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,14354,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,21187,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


1501

## Numeric Eye Exam Fields

### IOP 

In [171]:
dfexam=pd.read_sql_query('''select pat_deid, DATE_OF_SERVICE, tod, tos, tmethod
from examfield, examparsed 
where examfield.smartformid = examparsed.smartformid 
and not (tod is null and tos is null and tmethod is null)
and pat_deid in (select distinct pat_deid
from lowvacohort
			) 
            order by pat_deid 
''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)

dfexam["exam_date"]=pd.to_datetime(dfexam["date_of_service"])
del dfexam["date_of_service"]
len(dfexam)

dfexam=pd.merge(dfexam,lowvacohortsqltable[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid", how="right")
dfexam.sort_values(by=["pat_deid", "exam_date"], ascending=True, inplace=True)
dfexam=dfexam[(dfexam["exam_date"]<=dfexam["lowvadate"])]

len(dfexam)
dfexam.head()

279135

54880

Unnamed: 0,pat_deid,tod,tos,tmethod,exam_date,lowvadate
3,1174,"[""14""]","[""15""]","[""Tonopen""]",2009-09-17,2009-09-17
4,1174,,,,2009-09-17,2009-09-17
5,1174,,,,2009-09-17,2009-09-17
6,1174,,,,2009-09-17,2009-09-17
7,1174,,,,2009-09-17,2009-09-17


In [172]:
dft=dfexam

In [173]:
#we have to think about whether it makes sense to use maxt, or averaget or maybe both? 

### Prior vision 
Probably makes sense to use the best prior vision as a predictor of future good vision

In [195]:
dfexam=pd.read_sql_query('''select pat_deid, DATE_OF_SERVICE, 
vaoddistcc, vaoddistsc, vaoddistccph, vaoddistscph, vaosdistcc, vaosdistsc, vaosdistccph, vaosdistscph 
from examfield, examparsed 
where examfield.smartformid = examparsed.smartformid 
and not (vaoddistcc is null and vaoddistsc is null and vaosdistcc is null and vaosdistsc is null)
and pat_deid in (select distinct pat_deid from lowvacohort)
            order by pat_deid 
''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["date_of_service"])
del dfexam["date_of_service"]
len(dfexam)
dfexam.head()

dfexam=pd.merge(dfexam,lowvacohortsqltable[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid", how="right")
dfexam.sort_values(by=["pat_deid", "exam_date"], ascending=True, inplace=True)
dfexam=dfexam[(dfexam["exam_date"]<=dfexam["lowvadate"])]

len(dfexam)
dfexam.head()

141208

Unnamed: 0,pat_deid,vaoddistcc,vaoddistsc,vaoddistccph,vaoddistscph,vaosdistcc,vaosdistsc,vaosdistccph,vaosdistscph,exam_date
0,1174,20/100,,NI,,20/150,,NI,,2014-03-24
1,1174,20/50,,NI,,20/100,,,,2009-09-17
2,1174,20/50,,NI,,20/100,,20/70-1,,2009-11-04
3,1174,20/40,,,,,,,,2010-10-19
4,1174,,,,,20/100,,,,2010-10-19


26070

Unnamed: 0,pat_deid,vaoddistcc,vaoddistsc,vaoddistccph,vaoddistscph,vaosdistcc,vaosdistsc,vaosdistccph,vaosdistscph,exam_date,lowvadate
1,1174,20/50,,NI,,20/100,,,,2009-09-17,2009-09-17
16,1790,20/50,,,,20/50,,,,2012-02-14,2012-02-14
25,2262,20/40,,NI,,20/40,,20/30,,2012-05-10,2012-05-15
26,2262,20/60,,NI,,20/50,,NI,,2012-05-15,2012-05-15
30,2610,20/100,,NI,,20/70,,NI,,2009-11-19,2009-11-19


In [196]:
dfva=dfexam
dfva["bcvalogmarod"]=dfva[["vaoddistsc", "vaoddistcc", "vaoddistscph", "vaoddistccph"]].apply(lambda x: bcvalogmar(*x), axis=1)
dfva["bcvalogmaros"]=dfva[["vaosdistsc", "vaosdistcc", "vaosdistscph", "vaosdistccph"]].apply(lambda x: bcvalogmar(*x), axis=1)

#there are some cases wehre vision was measured only in one eye. In this case we should do a last values carried forward for the patient 
dfva.sort_values(["pat_deid", "exam_date"], inplace=True)

dfva["bcvalogmarod"]=dfva[["pat_deid","bcvalogmarod"]].groupby(["pat_deid"]).ffill()["bcvalogmarod"]
dfva["bcvalogmaros"]=dfva[["pat_deid","bcvalogmaros"]].groupby(["pat_deid"]).ffill()["bcvalogmaros"]

dfva.head(20)

Unnamed: 0,pat_deid,vaoddistcc,vaoddistsc,vaoddistccph,vaoddistscph,vaosdistcc,vaosdistsc,vaosdistccph,vaosdistscph,exam_date,lowvadate,bcvalogmarod,bcvalogmaros
1,1174,20/50,,NI,,20/100,,,,2009-09-17,2009-09-17,0.39794,0.69897
16,1790,20/50,,,,20/50,,,,2012-02-14,2012-02-14,0.39794,0.39794
25,2262,20/40,,NI,,20/40,,20/30,,2012-05-10,2012-05-15,0.30103,0.176091
26,2262,20/60,,NI,,20/50,,NI,,2012-05-15,2012-05-15,0.477121,0.39794
30,2610,20/100,,NI,,20/70,,NI,,2009-11-19,2009-11-19,0.69897,0.544068
35,2736,,,,,20/50,,NI,,2016-07-11,2016-07-11,,0.39794
36,2736,??Unclear if NLP v LP based upon inconsistent ...,,,,,,,,2016-07-11,2016-07-11,2.886057,0.39794
37,2736,?UNclear if NLP v LP based upno inconsistent a...,,,,20/50-1,,,,2016-07-11,2016-07-11,2.886057,0.39794
81,2920,,CF at 1',,CF at 2',,CF at 0.5',,,2010-10-25,2010-10-25,2.60206,2.60206
121,3178,20/50,,20/40+2,,20/40,,20/30,,2012-10-24,2016-11-11,0.30103,0.176091


In [197]:
dfva["bcvalogmar"]=dfva[["bcvalogmarod", "bcvalogmaros"]].apply(lambda x: bcva(*x), axis=1)

In [198]:
dfva.head(10)

Unnamed: 0,pat_deid,vaoddistcc,vaoddistsc,vaoddistccph,vaoddistscph,vaosdistcc,vaosdistsc,vaosdistccph,vaosdistscph,exam_date,lowvadate,bcvalogmarod,bcvalogmaros,bcvalogmar
1,1174,20/50,,NI,,20/100,,,,2009-09-17,2009-09-17,0.39794,0.69897,0.39794
16,1790,20/50,,,,20/50,,,,2012-02-14,2012-02-14,0.39794,0.39794,0.39794
25,2262,20/40,,NI,,20/40,,20/30,,2012-05-10,2012-05-15,0.30103,0.176091,0.176091
26,2262,20/60,,NI,,20/50,,NI,,2012-05-15,2012-05-15,0.477121,0.39794,0.39794
30,2610,20/100,,NI,,20/70,,NI,,2009-11-19,2009-11-19,0.69897,0.544068,0.544068
35,2736,,,,,20/50,,NI,,2016-07-11,2016-07-11,,0.39794,0.39794
36,2736,??Unclear if NLP v LP based upon inconsistent ...,,,,,,,,2016-07-11,2016-07-11,2.886057,0.39794,0.39794
37,2736,?UNclear if NLP v LP based upno inconsistent a...,,,,20/50-1,,,,2016-07-11,2016-07-11,2.886057,0.39794,0.39794
81,2920,,CF at 1',,CF at 2',,CF at 0.5',,,2010-10-25,2010-10-25,2.60206,2.60206,2.60206
121,3178,20/50,,20/40+2,,20/40,,20/30,,2012-10-24,2016-11-11,0.30103,0.176091,0.176091


### CCT 

In [175]:
#now process CCT's 
dfexam=pd.read_sql_query('''select pat_deid, DATE_OF_SERVICE, cctod, cctos, cctdate 
from examfield, examparsed 
where examfield.smartformid = examparsed.smartformid 
and not (cctod is null and cctos is null and cctdate is null)
and pat_deid in (select distinct pat_deid
lowvacohort) 
            order by pat_deid 
''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["date_of_service"])
del dfexam["date_of_service"]
len(dfexam)
dfexam=pd.merge(dfexam,lowvacohortsqltable[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid", how="right")
dfexam.sort_values(by=["pat_deid", "exam_date"], ascending=True, inplace=True)
dfexam=dfexam[(dfexam["exam_date"]<=dfexam["lowvadate"])]
dfexam["cctdate"]=pd.to_datetime(dfexam["cctdate"])
dfexam=dfexam[(dfexam["cctdate"]<=dfexam["lowvadate"])]

len(dfexam)
dfexam.head()

dfcct=dfexam[["pat_deid", "cctod", "cctos", "cctdate"]]
dfcct=dfcct.drop_duplicates()
dfcct.head()

148938

6625

Unnamed: 0,pat_deid,cctod,cctos,cctdate,exam_date,lowvadate
0,4062,528,499,2016-07-12,2016-07-12,2016-07-13
1,4062,528,499,2016-07-12,2016-07-13,2016-07-13
26,4690,569,573,2017-08-07,2017-08-07,2017-08-07
41,11196,710,597,2015-05-29,2015-05-29,2015-05-29
104,18210,612,575,2015-08-28,2015-08-28,2015-12-15


Unnamed: 0,pat_deid,cctod,cctos,cctdate
0,4062,528,499,2016-07-12
26,4690,569,573,2017-08-07
41,11196,710,597,2015-05-29
104,18210,612,575,2015-08-28
155,48249,654,566,2015-11-19


In [238]:
dfcct["cctod"]=pd.to_numeric(dfcct["cctod"], errors='coerce')
dfcct["cctos"]=pd.to_numeric(dfcct["cctos"], errors='coerce')

In [241]:
dfcct=dfcct[~((dfcct["cctod"].isnull()) & (dfcct["cctos"].isnull()))]

### Refraction

In [176]:
#now process refractions 
dfexam=pd.read_sql_query('''select pat_deid, DATE_OF_SERVICE, wrxodsph, wrxodcyl, wrxossph, wrxoscyl, mrxauto, mrxodsph, mrxodcyl, mrxossph, mrxoscyl, finalrxodsph, finalrxodcyl, finalrxossph, finalrxoscyl
from examfield, examparsed 
where examfield.smartformid = examparsed.smartformid 
and not (wrxodsph is null and wrxodcyl is null and wrxossph is null and wrxoscyl is null and mrxodsph is null and mrxodcyl is null and mrxossph is null and mrxoscyl is null and finalrxodsph is null and finalrxodcyl is null and finalrxossph is null and finalrxoscyl is null)
and pat_deid in (select distinct pat_deid
from lowvacohort) 
            order by pat_deid 
''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["date_of_service"])
del dfexam["date_of_service"]
len(dfexam)

dfexam=pd.merge(dfexam,lowvacohortsqltable[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid", how="right")
dfexam.sort_values(by=["pat_deid", "exam_date"], ascending=True, inplace=True)
dfexam=dfexam[(dfexam["exam_date"]<=dfexam["lowvadate"])]

len(dfexam) 
dfexam.head()

46702

11395

Unnamed: 0,pat_deid,wrxodsph,wrxodcyl,wrxossph,wrxoscyl,mrxauto,mrxodsph,mrxodcyl,mrxossph,mrxoscyl,finalrxodsph,finalrxodcyl,finalrxossph,finalrxoscyl,exam_date,lowvadate
1,1174,,,,,,,1.0,,,,,,,2009-09-17,2009-09-17
2,1174,,,,,,-6.0025,,,,,,,,2009-09-17,2009-09-17
3,1174,,,,,,,,-4.0,1.0,,,,,2009-09-17,2009-09-17
12,1790,,,,,,-0.75,1.25,-0.5,1.25,,,,,2012-02-14,2012-02-14
15,2262,1.25,0.5,1.0,0.25,yes,1.25,0.25,1.5,1.0,,,,,2012-05-10,2012-05-15


In [177]:
dfexam["finalrxodsph"]=pd.to_numeric(dfexam["finalrxodsph"], errors='coerce')
dfexam["finalrxossph"]=pd.to_numeric(dfexam["finalrxossph"], errors='coerce')
dfexam["finalrxodcyl"]=pd.to_numeric(dfexam["finalrxodcyl"], errors='coerce')
dfexam["finalrxoscyl"]=pd.to_numeric(dfexam["finalrxoscyl"], errors='coerce')

dfrx=dfexam
dfrx.head()

Unnamed: 0,pat_deid,wrxodsph,wrxodcyl,wrxossph,wrxoscyl,mrxauto,mrxodsph,mrxodcyl,mrxossph,mrxoscyl,finalrxodsph,finalrxodcyl,finalrxossph,finalrxoscyl,exam_date,lowvadate
1,1174,,,,,,,1.0,,,,,,,2009-09-17,2009-09-17
2,1174,,,,,,-6.0025,,,,,,,,2009-09-17,2009-09-17
3,1174,,,,,,,,-4.0,1.0,,,,,2009-09-17,2009-09-17
12,1790,,,,,,-0.75,1.25,-0.5,1.25,,,,,2012-02-14,2012-02-14
15,2262,1.25,0.5,1.0,0.25,yes,1.25,0.25,1.5,1.0,,,,,2012-05-10,2012-05-15


In [178]:
#calculate spherical equivalent 
def spheqv(sph, cyl): 
    if math.isnan(cyl): 
        return sph 
    if math.isnan(sph): 
        return 0.5*cyl 
    else: 
        sphericalequivalent = sph + 0.5*cyl 
        return sphericalequivalent 

In [179]:
dfrx["wrxodspheqv"]=dfrx[["wrxodsph", "wrxodcyl"]].apply(lambda x: spheqv(*x), axis=1)
dfrx["wrxosspheqv"]=dfrx[["wrxossph", "wrxoscyl"]].apply(lambda x: spheqv(*x), axis=1)
dfrx["mrxodspheqv"]=dfrx[["mrxodsph", "mrxodcyl"]].apply(lambda x: spheqv(*x), axis=1)
dfrx["mrxosspheqv"]=dfrx[["mrxossph", "mrxoscyl"]].apply(lambda x: spheqv(*x), axis=1)
dfrx["finalrxodspheqv"]=dfrx[["finalrxodsph", "wrxodcyl"]].apply(lambda x: spheqv(*x), axis=1)
dfrx["finalrxosspheqv"]=dfrx[["finalrxossph", "wrxoscyl"]].apply(lambda x: spheqv(*x), axis=1)
dfrx.head(20)

Unnamed: 0,pat_deid,wrxodsph,wrxodcyl,wrxossph,wrxoscyl,mrxauto,mrxodsph,mrxodcyl,mrxossph,mrxoscyl,...,finalrxossph,finalrxoscyl,exam_date,lowvadate,wrxodspheqv,wrxosspheqv,mrxodspheqv,mrxosspheqv,finalrxodspheqv,finalrxosspheqv
1,1174,,,,,,,1.0,,,...,,,2009-09-17,2009-09-17,,,0.5,,,
2,1174,,,,,,-6.0025,,,,...,,,2009-09-17,2009-09-17,,,-6.0025,,,
3,1174,,,,,,,,-4.0,1.0,...,,,2009-09-17,2009-09-17,,,,-3.5,,
12,1790,,,,,,-0.75,1.25,-0.5,1.25,...,,,2012-02-14,2012-02-14,,,-0.125,0.125,,
15,2262,1.25,0.5,1.0,0.25,yes,1.25,0.25,1.5,1.0,...,,,2012-05-10,2012-05-15,1.5,1.125,1.375,2.0,0.25,0.125
30,3178,,,,,yes,,,-2.5,4.0,...,,,2012-10-24,2016-11-11,,,,-0.5,,
29,3178,,,,,,,,,,...,-2.0,3.25,2012-11-30,2016-11-11,,,,,-2.25,-2.0
31,3178,,,,,,-3.5,2.75,-2.5,2.5,...,,,2012-11-30,2016-11-11,,,-2.125,-1.25,,
32,3178,-2.25,2.5,-2.0,3.25,yes,,,,,...,,,2013-05-20,2016-11-11,-1.0,-0.375,,,1.25,1.625
33,3178,,,,,,,,-2.75,3.5,...,,,2013-05-20,2016-11-11,,,,-1.0,,


In [180]:
dfrx=dfrx[["pat_deid", "exam_date", "wrxodspheqv", "wrxosspheqv", "mrxodspheqv", "mrxosspheqv", "finalrxodspheqv", "finalrxosspheqv"]]

In [183]:
dfrx.head()
len(dfrx)

Unnamed: 0,pat_deid,exam_date,wrxodspheqv,wrxosspheqv,mrxodspheqv,mrxosspheqv,finalrxodspheqv,finalrxosspheqv
1,1174,2009-09-17,,,0.5,,,
2,1174,2009-09-17,,,-6.0025,,,
3,1174,2009-09-17,,,,-3.5,,
12,1790,2012-02-14,,,-0.125,0.125,,
15,2262,2012-05-10,1.5,1.125,1.375,2.0,0.25,0.125


11395

### CDR

In [235]:
dfexam=pd.read_sql_query('''select pat_deid, DATE_OF_SERVICE, 
sleodll, sleosll, sleodcs, sleoscs, sleodk, sleosk, sleodac, sleosac, sleodiris, sleosiris, sleodlens, sleoslens, sleodvit, sleosvit, feoddisc, feosdisc, feodcdr, feoscdr, feodmac, feosmac, feodvess, feosvess, feodperiph, feosperiph
from examfield, examparsed 
where examfield.smartformid = examparsed.smartformid 
and not (sleodll is null and sleosll is null and sleodcs is null and sleoscs is null and sleodk is null and 
sleosk is null and sleodac is null and sleosac is null and sleodiris is null and sleosiris is null and
sleodlens is null and sleoslens is null and sleodvit is null and sleosvit is null and feoddisc is null and
feosdisc is null and feodcdr is null and feoscdr is null and feodmac is null and feosmac is null and
feodvess is null and feosvess is null and feodperiph is null and feosperiph is null)
and pat_deid in (select distinct pat_deid
from lowvacohort
			) 
            order by pat_deid 
''', conn) 
dfexam.columns = map(str.lower, dfexam.columns)
dfexam["exam_date"]=pd.to_datetime(dfexam["date_of_service"])
del dfexam["date_of_service"]
len(dfexam)

dfexam=pd.merge(dfexam,lowvacohortsqltable[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid", how="right")
dfexam.sort_values(by=["pat_deid", "exam_date"], ascending=True, inplace=True)
dfexam=dfexam[(dfexam["exam_date"]<=dfexam["lowvadate"])]

len(dfexam) 
dfexam.head()

36710

8124

Unnamed: 0,pat_deid,sleodll,sleosll,sleodcs,sleoscs,sleodk,sleosk,sleodac,sleosac,sleodiris,...,feodcdr,feoscdr,feodmac,feosmac,feodvess,feosvess,feodperiph,feosperiph,exam_date,lowvadate
9,2262,1+ Blepharitis,1+ Blepharitis,White and quiet,White and quiet,Pterygium temporal 2.9mm,pterygium Nasl 1.9,Deep and quiet,Deep and quiet,Round and reactive,...,0.3,0.3,Epiretinal membrane,Normal,Normal,Normal,Normal,Normal,2012-05-10,2012-05-15
10,2610,Normal,Normal,White and quiet,White and quiet,Clear,Clear,Deep and quiet,Deep and quiet,Round and reactive,...,,,,,,,,,2009-11-19,2009-11-19
11,2610,,,,,,,,,,...,,,hyperpigmented macular scars,hyperpigmented macular scars,Normal,Normal,,,2009-11-19,2009-11-19
44,3178,Normal,Normal,White and quiet,White and quiet,RK incisions and LASIK flap,RK incisions and LASIK flap,Deep and quiet,Deep and quiet,Round and reactive,...,,,,,,,,,2012-12-11,2016-11-11
43,3178,Normal,Normal,White and quiet,White and quiet,RK scars,RK scars,Deep and quiet,Deep and quiet,Round and reactive,...,0.4,,"see diagram, Drusen",Drusen,Normal,Normal,"Drusen, Focal RPE hypopigmentation near fovea;...","Drusen, More pronounced vitreous condensation;...",2016-11-11,2016-11-11


In [185]:
dfexam["feodcdr"]=pd.to_numeric(dfexam["feodcdr"], errors='coerce')
dfexam["feoscdr"]=pd.to_numeric(dfexam["feoscdr"], errors='coerce')

In [186]:
dfcdr=dfexam[["pat_deid", "exam_date", "feodcdr", "feoscdr"]]

dfcdr.head()

Unnamed: 0,pat_deid,exam_date,feodcdr,feoscdr
9,2262,2012-05-10,0.3,0.3
10,2610,2009-11-19,,
11,2610,2009-11-19,,
44,3178,2012-12-11,,
43,3178,2016-11-11,0.4,


In [192]:
dfcdr=dfcdr[(dfcdr['feodcdr'].notnull()) | (dfcdr['feoscdr'].notnull())]

## Text Exam Fields 

In [236]:
del dfexam["feodcdr"]
del dfexam["feoscdr"]
dfexam.columns

Index(['pat_deid', 'sleodll', 'sleosll', 'sleodcs', 'sleoscs', 'sleodk',
       'sleosk', 'sleodac', 'sleosac', 'sleodiris', 'sleosiris', 'sleodlens',
       'sleoslens', 'sleodvit', 'sleosvit', 'feoddisc', 'feosdisc', 'feodmac',
       'feosmac', 'feodvess', 'feosvess', 'feodperiph', 'feosperiph',
       'exam_date', 'lowvadate'],
      dtype='object')

## Get notes 

In [229]:
dfnotes=pd.read_sql_query('''select pat_deid, note_deid, substr(encounter_date, 0, 10) as enc_date, note, note_desc from notes 
where pat_deid in (select distinct pat_deid
from lowvacohort
			) 
order by pat_deid''', conn)
dfnotes.columns = map(str.lower, dfnotes.columns)

dfnotes["enc_date"]=pd.to_datetime(dfnotes["enc_date"])

dfnotes=pd.merge(dfnotes,lowvacohortsqltable[["pat_deid", "lowvadate"]], left_on="pat_deid", right_on="pat_deid", how="right")
dfnotes.sort_values(by=["pat_deid", "enc_date"], ascending=True, inplace=True)
dfpreopnotes=dfnotes[(dfnotes["enc_date"]<=dfnotes["lowvadate"])]

len(dfpreopnotes) 
dfpreopnotes.head()


67690

Unnamed: 0,pat_deid,note_deid,enc_date,note,note_desc,lowvadate
0,1174,28528746.0,2009-01-12,"January 12, 2009 Peter R Egbert, MD R...",Outpatient Letter,2009-09-17
1,1174,28528751.0,2009-01-12,Department of Ophthalmology California Vitreo...,RTF Letter,2009-09-17
2,1174,58842312.0,2009-09-17,Assessment and Plan: 1. Proliferative diabeti...,Progress Notes,2009-09-17
22,1790,174426007.0,2012-02-14,Retina Service Outpatient Encounter Steven ...,Progress Notes,2012-02-14
23,1790,174435847.0,2012-02-14,Retina Service Outpatient Encounter Steven ...,RTF Letter,2012-02-14


In [8]:
conn.close() 


# Export to new database

In [12]:
conn = sqlite3.connect('lowva\lowva.db')
conn.text_factory = str
cur = conn.cursor()

In [232]:

dfpreopnotes[["pat_deid", "note_deid", "enc_date", "note", "note_desc"]].to_sql('notes', conn, if_exists="replace", index=False)
conn.commit() 



In [201]:
dfpt.to_sql('demographics', conn, if_exists='replace', index=False)
conn.commit()

In [244]:
dfexam.to_sql('examslefe', conn, if_exists='replace', index=False)
conn.commit()

In [245]:
dfcct.to_sql('examcct', conn, if_exists='replace', index=False)
conn.commit()

In [209]:
dfva.to_sql('examva', conn, if_exists='replace', index=False)
dft.to_sql('examiop', conn, if_exists='replace', index=False)
dfrx.to_sql('examrx', conn, if_exists='replace', index=False)
dfcdr.to_sql('examcdr', conn, if_exists='replace', index=False)
conn.commit()

In [210]:
dfprocwide.to_sql('cpt', conn, if_exists="replace", index=False)
conn.commit()

In [219]:
dfproc[["pat_deid", "proc_date", "code", "pivotvalue"]].to_sql('cptlong', conn, if_exists="replace", index=False)

In [215]:
dfmeds[["pat_deid", "medication_id", "rx_date", "pivotvalue"]].to_sql('medslong', conn, if_exists='replace', index=False)

In [208]:
pd.merge(lowvacohortsqltable,outcome[["pat_deid", "outcome"]], on="pat_deid").to_sql('outcome', conn, if_exists='replace', index=False)
conn.commit()

In [13]:
dfdx[["pat_deid", "dx_date", "icd9_list", "icd10_list", "pivotvalue"]].to_sql('dxlong', conn, if_exists='replace', index=False)
conn.commit() 

In [221]:
dfdx.head()

Unnamed: 0,pat_deid,icd10_list,dx_date,lowvadate,pivotvalue
0,1174,H35.379,2009-09-17,2009-09-17,1
3,1174,H43.819,2009-09-17,2009-09-17,1
5,1174,E11.3599,2009-09-17,2009-09-17,1
9,1174,Z96.1,2009-09-17,2009-09-17,1
19,1174,H18.50,2009-09-17,2009-09-17,1


In [14]:
conn.close() 