The goal of this notebook is to create the Person objects and save them as a DataFrame (this step requires the microsim kernel). The next stage needs Python modules that are not part of the microsim kernel, hence the need to split the workflow into separate steps.

At the end of this notebook I also look at the range of each risk factor observed in the NHANES dataframe. These observed ranges help me set the bounds in risk_model_repository.

In [1]:
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np

from microsim.population import NHANESDirectSamplePopulation, get_nhanes_population

# Root of the local MICROSIM folder. When the MICROSIM_DIR environment variable is set it
# takes precedence, so the notebook can run on another machine without editing this cell;
# otherwise the original hard-coded location is used.
microsimDir = os.environ.get(
    "MICROSIM_DIR",
    "/Users/deligkaris.1/OneDrive - The Ohio State University Wexner Medical Center/MICROSIM",
)
# Change the working directory to the microsim code directory.
os.chdir(microsimDir+"/CODE/microsim")

# NHANES survey year used in this notebook.
year = 1999

In [2]:
# Two options were considered:
#   1. sample now and process the large sample from the beginning, or
#   2. process the small NHANES set as is and sample later, when it is needed.
# Option 1 does not make Person objects independent of each other, since they originate
# from the same NHANES df row, so singular covariance matrices are still going to exist;
# it only avoids the work of sampling later, at a much increased computational cost.

# Option 1 (not used; NOTE: popSize is not defined in the cells above)
#%%time
#pop = NHANESDirectSamplePopulation(popSize, 1999)
#pop.advance(5, None, nWorkers=10)

# Option 2 (used). Pass the `year` variable set in the first cell rather than repeating
# the literal 1999, so that changing `year` actually changes the population that is loaded.
pop = get_nhanes_population(year)
pop.advance(1)

In [3]:
# Print a summary table (min, quartiles, max, mean, sd) of the baseline risk factors.
pop.print_baseline_summary()

                                                     min    0.25   med    0.75    max   mean    sd
                                               age   18.0   29.0   45.0   64.0   85.0   46.9   20.5
                                               sbp   72.7  110.7  121.3  136.7  231.3  126.1   21.6
                                               dbp   40.0   64.0   71.0   78.7  132.0   71.5   11.7
                                               a1c    2.5    5.0    5.3    5.6   15.1    5.5    1.1
                                               hdl    8.0   40.0   48.0   60.0  151.0   51.4   15.6
                                               ldl   28.0   97.0  119.0  144.0  354.0  122.3   37.2
                                              trig   12.0   82.0  119.0  180.0  2141.0  147.4  114.7
                                           totChol   72.0  172.0  198.0  228.0  525.0  201.5   43.3
                                               bmi   12.0   23.7   27.0   31.3   67.8   28.1    6.3


In [4]:
# Build a DataFrame from the population via get_all_person_years_as_df().
df = pop.get_all_person_years_as_df()

In [5]:
# Preview the first rows to check the columns and their values.
df.head()

Unnamed: 0,name,raceEthnicity,education,gender,smokingStatus,afib,pvd,age,alcoholPerWeek,hdl,bmi,totChol,trig,a1c,ldl,waist,creatinine,sbp,dbp,anyPhysicalActivity,statin,antiHypertensiveCount
0,41787,3,5,1,0,False,False,77.0,2,54.0,24.9,215.0,128.0,4.7,136.0,98.0,0.7,100.666667,56.666667,0,False,0.0
1,41788,3,5,1,1,False,False,49.0,3,42.0,29.1,279.0,347.0,5.5,168.0,99.9,0.8,122.0,82.666667,1,False,1.0
2,41789,5,4,2,0,False,False,19.0,0,61.0,22.56,153.0,80.0,5.1,71.0,81.6,0.5,114.666667,68.0,1,False,0.0
3,41790,4,2,2,1,False,False,59.0,0,105.0,29.39,245.0,62.0,5.8,127.0,90.7,0.6,125.333333,80.0,1,False,0.0
4,41791,4,3,1,2,False,False,43.0,2,51.0,30.94,140.0,45.0,5.5,80.0,108.0,0.9,145.333333,96.0,1,False,0.0


In [6]:
# Save the DataFrame (without the index column) for the next stage of the workflow.
outputCsvPath = microsimDir + "/NOTEBOOKS/DATA/nhanes-as-gaussians-01.csv"
df.to_csv(outputCsvPath, index=False)

In [None]:
# I used the following cells to set the bounds on the risk factors that apply when a Person object is created through the risk_model_repository

In [7]:
# Column names grouped by the kind of variable they hold.
categoricalVars = [
    "gender",
    "smokingStatus",
    "raceEthnicity",
    "statin",
    "education",
    "alcoholPerWeek",
    "anyPhysicalActivity",
    "antiHypertensiveCount",
]
continuousVars = [
    "age",
    "hdl",
    "bmi",
    "totChol",
    "trig",
    "a1c",
    "ldl",
    "waist",
    "creatinine",
    "sbp",
    "dbp",
]
# Summary statistics of the continuous risk factors only.
continuousDf = df[continuousVars]
continuousDf.describe()

Unnamed: 0,age,hdl,bmi,totChol,trig,a1c,ldl,waist,creatinine,sbp,dbp
count,5448.0,5448.0,5448.0,5448.0,5448.0,5448.0,5448.0,5448.0,5448.0,5448.0,5448.0
mean,47.19989,51.400881,28.067777,201.666116,147.770742,5.524725,122.4163,95.721366,0.761434,126.366495,71.43955
std,20.68048,15.592493,6.265425,43.294339,114.824628,1.082789,37.236461,15.460963,0.548822,21.798294,11.711604
min,18.0,8.0,12.04,72.0,12.0,2.5,28.0,58.5,0.2,72.666667,40.0
25%,29.0,40.0,23.7275,172.0,82.0,5.0,97.0,84.6,0.6,111.333333,64.0
50%,45.0,48.0,26.98,198.0,120.0,5.3,119.0,94.8,0.7,122.0,71.0
75%,65.0,60.0,31.32,228.0,180.0,5.6,144.0,105.4,0.9,137.0,78.666667
max,85.0,151.0,67.83,525.0,2141.0,15.1,354.0,173.4,11.8,266.0,132.0


In [8]:
# Load the fully imputed NHANES dataset that ships with the microsim code.
nhanesDataPath = microsimDir + "/CODE/microsim/microsim/data/fullyImputedDataset.dta"
nhanesDf = pd.read_stata(nhanesDataPath)
# Show floats with three decimals from here on (this is a global pandas display option).
pd.set_option('display.float_format', '{:.3f}'.format)
# Candidate lower bounds: 10% below the minimum observed for each numeric column.
nhanesMin = nhanesDf.describe().loc["min"]
nhanesMin * 0.9

level_0                      0.000
index                        0.000
WTINT2YR                  1086.774
a1c                          1.800
age                         16.200
antiHypertensive             0.000
bmi                         10.836
diedBy2015                   0.000
gender                       0.900
hdl                          5.400
ldl                          8.100
monthsToDeath                0.000
monthsToDeath2               0.000
otherLipidLowering           0.000
selfReportCurrentHtnMed      0.000
selfReportHtn                0.000
selfReportMI                 0.000
selfReportMIAge              0.900
selfReportStroke             0.000
selfReportStrokeAge          0.000
serumCreatinine              0.090
statin                       0.000
timeInUS                     0.900
tot_chol                    53.100
trig                         9.000
ucod                         0.000
urineAlbumin                 0.180
waist                       49.950
weight              

In [9]:
# Candidate upper bounds: 10% above the maximum observed for each numeric column.
# NOTE(review): the selfReportMIAge / selfReportStrokeAge maxima correspond to 99999,
# which looks like a sentinel code rather than a real age -- confirm before using them.
nhanesMax = nhanesDf.describe().loc["max"]
nhanesMax * 1.1

level_0                    65123.300
index                      12141.800
WTINT2YR                  476393.506
a1c                           20.680
age                           93.500
antiHypertensive               7.700
bmi                          143.231
diedBy2015                     1.100
gender                         2.200
hdl                          248.600
ldl                          691.900
monthsToDeath                168.300
monthsToDeath2               167.200
otherLipidLowering             2.200
selfReportCurrentHtnMed        1.100
selfReportHtn                  1.100
selfReportMI                   1.100
selfReportMIAge           109998.900
selfReportStroke               1.100
selfReportStrokeAge       109998.900
serumCreatinine               19.580
statin                         2.200
timeInUS                     108.900
tot_chol                     894.300
trig                        4656.300
ucod                         111.100
urineAlbumin               26884.000
w