The goal of this notebook is to format the NHANES DataFrame properly so that it can be used to train models that predict missing data for other populations (e.g., waist and education for the Kaiser population).

In [10]:
import os
import numpy as np
import pandas as pd

# Machine-specific locations. Each one can be overridden with an environment
# variable so the notebook can run on another machine without editing this cell;
# when the variable is not set, the original hard-coded path is used.
desktopDir = os.environ.get("MICROSIM_DESKTOP_DIR", "/Users/deligkaris.1/Desktop")
microsimDir = os.environ.get(
    "MICROSIM_DIR",
    "/Users/deligkaris.1/OneDrive - The Ohio State University Wexner Medical Center/MICROSIM/CODE/microsim",
)
dataDir = os.environ.get(
    "MICROSIM_DATA_DIR",
    "/Users/deligkaris.1/OneDrive - The Ohio State University Wexner Medical Center/MICROSIM/NOTEBOOKS/DATA",
)
# Change into the microsim checkout: later cells import the `microsim` package and
# may resolve relative paths against this directory.
os.chdir(microsimDir)

from microsim.population_factory import PopulationFactory

In [11]:
# Obtain the NHANES frame from the microsim population factory and preview its first rows.
nhanesDf = PopulationFactory.get_nhanesDf()
nhanesDf.head(n=5)

Unnamed: 0,name,index,WTINT2YR,a1c,age,antiHypertensiveCount,bmi,diedBy2015,gender,hdl,...,alcoholPerWeek,completedInterview,missingSBP,htn3,htn4,meanSBP3,jnc8,raceEthnicity,smokingStatus,education
0,0,0,102641.406474,5.1,22.0,0.0,23.3,0,1.0,41.0,...,0,1,0,0,0,110.666667,0,3,0,3
1,1,3,127351.373299,4.9,44.0,0.0,23.2,0,2.0,28.0,...,3,1,0,0,0,118.0,0,3,0,4
2,2,8,14391.77847,5.4,21.0,1.0,20.1,0,1.0,43.0,...,3,1,0,0,0,124.666667,0,5,0,3
3,3,11,26960.774346,5.6,43.0,0.0,33.3,0,2.0,73.0,...,3,1,0,0,0,102.0,0,4,2,3
4,4,13,24912.668432,5.0,80.0,1.0,33.9,1,1.0,54.0,...,2,1,0,1,1,97.0,0,3,0,5


In [12]:
# Read the NHANES Stata file directly (a later cell copies its alcoholPerWeek column).
# The path is anchored on microsimDir rather than on the current working directory,
# so this cell does not depend on an earlier os.chdir having been run.
nhanesDf0 = pd.read_stata(os.path.join(microsimDir, "microsim", "data", "fullyImputedDataset.dta"))
nhanesDf0.head()

Unnamed: 0,level_0,index,WTINT2YR,a1c,age,antiHypertensive,bmi,diedBy2015,gender,hdl,...,alcoholPerWeek,completedInterview,missingSBP,htn3,htn4,meanSBP3,jnc8,raceEthnicity,smokingStatus,education
0,0,0,102641.406474,5.1,22.0,0.0,23.3,0,1.0,41.0,...,0.0,1,0,0,0,110.666667,0,3,0,3
1,1,3,127351.373299,4.9,44.0,0.0,23.2,0,2.0,28.0,...,21.0,1,0,0,0,118.0,0,3,0,4
2,2,8,14391.77847,5.4,21.0,1.0,20.1,0,1.0,43.0,...,14.0,1,0,0,0,124.666667,0,5,0,3
3,3,11,26960.774346,5.6,43.0,0.0,33.3,0,2.0,73.0,...,21.0,1,0,0,0,102.0,0,4,2,3
4,4,13,24912.668432,5.0,80.0,1.0,33.9,1,1.0,54.0,...,7.0,1,0,1,1,97.0,0,3,0,5


In [13]:
# Keep the alcohol-per-week values as they appear in the unmodified NHANES file next to
# the categorical column. This leaves open the option of fitting a linear model first and
# deriving the categories afterwards, instead of fitting an ordered logistic regression.
# NOTE(review): the column assignment aligns the two frames on their row index --
# confirm that nhanesDf and nhanesDf0 share the same index.
rawAlcoholPerWeek = nhanesDf0["alcoholPerWeek"]
nhanesDf["alcoholPerWeekCon"] = rawAlcoholPerWeek
alcoholColumns = ["alcoholPerWeek", "alcoholPerWeekCon"]
nhanesDf[alcoholColumns].head()

Unnamed: 0,alcoholPerWeek,alcoholPerWeekCon
0,0,0.0
1,3,21.0
2,3,14.0
3,3,21.0
4,2,7.0


In [14]:
from microsim.risk_factor import StaticRiskFactorsType, DynamicRiskFactorsType
from microsim.treatment import DefaultTreatmentsType
def _columnsPresent(enumType):
    """Return the values of enumType's members that are column names of nhanesDf."""
    return [member.value for member in enumType if member.value in nhanesDf.columns]


# Risk-factor and treatment names that actually exist as columns in the NHANES frame.
srf = _columnsPresent(StaticRiskFactorsType)
drf = _columnsPresent(DynamicRiskFactorsType)
dt = _columnsPresent(DefaultTreatmentsType)

# Cast these columns to int in place; age is not cast here.
for intColumn in ("anyPhysicalActivity", "statin", "antiHypertensiveCount", "gender"):
    nhanesDf[intColumn] = nhanesDf[intColumn].astype('int')

# Display the selected columns together with year and WTINT2YR.
nhanesDf[srf + drf + dt + ["year", "WTINT2YR"]]

Unnamed: 0,raceEthnicity,education,gender,smokingStatus,age,sbp,dbp,a1c,hdl,ldl,...,totChol,bmi,anyPhysicalActivity,waist,alcoholPerWeek,creatinine,statin,antiHypertensiveCount,year,WTINT2YR
0,3,3,1,0,22.0,110.666667,74.666667,5.1,41.0,110.0,...,168.0,23.30,0,81.0,0,0.91,0,0,2011,102641.406474
1,3,4,2,0,44.0,118.000000,60.000000,4.9,28.0,151.0,...,190.0,23.20,1,80.1,3,0.89,0,0,2011,127351.373299
2,5,3,1,0,21.0,124.666667,78.000000,5.4,43.0,73.0,...,132.0,20.10,0,69.6,3,0.87,0,1,2011,14391.778470
3,4,3,2,2,43.0,102.000000,71.333333,5.6,73.0,68.0,...,169.0,33.30,0,120.4,3,0.68,0,0,2011,26960.774346
4,3,5,1,0,80.0,98.000000,62.666667,5.0,54.0,137.0,...,203.0,33.90,0,116.5,2,0.87,0,1,2011,24912.668432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59199,4,3,1,1,64.0,148.000000,42.000000,5.2,34.0,101.0,...,152.0,43.27,0,149.5,2,1.98,0,3,2017,11804.993503
59200,5,3,2,0,70.0,139.333333,73.333333,7.4,60.0,43.0,...,119.0,20.00,1,82.2,2,0.70,0,0,2017,16896.276203
59201,1,3,1,1,42.0,120.666667,75.333333,5.9,49.0,105.0,...,182.0,35.80,1,114.8,3,0.92,0,0,2017,61630.380013
59202,4,5,2,0,41.0,116.000000,70.666667,5.2,54.0,108.0,...,172.0,26.10,1,86.4,0,0.69,0,0,2017,17160.895269


In [15]:
# Export the frame the models will be trained on: the risk-factor and treatment columns
# plus year, WTINT2YR and alcoholPerWeekCon. The row index is written as well.
trainingColumns = srf + drf + dt + ["year", "WTINT2YR", "alcoholPerWeekCon"]
trainingCsvPath = dataDir + "/nhanesDf.csv"
nhanesDf[trainingColumns].to_csv(trainingCsvPath)

In [None]:
# While working on the models I noticed that the NHANES frame has no rows in the
# ONETOSIX alcohol category. The counts below are taken from the NHANES file itself
# to show that this is consistent with the raw data.
rawNhanesPath = microsimDir + "/microsim/data/fullyImputedDataset.dta"
nhanesPop = pd.read_stata(rawNhanesPath)
nhanesPop["alcoholPerWeek"].value_counts()