# Notes
This is already taking me 2 hours
- Authors did not create requirements.txt and did not write down package versions.
- XGBoost version is 0.90

Features used in this model (NHANES dataset, ENABL AGE-Q):
- Age 
    - (DEMOGRAPH, RIDAGEYR, 80+ is coded as 80)
    - **DEMO_H** 
- Systolic blood pressure 
    - (EXAM, BPXSY1, BPXSY2, BPXSY3, BPXSY4)
    - **BPX_H**
- Arm Circumference 
    - (EXAM, BMIARMC)
    - **BMX_H**
- Ratio of family income to poverty  
    - (DEMO,  INDFMPIR)
    - **DEMO_H**
- General health condition 
    - (Q, HSD010)
    - **HSQ_H**
- Number of months working in the main job 
    - (Q, OCD270)
    - **OCQ_H**
- Sex 
    - (DEMOGRAPHS, RIAGENDR)
    - **DEMO_H**
- Education Level - Adults 20+ 
    - (DEMO, DMDEDUC2/DMDEDUC3)
    - **DEMO_H**
- Require special healthcare equipment 
    - (Q, PFQ033)
    - **PFQ_H**
- Self-reported greatest weight 
    - (Q, WHD140)
    - **WHQ_H**
- Avg # alcoholic drinks/day - past 12 months 
    - (Q, ALQ130)
    - **ALQ_H**
- Smoked at least 100 cigarettes in life 
    - (Q, SMQ020)
    - **SMQ_H**
- Shortness of breath on stairs/inclines 
    - (Q, CDQ010)
    - **CDQ_H**
- Marital Status - Widowd 
    - (DEMO DMDMARTL)
    - **DEMO_H**
- Number of rooms in home 
    - (Q, HOD050)
    - **HOQ_H**
- Diastolic blood pressure 
    - (EXAM, BPXDI1, BPXDI2, BPXDI3, BPXDI4)
    - **BPX_H**
- Self-reported weight-age 25, 
    - (Q, WHD120)
    - **WHQ_H**
- Duration of longest job 
    - (Q, OCD395)
    - **OCQ_H**
- Race - Non-Hispanic White 
    - (DEMO, RIDRETH1/RIDRETH3)
    - **DEMO_H**
- Not a citizen of the US 
    - (DEMO, DMDCITZN)
    - **DEMO_H**

In [1]:
import pickle
import glob

import pandas as pd
import numpy as np


In [2]:
import xgboost

xgboost.__version__

'0.90'

### Convert data

In [3]:
# Convert data
import xport

xpt_paths = glob.glob('data/*.XPT')

for path in xpt_paths:
    with open(path, 'rb') as xpt_file:
        reader = xport.XportReader(xpt_file)
        save_path = path.replace('XPT', 'csv')
        reader.to_csv(save_path, index=False)

### Load model

In [4]:
# Load moel
model_path = 'models/NHANES_All-cause-Ques_Top20_features_model.pickle.dat'
model = pickle.load(open(model_path, 'rb'))

In [5]:
model

XGBRegressor(learning_rate=0.01, max_depth=5, missing=nan, n_estimators=10000,
             objective='survival:cox', subsample=0.5)

### Load data

In [6]:
glob.glob('data/*.csv')

['data\\ALQ_H.csv',
 'data\\BMX_H.csv',
 'data\\BPX_H.csv',
 'data\\CDQ_G.csv',
 'data\\CDQ_H.csv',
 'data\\DEMO_H.csv',
 'data\\HOQ_H.csv',
 'data\\HSQ_H.csv',
 'data\\OCQ_H.csv',
 'data\\PFQ_H.csv',
 'data\\SMQ_H.csv',
 'data\\WHQ_H.csv']

In [7]:
# Load data
alcohol = pd.read_csv('data/ALQ_H.csv', index_col=0)
demographics = pd.read_csv('data/DEMO_H.csv', index_col=0)
body_measures = pd.read_csv('data/BMX_H.csv', index_col=0)
blood_pressure = pd.read_csv('data/BPX_H.csv', index_col=0)
cardiovascular = pd.read_csv('data/CDQ_H.csv', index_col=0)
housing = pd.read_csv('data/HOQ_H.csv', index_col=0)
current_health = pd.read_csv('data/HSQ_H.csv', index_col=0)
occupation = pd.read_csv('data/OCQ_H.csv', index_col=0)
physical = pd.read_csv('data/PFQ_H.csv', index_col=0)
smoking = pd.read_csv('data/SMQ_H.csv', index_col=0)
weight_hist = pd.read_csv('data/WHQ_H.csv', index_col=0)

In [8]:
# Get variables
age = demographics.loc[:, 'RIDAGEYR']
sys_blood = blood_pressure.loc[:, 'BPXSY1']
arm_c = body_measures.loc[:, 'BMIARMC']
ratio_income_poverty = demographics.loc[:, 'INDFMPIR']
general_health = current_health.loc[:, 'HSD010']
months_main_job = occupation.loc[:, 'OCD270']
sex = demographics.loc[:, 'RIAGENDR']
education = demographics.loc[:, 'DMDEDUC2']
spec_equip = physical.loc[:, 'PFQ033']
greatest_weight = weight_hist.loc[:, 'WHD140']
avg_alc = alcohol.loc[:, 'ALQ130']
cigs = smoking.loc[:, 'SMQ020']
breath = cardiovascular.loc[:, 'CDQ010']
married = demographics.loc[:, 'DMDMARTL']
rooms = housing.loc[:, 'HOD050']
dias_blood = blood_pressure.loc[:, 'BPXDI1']
weight_25 = weight_hist.loc[:, 'WHD120']
longest_job = occupation.loc[:, 'OCD395']
race = demographics.loc[:, 'RIDRETH1']
us_citizen = demographics.loc[:, 'DMDCITZN']

In [9]:
# Look for some examples by intersection
samples = pd.concat([
    age, sys_blood, arm_c,
    ratio_income_poverty, general_health,
    months_main_job, sex, education, 
    spec_equip, greatest_weight, avg_alc,
    cigs, breath, married, rooms, dias_blood,
    weight_25, longest_job, race, us_citizen
], axis=1)

samples.head(3)

Unnamed: 0_level_0,RIDAGEYR,BPXSY1,BMIARMC,INDFMPIR,HSD010,OCD270,RIAGENDR,DMDEDUC2,PFQ033,WHD140,ALQ130,SMQ020,CDQ010,DMDMARTL,HOD050,BPXDI1,WHD120,OCD395,RIDRETH1,DMDCITZN
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
73557.0,69.0,122.0,,0.84,2.0,,1.0,3.0,,270.0,1.0,1.0,2.0,4.0,4.0,72.0,200.0,204.0,4.0,1.0
73558.0,54.0,156.0,,1.78,4.0,420.0,1.0,3.0,,250.0,4.0,1.0,2.0,1.0,7.0,62.0,250.0,,3.0,1.0
73559.0,72.0,140.0,,4.51,3.0,,1.0,4.0,,228.0,,1.0,2.0,1.0,6.0,90.0,190.0,216.0,3.0,1.0


In [10]:
samples.dtypes

RIDAGEYR    float64
BPXSY1      float64
BMIARMC     float64
INDFMPIR    float64
HSD010      float64
OCD270      float64
RIAGENDR    float64
DMDEDUC2    float64
PFQ033      float64
WHD140      float64
ALQ130      float64
SMQ020      float64
CDQ010      float64
DMDMARTL    float64
HOD050      float64
BPXDI1      float64
WHD120      float64
OCD395      float64
RIDRETH1    float64
DMDCITZN    float64
dtype: object

In [11]:
samples.isna().sum(axis=1).idxmin()

76119.0

In [12]:
example = samples.loc[76119.0, :]

In [13]:
# Fill with 0 as probably no special equipment needed
example = example.fillna(0)

In [14]:
model.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'importance_type': 'gain',
 'learning_rate': 0.01,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'missing': nan,
 'n_estimators': 10000,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'survival:cox',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 0.5,
 'verbosity': 1}

In [39]:
from xgboost.sklearn import XGBRegressor

In [35]:
type(model)

xgboost.sklearn.XGBRegressor

In [16]:
mapping = {
    "DMDCITZN":"Demographics_Citizenship_2.0", 
    "CDQ010":"Questionnaire_ShortnessOfBreath_2.0", 
    "RIAGENDR":"Demographics_Gender_2.0", 
    "DMDEDUC2":"Demographics_Education", 
    "OCD270":"Questionnaire_JobMonths", 
    "HOD050":"Questionnaire_RoomsInHome", 
    "WHD120":"Questionnaire_SelfReportedWeightAge25", 
    "DMDMARTL":"Demographics_MaritalStatus_2.0", 
    "RIDAGEYR":"Demographics_Age", 
    "PFQ033":"Questionnaire_SpecialHealthCareEquipment_2.0", 
    "BPXSY1":"Examination_BPSystolic2", 
    "INDFMPIR":"Demographics_IncomeRatio", 
    "BPXDI1":"Examination_BPDiastolic3",
    "OCD395":"Questionnaire_LongestJobDuration",
    "ALQ130":"Questionnaire_AlcoholFreqDays",
    "HSD010":"Questionnaire_GeneralHealth", 
    "BMIARMC":"Examination_ArmCircum",
    "RIDRETH1":"Demographics_RaceEthnicity_3.0",
    "WHD140":"Questionnaire_SelfReportedGreatestWeight",
    "SMQ020":"Questionnaire_100Cigarettes_2.0" 
}

In [17]:
example.index = example.index.map(mapping)

In [18]:
example

Demographics_Age                                 49.00
Examination_BPSystolic2                         100.00
Examination_ArmCircum                             1.00
Demographics_IncomeRatio                          0.75
Questionnaire_GeneralHealth                       3.00
Questionnaire_JobMonths                          96.00
Demographics_Gender_2.0                           1.00
Demographics_Education                            4.00
Questionnaire_SpecialHealthCareEquipment_2.0      0.00
Questionnaire_SelfReportedGreatestWeight        215.00
Questionnaire_AlcoholFreqDays                    12.00
Questionnaire_100Cigarettes_2.0                   1.00
Questionnaire_ShortnessOfBreath_2.0               2.00
Demographics_MaritalStatus_2.0                    5.00
Questionnaire_RoomsInHome                         7.00
Examination_BPDiastolic3                         62.00
Questionnaire_SelfReportedWeightAge25           160.00
Questionnaire_LongestJobDuration                240.00
Demographi

In [19]:
example = pd.DataFrame(example).T

In [27]:
order = ['Demographics_Age', 'Examination_BPSystolic2', 'Examination_ArmCircum', 'Demographics_IncomeRatio', 'Questionnaire_GeneralHealth', 'Questionnaire_JobMonths', 'Demographics_Gender_2.0', 'Demographics_Education', 'Questionnaire_SpecialHealthCareEquipment_2.0', 'Questionnaire_SelfReportedGreatestWeight', 'Questionnaire_AlcoholFreqDays', 'Questionnaire_100Cigarettes_2.0', 'Questionnaire_ShortnessOfBreath_2.0', 'Demographics_MaritalStatus_2.0', 'Questionnaire_RoomsInHome', 'Examination_BPDiastolic3', 'Questionnaire_SelfReportedWeightAge25', 'Questionnaire_LongestJobDuration', 'Demographics_RaceEthnicity_3.0', 'Demographics_Citizenship_2.0']
example = example[order]
example

Unnamed: 0,Demographics_Age,Examination_BPSystolic2,Examination_ArmCircum,Demographics_IncomeRatio,Questionnaire_GeneralHealth,Questionnaire_JobMonths,Demographics_Gender_2.0,Demographics_Education,Questionnaire_SpecialHealthCareEquipment_2.0,Questionnaire_SelfReportedGreatestWeight,Questionnaire_AlcoholFreqDays,Questionnaire_100Cigarettes_2.0,Questionnaire_ShortnessOfBreath_2.0,Demographics_MaritalStatus_2.0,Questionnaire_RoomsInHome,Examination_BPDiastolic3,Questionnaire_SelfReportedWeightAge25,Questionnaire_LongestJobDuration,Demographics_RaceEthnicity_3.0,Demographics_Citizenship_2.0
76119.0,49.0,100.0,1.0,0.75,3.0,96.0,1.0,4.0,0.0,215.0,12.0,1.0,2.0,5.0,7.0,62.0,160.0,240.0,4.0,1.0


In [28]:
model.predict(example)

array([1.7734985], dtype=float32)

In [29]:
model.get_shap_age(example)

AttributeError: 'XGBRegressor' object has no attribute 'get_shap_age'

In [31]:
example.loc['test', :] = [50, 100, 38, 0.75, 3, 96, 1, 4, 0, 215, 12, 1, 2, 5, 7, 60, 160, 240, 4, 1]

In [32]:
model.predict(example)

array([1.7734985 , 0.55312353], dtype=float32)

In [33]:
example

Unnamed: 0,Demographics_Age,Examination_BPSystolic2,Examination_ArmCircum,Demographics_IncomeRatio,Questionnaire_GeneralHealth,Questionnaire_JobMonths,Demographics_Gender_2.0,Demographics_Education,Questionnaire_SpecialHealthCareEquipment_2.0,Questionnaire_SelfReportedGreatestWeight,Questionnaire_AlcoholFreqDays,Questionnaire_100Cigarettes_2.0,Questionnaire_ShortnessOfBreath_2.0,Demographics_MaritalStatus_2.0,Questionnaire_RoomsInHome,Examination_BPDiastolic3,Questionnaire_SelfReportedWeightAge25,Questionnaire_LongestJobDuration,Demographics_RaceEthnicity_3.0,Demographics_Citizenship_2.0
76119.0,49.0,100.0,1.0,0.75,3.0,96.0,1.0,4.0,0.0,215.0,12.0,1.0,2.0,5.0,7.0,62.0,160.0,240.0,4.0,1.0
test,50.0,100.0,38.0,0.75,3.0,96.0,1.0,4.0,0.0,215.0,12.0,1.0,2.0,5.0,7.0,60.0,160.0,240.0,4.0,1.0


In [34]:
model.

XGBRegressor(learning_rate=0.01, max_depth=5, missing=nan, n_estimators=10000,
             objective='survival:cox', subsample=0.5)