In [2]:
!pip install dalex

Collecting dalex
  Downloading dalex-1.7.0.tar.gz (1.0 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.0 MB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m0.8/1.0 MB[0m [31m11.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: dalex
  Building wheel for dalex (setup.py) ... [?25l[?25hdone
  Created wheel for dalex: filename=dalex-1.7.0-py3-none-any.whl size=1042471 sha256=5ed2ddbdc6c211687935886fd735f1d78c227862da5cf5a018b9d4475f03ef76
  Stored in directory: /root/.cache/pip/wheels/e2/38/c1/25a95206a4873a287d776fc8e77aa7d93971acc643ecb3db38
Successfully bu

In [26]:
# Import required libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import dalex as dx
import joblib

In [3]:
# Load datasets
features = pd.read_csv('/content/drive/MyDrive/training_set_features.csv')
labels = pd.read_csv('/content/drive/MyDrive/training_set_labels.csv')

# Data Preparation
X = features.drop(['respondent_id'], axis=1)
# Transform labels to a single binary class using logical OR
labels['combined_vaccine'] = labels.apply(lambda row: 1 if row['h1n1_vaccine'] == 1 or row['seasonal_vaccine'] == 1 else 0, axis=1)
y = labels['combined_vaccine']

In [5]:
y

0        0
1        1
2        0
3        1
4        0
        ..
26702    0
26703    0
26704    1
26705    0
26706    0
Name: combined_vaccine, Length: 26707, dtype: int64

In [7]:
X.describe()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
count,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,24547.0,...,25903.0,14433.0,26316.0,26319.0,26312.0,26245.0,26193.0,26170.0,26458.0,26458.0
mean,1.618486,1.262532,0.048844,0.725612,0.068982,0.825614,0.35864,0.337315,0.677264,0.220312,...,0.111918,0.87972,3.850623,2.342566,2.35767,4.025986,2.719162,2.118112,0.886499,0.534583
std,0.910311,0.618149,0.215545,0.446214,0.253429,0.379448,0.47961,0.472802,0.467531,0.414466,...,0.315271,0.3253,1.007436,1.285539,1.362766,1.086565,1.385055,1.33295,0.753422,0.928173
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0
50%,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0
75%,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0
max,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0


In [15]:
# Here, we transform the features

# Ensure missing values are filled with a placeholder
X['age_group'].fillna('Unknown', inplace=True)
X['education'].fillna('Unknown', inplace=True)
X['income_poverty'].fillna('Unknown', inplace=True)

# Create mappings
age_group_mapping = {value: idx for idx, value in enumerate(sorted(X['age_group'].unique()))}

education_order = ['Unknown', '< 12 Years', '12 Years', 'Some College', 'College Graduate']
education_mapping = {value: idx for idx, value in enumerate(education_order)}

income_poverty_order = ['Unknown', 'Below Poverty', '<= $75,000, Above Poverty', '> $75,000']
income_poverty_mapping = {value: idx for idx, value in enumerate(income_poverty_order)}

display(age_group_mapping)
display(education_mapping)
display(income_poverty_mapping)

# Apply mappings to the DataFrame
X['age_group_mapped'] = X['age_group'].map(age_group_mapping)
X['education_mapped'] = X['education'].map(education_mapping)
X['income_poverty_mapped'] = X['income_poverty'].map(income_poverty_mapping)

# Drop original categorical columns
X = X.drop(columns=['age_group', 'education', 'income_poverty'])

{'18 - 34 Years': 0,
 '35 - 44 Years': 1,
 '45 - 54 Years': 2,
 '55 - 64 Years': 3,
 '65+ Years': 4}

{'Unknown': 0,
 '< 12 Years': 1,
 '12 Years': 2,
 'Some College': 3,
 'College Graduate': 4}

{'Unknown': 0,
 'Below Poverty': 1,
 '<= $75,000, Above Poverty': 2,
 '> $75,000': 3}

In [34]:
# Define columns for OneHotEncoding
onehot_columns = [
    'race', 'sex', 'marital_status', 'rent_or_own', 'employment_status',
    'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation'
]

# Define a ColumnTransformer to apply OneHotEncoding
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_columns)
    ],
    remainder='passthrough'
)

# Fit and transform the preprocessor on the training data
X_transformed = preprocessor.fit_transform(X)

# Extract the feature names after one-hot encoding
onehot_feature_names = preprocessor.named_transformers_['onehot'].get_feature_names_out(onehot_columns)

# Create a list of all feature names
manual_encoded_features = ['age_group_mapped', 'education_mapped', 'income_poverty_mapped']
passthrough_features = [col for col in X.columns if col not in onehot_columns + manual_encoded_features]
all_feature_names = list(onehot_feature_names) + manual_encoded_features + passthrough_features

# Convert the transformed features back to a DataFrame
X_transformed = pd.DataFrame(X_transformed, columns=all_feature_names)
X_transformed

# Instantiate SimpleImputer with strategy='most_frequent' (or another appropriate strategy)
imputer = SimpleImputer(strategy='most_frequent')

# Fit and transform the imputer on X_transformed
X_imputed = imputer.fit_transform(X_transformed)

# Convert back to DataFrame
X_imputed = pd.DataFrame(X_imputed, columns=X_transformed.columns)
X_imputed

Unnamed: 0,race_Black,race_Hispanic,race_Other or Multiple,race_White,sex_Female,sex_Male,marital_status_Married,marital_status_Not Married,marital_status_nan,rent_or_own_Own,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,2.0,2.0,1.0,2.0,0.0,0.0,3.0,1.0,1.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,4.0,4.0,4.0,2.0,4.0,0.0,0.0,1.0,2.0,1.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,4.0,1.0,2.0,2.0,0.0,0.0,4.0,2.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,3.0,5.0,5.0,4.0,1.0,0.0,0.0,4.0,2.0,1.0
4,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,3.0,2.0,3.0,1.0,4.0,1.0,0.0,2.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,5.0,2.0,2.0,0.0,0.0,4.0,3.0,2.0
26703,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,2.0,2.0,5.0,1.0,1.0,1.0,0.0,0.0,4.0,2.0
26704,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,4.0,2.0,5.0,4.0,2.0,0.0,0.0,3.0,3.0,0.0
26705,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,2.0,2.0,1.0,2.0,1.0,0.0,0.0,3.0,2.0


In [28]:
# Initialize models
rf_pipeline = create_pipeline(RandomForestClassifier(random_state=42))
xgb_pipeline = create_pipeline(XGBClassifier(random_state=42))
lr_pipeline = create_pipeline(LogisticRegression(random_state=42))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
df.columns[df.isna().any()]

In [31]:
# Train models
rf_pipeline.fit(X_train, y_train)
xgb_pipeline.fit(X_train, y_train)
lr_pipeline.fit(X_train, y_train)

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [33]:
X_transformed.columns[X_transformed.isna().any()]

Index(['age_group_mapped', 'education_mapped', 'income_poverty_mapped',
       'h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk'],
      dtype='object')

In [37]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Initialize models
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

# Train models
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)

In [38]:
# Evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return auc, f1, cm

# Evaluations
eval_rf = evaluate_model(rf_model, X_test, y_test)
eval_xgb = evaluate_model(xgb_model, X_test, y_test)
eval_lr = evaluate_model(lr_model, X_test, y_test)

# Print evaluation results
print("Combined Vaccine Prediction - Random Forest: ", eval_rf)
print("Combined Vaccine Prediction - XGBoost: ", eval_xgb)
print("Combined Vaccine Prediction - Logistic Regression: ", eval_lr)

Combined Vaccine Prediction - Random Forest:  (0.8521382329379755, 0.7795320525014267, array([[2134,  551],
       [ 608, 2049]]))
Combined Vaccine Prediction - XGBoost:  (0.8480781940680218, 0.7720364741641337, array([[2110,  575],
       [ 625, 2032]]))
Combined Vaccine Prediction - Logistic Regression:  (0.8430350523440768, 0.7695826186392224, array([[2114,  571],
       [ 638, 2019]]))


In [40]:
# DALEX Interpretability
explainer_rf = dx.Explainer(rf_model, X_test, y_test, label="Random Forest - Combined")
explainer_xgb = dx.Explainer(xgb_model, X_test, y_test, label="XGBoost - Combined")
explainer_lr = dx.Explainer(lr_model, X_test, y_test, label="Logistic Regression - Combined")

Preparation of a new explainer is initiated

  -> data              : 5342 rows 101 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 5342 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : Random Forest - Combined
  -> predict function  : <function yhat_proba_default at 0x7fbd75e8cca0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0, mean = 0.499, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.98, mean = -0.00146, max = 0.97
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 5342 rows 101 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Convert

In [47]:
# Save the models using joblib
joblib.dump(rf_model, 'rf_combined_model-1.pkl')
joblib.dump(xgb_model, 'xgb_combined_model.pkl')
joblib.dump(lr_model, 'lr_combined_model.pkl')

['lr_combined_model.pkl']

In [46]:
# Predict labels using each model
rf_predictions = rf_model.predict(X_imputed)
xgb_predictions = xgb_model.predict(X_imputed)
lr_predictions = lr_model.predict(X_imputed)

# Create DataFrames for predictions
rf_df = pd.DataFrame({'rf_predictions': rf_predictions})
xgb_df = pd.DataFrame({'xgb_predictions': xgb_predictions})
lr_df = pd.DataFrame({'lr_predictions': lr_predictions})

# Save predictions to CSV files
rf_df.to_csv('rf_predictions.csv', index=False)
xgb_df.to_csv('xgb_predictions.csv', index=False)
lr_df.to_csv('lr_predictions.csv', index=False)