In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd 
import numpy as np 
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
X_train = pd.read_csv("../data/training_set_features.csv")
y_train = pd.read_csv("../data/training_set_labels.csv")
X_test = pd.read_csv("../data/test_set_features.csv")


## First looks to data

In [3]:
print(X_train.shape)

(26707, 36)


In [4]:
print(X_train.head(3))
print(X_train.describe())
print(y_train.head(3))
print(y_train.describe())

   respondent_id  h1n1_concern  h1n1_knowledge  behavioral_antiviral_meds  \
0              0           1.0             0.0                        0.0   
1              1           3.0             2.0                        0.0   
2              2           1.0             1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0   
1                          0.0                      1.0   
2                          0.0                      0.0   

   behavioral_touch_face  ...             income_poverty  marital_status  \
0                    1.0  ...              Below Poverty     Not Married   
1            

In [5]:
# percentage of h1n1_vaccine and of seasonal_vaccine -> could have been seen also by the mean

print( f"percentage h1n1_vaccine:   {y_train['h1n1_vaccine'].sum()/len(y_train)*100}")
print( f"percentage seasonal_vaccine:   {y_train['seasonal_vaccine'].sum()/len(y_train)*100}")

percentage h1n1_vaccine:   21.24536638334519
percentage seasonal_vaccine:   46.56082674954132


In [6]:
# take a look to the distribution of missing values 
X_train.isna().sum()

respondent_id                      0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

## Preprocessing 

In [7]:
# I need to apply the preprocessing to both X_training and X_test, I create then a sklearn pipeline

In [8]:
# It seems that nans in some categories as emlpoyment industry and employment occuapations may be seen as a category itself ("unemployed")
# I would rather not delete those observations that contains nans and I ll try to input instead a sensible values - (is risky but normally people doesnt want to share bad information)
#There are not too many columns so i will apply the change column by column, i would keep some of them as nan and try to input the values later
def fill_missing_values_with_adhoc_values(df : pd.DataFrame) -> pd.DataFrame:
    df["employment_occupation"] = df["employment_occupation"].fillna("unemployed")
    df["employment_industry"] = df["employment_industry"].fillna("unemployed")
    df["education"] = df["education"].fillna("< 12 Years")
    df["health_insurance"] = df["health_insurance"].fillna(0)
    df["income_poverty"] = df["income_poverty"].fillna("Below Poverty")
    df["employment_status"] = df["employment_status"].fillna("Unemployed")
    return df 

In [9]:
# I m gonna fill the other missing values on the one hot encoded nana columns with KNNImputer, but first I need to convert the columns to one hot encoded 
# if they dont have an order or scale them to integer 

def convert_columns_to_one_hot_encoded(df : pd.DataFrame) -> pd.DataFrame:
    categorical_columns = ["race", "sex","marital_status","rent_or_own","employment_status","hhs_geo_region","census_msa", "employment_industry", "employment_occupation"]
    encoder = OneHotEncoder(sparse_output=False, drop="if_binary")
    one_hot_encoded = encoder.fit_transform(df[categorical_columns])
    df = pd.concat([df, pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))], axis=1)
    df = df.drop(categorical_columns, axis=1)

    return df 

In [10]:
# Lets convert the categorical variables ordered variables to numeric variables 

def map_categorical_ordered_variables(df : pd.DataFrame) -> pd.DataFrame:
    category_mapping = {'Below Poverty': 1, '<= $75,000, Above Poverty': 2, '> $75,000': 3}
    df['income_poverty'] = df['income_poverty'].map(category_mapping)
    
    category_mapping = {'< 12 Years': 1, '12 Years': 2, 'Some College': 3, 'College Graduate': 4}
    df['education'] = df['education'].map(category_mapping)
    
    category_mapping = {'18 - 34 Years': 1, '35 - 44 Years': 2, '45 - 54 Years': 3, '55 - 64 Years' : 4, '65+ Years' : 5 }
    df['age_group'] = df['age_group'].map(category_mapping)
    return df

In [11]:
# Lets imput missing values, i could use both the dataframe togheter for that but they should be big enough to make it robust
def imputing_missing_values(df : pd.DataFrame) -> pd.DataFrame:
    knn_imputer = KNNImputer(n_neighbors=10)
    imputed_data = knn_imputer.fit_transform(df)
    return imputed_data

In [12]:
def drop_useless_columns(df : pd.DataFrame) -> pd.DataFrame:
    return df.drop(columns=["respondent_id"])

In [13]:
preprocessing_pipeline = Pipeline(
    steps=[("fill_known_missing_values", FunctionTransformer(fill_missing_values_with_adhoc_values)),
           ("convert_columns_to_one_hot_encoded", FunctionTransformer(convert_columns_to_one_hot_encoded)),
           ("map_categorical_ordered_variables", FunctionTransformer(map_categorical_ordered_variables)),
           ("drop_useless_columns", FunctionTransformer(drop_useless_columns)),
           ("impute_missing_values", FunctionTransformer(imputing_missing_values)),    
          ]
)

In [14]:
X_train = preprocessing_pipeline.fit_transform(X_train)
X_test = preprocessing_pipeline.fit_transform(X_test)

In [15]:
# X_test is just for the submission, to avoid overfitting I split the training set to train and validation
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

## Start training and get the first predictions

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler

In [17]:
# get the prediction with a logistic regression with imbalanced classes to have a baseline 

In [18]:
logreg_model_h1n1 = LogisticRegression(max_iter = 1000)
logreg_model_h1n1.fit(X_train, y_train[["h1n1_vaccine"]])
y_pred_h1h1 = logreg_model_h1n1.predict(X_validation)
y_pred_h1h1_probs = logreg_model_h1n1.predict_proba(X_validation)[:, 1]


logreg_model_seasonal = LogisticRegression(max_iter = 1000)
logreg_model_seasonal.fit(X_train, y_train[["seasonal_vaccine"]])
y_pred_seasonal = logreg_model_seasonal.predict(X_validation)
y_pred_seasonal_probs = logreg_model_seasonal.predict_proba(X_validation)[:, 1]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [19]:
h1n1_score = roc_auc_score(y_validation["h1n1_vaccine"] ,y_pred_h1h1)
seasonal_score = roc_auc_score(y_validation["seasonal_vaccine"] ,y_pred_seasonal)
print(f"baseline h1n1 score:   {h1n1_score}")
print(f"baseline seasonal score:      {seasonal_score}")

h1n1_score_probs = roc_auc_score(y_validation["h1n1_vaccine"] ,y_pred_h1h1_probs)
seasonal_score_probs = roc_auc_score(y_validation["seasonal_vaccine"] ,y_pred_seasonal_probs)
print(f"baseline h1n1 score with probabilities:   {h1n1_score_probs}")
print(f"baseline seasonal score with probabilities:      {seasonal_score_probs}") 

baseline h1n1 score:   0.7190492398457
baseline seasonal score:      0.7821854173696531
baseline h1n1 score with probabilities:   0.8603786904671862
baseline seasonal score with probabilities:      0.8565199247344104


In [20]:
# try again with undersample data

In [21]:
under_sampler = RandomUnderSampler(random_state=42)
X_resampled_h1n1, y_resampled_h1n1 = under_sampler.fit_resample(X_train, y_train["h1n1_vaccine"])
X_resampled_seasonal, y_resampled_seasonal = under_sampler.fit_resample(X_train, y_train["seasonal_vaccine"])


In [22]:
# When using probabilities it s not stricktly necessary to resample datas 

logreg_model_h1n1 = LogisticRegression(max_iter = 1000)
logreg_model_h1n1.fit(X_resampled_h1n1, y_resampled_h1n1)
y_pred_h1h1 = logreg_model_h1n1.predict(X_validation)
y_pred_h1h1_probs = logreg_model_h1n1.predict_proba(X_validation)[:, 1]

logreg_model_seasonal = LogisticRegression(max_iter = 1000)
logreg_model_seasonal.fit(X_resampled_seasonal, y_resampled_seasonal)
y_pred_seasonal = logreg_model_seasonal.predict(X_validation)
y_pred_seasonal_probs = logreg_model_seasonal.predict_proba(X_validation)[:, 1]

In [23]:
h1n1_score = roc_auc_score(y_validation["h1n1_vaccine"] ,y_pred_h1h1)
seasonal_score = roc_auc_score(y_validation["seasonal_vaccine"] ,y_pred_seasonal)
print(f"baseline h1n1 score:   {h1n1_score}")
print(f"baseline seasonal score:      {seasonal_score}")

baseline h1n1 score:   0.78393485952483
baseline seasonal score:      0.7871126377235955


In [24]:
h1n1_score = roc_auc_score(y_validation["h1n1_vaccine"] ,y_pred_h1h1_probs)
seasonal_score = roc_auc_score(y_validation["seasonal_vaccine"] ,y_pred_seasonal_probs)
print(f"baseline h1n1 score with probabilities:   {h1n1_score}")
print(f"baseline seasonal score with probabilities:      {seasonal_score}")

baseline h1n1 score with probabilities:   0.8597204363428551
baseline seasonal score with probabilities:      0.8563501495447046


## Proceed with more advanced models 

### Xgboost with default params 

In [25]:
import xgboost as xgb
from skopt import BayesSearchCV

In [26]:
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc') # I use AUC as metrix because is the competition metric

xgb_classifier.fit(X_train, y_train[["h1n1_vaccine"]])
xgb_y_pred_h1n1 = xgb_classifier.predict(X_validation)
xgb_y_pred_h1n1_probs = xgb_classifier.predict_proba(X_validation)[:, 1]

xgb_classifier.fit(X_train, y_train[["seasonal_vaccine"]])
xgb_y_pred_seasonal = xgb_classifier.predict(X_validation)
xgb_y_pred_seasonal_probs = xgb_classifier.predict_proba(X_validation)[:, 1]


In [27]:
xgb_h1n1_score = roc_auc_score(y_validation["h1n1_vaccine"] ,xgb_y_pred_h1n1)
xgb_seasonal_score = roc_auc_score(y_validation["seasonal_vaccine"] ,xgb_y_pred_seasonal)
print(f"baseline h1n1 score:   {xgb_h1n1_score}")
print(f"baseline seasonal score:      {xgb_seasonal_score}")

xgb_h1n1_score_probs = roc_auc_score(y_validation["h1n1_vaccine"] ,xgb_y_pred_h1n1_probs)
xgb_seasonal_score_probs = roc_auc_score(y_validation["seasonal_vaccine"] ,xgb_y_pred_seasonal_probs)
print(f"baseline h1n1 score with probabilities:   {xgb_h1n1_score_probs}")
print(f"baseline seasonal score with probabilities:      {xgb_seasonal_score_probs}") 

baseline h1n1 score:   0.7320540133961964
baseline seasonal score:      0.7829836994648907
baseline h1n1 score with probabilities:   0.8590554589079663
baseline seasonal score with probabilities:      0.8583283903773737


In [28]:
# results are slightly better than the baseline, let s optimize the HP for the xgboost 

In [29]:
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc')

param_space = {
    'learning_rate': (0.01, 0.2, 'log-uniform'),
    'n_estimators': (50, 150),
    'max_depth': (3, 7),
    'subsample': (0.8, 1.0),
    'colsample_bytree': (0.8, 1.0),
}

bayes_search = BayesSearchCV(
    estimator=xgb_classifier,
    search_spaces=param_space,
    scoring='roc_auc',
    cv=5,
    n_iter=50,  # Number of iterations (adjust as needed)
    n_jobs=-1
)

np.int = int # to solve some error with numpy version in the skopt library

# Fit the Bayesian search to the data
bayes_search.fit(X_train, y_train[["h1n1_vaccine"]])
best_params_h1n1 = bayes_search.best_params_
print("Best Hyperparameters h1n1:", best_params_h1n1)

bayes_search.fit(X_train, y_train[["seasonal_vaccine"]])
best_params_seasonal = bayes_search.best_params_
print("Best Hyperparameters seasonal:", best_params_seasonal)


Best Hyperparameters h1n1: OrderedDict([('colsample_bytree', 0.8925589692663731), ('learning_rate', 0.05822186589792438), ('max_depth', 5), ('n_estimators', 137), ('subsample', 0.8)])
Best Hyperparameters seasonal: OrderedDict([('colsample_bytree', 0.8), ('learning_rate', 0.08440619119260946), ('max_depth', 4), ('n_estimators', 150), ('subsample', 0.8)])


In [30]:
# I use those params to train and see if the predictions are more accurate

In [31]:
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic',
                                   eval_metric='auc',
                                   colsample_bytree = 0.8,
                                   learning_rate = 0.0654,
                                   max_depth = 4,
                                   n_estimators = 150,
                                   subsample = 0.8                               
                                  ) # I use AUC as metrix because is the competition metric

xgb_classifier.fit(X_train, y_train[["h1n1_vaccine"]])
xgb_y_pred_h1n1_probs = xgb_classifier.predict_proba(X_validation)[:, 1]

xgb_h1n1_score_probs = roc_auc_score(y_validation["h1n1_vaccine"] ,xgb_y_pred_h1n1_probs)
print(f"baseline h1n1 score with probabilities:   {xgb_h1n1_score_probs}")


baseline h1n1 score with probabilities:   0.8733506878787116


In [32]:
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic',
                                   eval_metric='auc',
                                   colsample_bytree = 0.8,
                                   learning_rate = 0.0559,
                                   max_depth = 5,
                                   n_estimators = 150,
                                   subsample = 0.8                               
                                  ) # I use AUC as metrix because is the competition metric

xgb_classifier.fit(X_train, y_train[["seasonal_vaccine"]])
xgb_y_pred_seasonal_probs = xgb_classifier.predict_proba(X_validation)[:, 1]
xgb_seasonal_score_probs = roc_auc_score(y_validation["seasonal_vaccine"] ,xgb_y_pred_seasonal_probs)
print(f"baseline seasonal score with probabilities:      {xgb_seasonal_score_probs}") 



baseline seasonal score with probabilities:      0.8675132563657582


#### The results looks promising and if confirmed on the test set they would bring me to the first place of the partial leaderboard among more than 6000 candidates

### KNeighborsClassifier

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer

#### Instead of Bayesian optimization this time we run 5 fold CV 

In [58]:
knn = KNeighborsClassifier()

param_grid = {
    'n_neighbors': list(range(3, 20)),  
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # try different distances: 1 for Manhattan distance, 2 for Euclidean distance
}

scorer = make_scorer(roc_auc_score)
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring=scorer)
grid_search.fit(X_train, y_train["h1n1_vaccine"])

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters h1n1_vaccine:", best_params)

grid_search.fit(X_train, y_train["seasonal_vaccine"])
best_params = grid_search.best_params_
print("Best Hyperparameters seasonal_vaccine:", best_params)




KeyboardInterrupt



In [61]:
knn = KNeighborsClassifier(n_neighbors=7, p = 1, weights = "distance")
knn.fit(X_train, y_train["h1n1_vaccine"])
knn_h1n1_predictions = knn.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation["h1n1_vaccine"], knn_h1n1_predictions)

0.7769285396129053

In [62]:
knn = KNeighborsClassifier(n_neighbors=19, p = 1, weights = "uniform")
knn.fit(X_train, y_train["seasonal_vaccine"])
knn_seasonal_predictions = knn.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation["seasonal_vaccine"], knn_seasonal_predictions)

0.833150856757864