In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd 
import numpy as np 
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [45]:
# Read the training features, the training labels and the submission features
# from their CSV files (paths are relative to the notebook's working directory).
X_train, y_train, X_test = map(
    pd.read_csv,
    (
        "../data/training_set_features.csv",
        "../data/training_set_labels.csv",
        "../data/test_set_features.csv",
    ),
)


## First look at the data

In [None]:
# Show the (rows, columns) shape of the training features.
print(X_train.shape)

In [None]:
# For the features and then the labels: print the first three rows followed by
# the summary statistics.
for frame in (X_train, y_train):
    print(frame.head(3))
    print(frame.describe())

In [None]:
# Sum of each label column as a percentage of the number of rows
# (the same information is visible in the column means printed by describe()).
n_respondents = len(y_train)
h1n1_percentage = y_train['h1n1_vaccine'].sum() / n_respondents * 100
seasonal_percentage = y_train['seasonal_vaccine'].sum() / n_respondents * 100

print( f"percentage h1n1_vaccine:   {h1n1_percentage}")
print( f"percentage seasonal_vaccine:   {seasonal_percentage}")

In [None]:
# Count the missing values in each column of the training features.
X_train.isna().sum()

## Preprocessing 

In [22]:
# The same preprocessing has to be applied to both X_train and X_test, so I build a sklearn pipeline for it.

In [46]:
# NaNs in some columns, such as employment industry and employment occupation, may be seen as a
# category of their own ("unemployed").
# Rather than deleting the observations that contain NaNs, I fill in plausible values instead
# (this is risky, but people usually do not want to share unfavourable information).
# There are only a few such columns, so the fill is applied column by column; the remaining
# columns keep their NaNs and are imputed later.
def fill_missing_values_with_adhoc_values(df : pd.DataFrame) -> pd.DataFrame:
    """Fill NaNs in selected columns with fixed, hand-picked values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns "employment_occupation",
        "employment_industry", "education", "health_insurance",
        "income_poverty" and "employment_status".

    Returns
    -------
    pd.DataFrame
        A copy of `df` with the NaNs of those columns replaced. The input
        frame is left untouched, so the raw data stays available to the caller.

    Raises
    ------
    KeyError
        If one of the listed columns is missing from `df`.
    """
    adhoc_fill_values = {
        "employment_occupation": "unemployed",
        "employment_industry": "unemployed",
        "education": "< 12 Years",
        "health_insurance": 0,
        "income_poverty": "Below Poverty",
        "employment_status": "Unemployed",
    }
    # Work on a copy: assigning into `df` directly would silently modify the
    # caller's frame.
    filled = df.copy()
    for column, fill_value in adhoc_fill_values.items():
        filled[column] = filled[column].fillna(fill_value)
    return filled

In [47]:
# The remaining missing values are filled later with KNNImputer, which needs numeric input:
# unordered categorical columns are one-hot encoded here, ordered ones are mapped to integers
# in a separate step.

def convert_columns_to_one_hot_encoded(df : pd.DataFrame) -> pd.DataFrame:
    """Replace the unordered categorical columns by their one-hot encoding.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column listed in `categorical_columns` below.

    Returns
    -------
    pd.DataFrame
        A new frame holding the non-categorical columns of `df` followed by
        the one-hot columns, indexed like `df`.

    Notes
    -----
    A new encoder is fitted on every call, so the produced columns depend on
    the categories present in `df`.
    TODO(review): confirm that the train and test frames yield the same
    columns, or fit a single encoder on the training data and reuse it.
    """
    categorical_columns = ["race", "sex","marital_status","rent_or_own","employment_status","hhs_geo_region","census_msa", "employment_industry", "employment_occupation"]
    encoder = OneHotEncoder(sparse_output=False, drop="if_binary")
    one_hot_encoded = pd.DataFrame(
        encoder.fit_transform(df[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        # Reuse the input index: pd.concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows whenever `df` is not 0..n-1 indexed.
        index=df.index,
    )
    remaining_columns = df.drop(columns=categorical_columns)
    return pd.concat([remaining_columns, one_hot_encoded], axis=1)

In [48]:
# Convert the ordered categorical variables to numeric codes.

def map_categorical_ordered_variables(df : pd.DataFrame) -> pd.DataFrame:
    """Map the ordered categories of three columns to increasing integer codes.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns "income_poverty", "education" and
        "age_group" with their original string labels.

    Returns
    -------
    pd.DataFrame
        A copy of `df` with those three columns replaced by their codes.
        Values that are not keys of the corresponding mapping (including NaN)
        become NaN. The input frame is left untouched.

    Raises
    ------
    KeyError
        If one of the three columns is missing from `df`.
    """
    ordinal_codes = {
        'income_poverty': {'Below Poverty': 1, '<= $75,000, Above Poverty': 2, '> $75,000': 3},
        'education': {'< 12 Years': 1, '12 Years': 2, 'Some College': 3, 'College Graduate': 4},
        'age_group': {'18 - 34 Years': 1, '35 - 44 Years': 2, '45 - 54 Years': 3, '55 - 64 Years' : 4, '65+ Years' : 5 },
    }
    # Work on a copy: mapping in place would overwrite the caller's labels, and
    # a second call on the same frame would then turn every value into NaN.
    mapped = df.copy()
    for column, codes in ordinal_codes.items():
        mapped[column] = mapped[column].map(codes)
    return mapped

In [49]:
# Impute the remaining missing values. Both data frames could be imputed together, but each of
# them should be large enough on its own for the imputation to be robust.
def imputing_missing_values(df : pd.DataFrame) -> np.ndarray:
    """Fill the remaining NaNs of `df` with a 10-nearest-neighbours imputer.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to impute; the imputer is fitted on this same frame.
        Assumes all columns are numeric at this point - TODO confirm.

    Returns
    -------
    np.ndarray
        The imputed values as returned by `KNNImputer.fit_transform`. With
        scikit-learn's default output configuration this is a plain array,
        so column names and index are not preserved.
    """
    return KNNImputer(n_neighbors=10).fit_transform(df)

In [50]:
def drop_useless_columns(df : pd.DataFrame) -> pd.DataFrame:
    """Return a new frame equal to `df` without its "respondent_id" column.

    Raises KeyError if `df` has no "respondent_id" column.
    """
    without_identifier = df.drop("respondent_id", axis=1)
    return without_identifier

In [51]:
# Preprocessing functions in the order they are applied; each one is wrapped in a
# FunctionTransformer so that it can be used as a pipeline step.
preprocessing_steps = [
    ("fill_known_missing_values", fill_missing_values_with_adhoc_values),
    ("convert_columns_to_one_hot_encoded", convert_columns_to_one_hot_encoded),
    ("map_categorical_ordered_variables", map_categorical_ordered_variables),
    ("drop_useless_columns", drop_useless_columns),
    ("impute_missing_values", imputing_missing_values),
]

preprocessing_pipeline = Pipeline(
    steps=[
        (step_name, FunctionTransformer(step_function))
        for step_name, step_function in preprocessing_steps
    ]
)

In [52]:
# Fit the pipeline on the training features only, then apply it to the test features.
# NOTE: this cell overwrites the raw frames, so re-running it requires reloading the data first.
X_train = preprocessing_pipeline.fit_transform(X_train)
# Use transform (not fit_transform) so that nothing is fitted on the test set at pipeline level.
# TODO(review): the one-hot encoding and KNN imputation steps still fit their own estimators
# inside the wrapped functions on whatever frame they receive - confirm that train and test
# end up with the same columns.
X_test = preprocessing_pipeline.transform(X_test)

In [53]:
# X_test is only used for the submission; to detect overfitting, 20% of the training set is held
# out as a validation set (fixed random_state for reproducibility).
# NOTE: X_train and y_train are overwritten with the training part of the split.
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

## Start training and get the first predictions

In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler

In [55]:
# Get predictions from a logistic regression trained on the imbalanced classes, to have a baseline.

In [102]:
# Baseline for the h1n1_vaccine target: fit on the training split, then predict hard labels and
# the second column of predict_proba (probability of class 1 for 0/1 labels) on the validation split.
logreg_model_h1n1 = LogisticRegression(max_iter = 1000)
# The target is passed as a 1-D Series: a one-column DataFrame makes scikit-learn emit a
# DataConversionWarning ("y = column_or_1d(y, warn=True)").
logreg_model_h1n1.fit(X_train, y_train["h1n1_vaccine"])
y_pred_h1h1 = logreg_model_h1n1.predict(X_validation)
y_pred_h1h1_probs = logreg_model_h1n1.predict_proba(X_validation)[:, 1]


# Same baseline for the seasonal_vaccine target.
logreg_model_seasonal = LogisticRegression(max_iter = 1000)
logreg_model_seasonal.fit(X_train, y_train["seasonal_vaccine"])
y_pred_seasonal = logreg_model_seasonal.predict(X_validation)
y_pred_seasonal_probs = logreg_model_seasonal.predict_proba(X_validation)[:, 1]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [103]:
# ROC AUC of the baseline models computed from the hard 0/1 predictions.
h1n1_score = roc_auc_score(
    y_true=y_validation["h1n1_vaccine"],
    y_score=y_pred_h1h1,
)
seasonal_score = roc_auc_score(
    y_true=y_validation["seasonal_vaccine"],
    y_score=y_pred_seasonal,
)
print(f"baseline h1n1 score:   {h1n1_score}")
print(f"baseline seasonal score:      {seasonal_score}")

baseline h1n1 score:   0.7190492398457
baseline seasonal score:      0.7821854173696531


In [104]:
# ROC AUC of the baseline models computed from the predicted probabilities.
h1n1_score_probs = roc_auc_score(
    y_true=y_validation["h1n1_vaccine"],
    y_score=y_pred_h1h1_probs,
)
seasonal_score_probs = roc_auc_score(
    y_true=y_validation["seasonal_vaccine"],
    y_score=y_pred_seasonal_probs,
)
print(f"baseline h1n1 score with probabilities:   {h1n1_score_probs}")
print(f"baseline seasonal score with probabilities:      {seasonal_score_probs}") 

baseline h1n1 score with probabilities:   0.8603786904671862
baseline seasonal score with probabilities:      0.8565199247344104


In [63]:
# Try again with undersampled data.

In [105]:
# Resample the training split once per target with the same seeded random undersampler.
under_sampler = RandomUnderSampler(random_state=42)
X_resampled_h1n1, y_resampled_h1n1 = under_sampler.fit_resample(
    X=X_train,
    y=y_train["h1n1_vaccine"],
)
X_resampled_seasonal, y_resampled_seasonal = under_sampler.fit_resample(
    X=X_train,
    y=y_train["seasonal_vaccine"],
)


In [106]:
# When scoring with probabilities it is not strictly necessary to resample the data.

def _fit_logreg_and_predict(X_fit, y_fit, X_eval):
    """Fit a new LogisticRegression(max_iter=1000) on (X_fit, y_fit) and predict on X_eval.

    Returns a tuple (fitted model, hard predictions, second column of predict_proba).
    """
    model = LogisticRegression(max_iter = 1000)
    model.fit(X_fit, y_fit)
    return model, model.predict(X_eval), model.predict_proba(X_eval)[:, 1]


# These assignments replace the models and predictions produced by the earlier baseline cell.
logreg_model_h1n1, y_pred_h1h1, y_pred_h1h1_probs = _fit_logreg_and_predict(
    X_resampled_h1n1, y_resampled_h1n1, X_validation
)

logreg_model_seasonal, y_pred_seasonal, y_pred_seasonal_probs = _fit_logreg_and_predict(
    X_resampled_seasonal, y_resampled_seasonal, X_validation
)

In [107]:
# ROC AUC of the current predictions (models fitted on the undersampled data), computed from
# the hard 0/1 predictions.
h1n1_score = roc_auc_score(
    y_true=y_validation["h1n1_vaccine"],
    y_score=y_pred_h1h1,
)
seasonal_score = roc_auc_score(
    y_true=y_validation["seasonal_vaccine"],
    y_score=y_pred_seasonal,
)
print(f"baseline h1n1 score:   {h1n1_score}")
print(f"baseline seasonal score:      {seasonal_score}")

baseline h1n1 score:   0.78393485952483
baseline seasonal score:      0.7871126377235955


In [108]:
# ROC AUC of the current predictions (models fitted on the undersampled data), computed from
# the predicted probabilities. Note that h1n1_score and seasonal_score are reused here and now
# hold the probability-based scores.
h1n1_score = roc_auc_score(
    y_true=y_validation["h1n1_vaccine"],
    y_score=y_pred_h1h1_probs,
)
seasonal_score = roc_auc_score(
    y_true=y_validation["seasonal_vaccine"],
    y_score=y_pred_seasonal_probs,
)
print(f"baseline h1n1 score with probabilities:   {h1n1_score}")
print(f"baseline seasonal score with probabilities:      {seasonal_score}")

baseline h1n1 score with probabilities:   0.8597204363428551
baseline seasonal score with probabilities:      0.8563501495447046


## Proceed with more advanced models 