In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd 
import numpy as np 
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [15]:
# Read the training features, the training labels and the test features from ../data.
X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/training_set_features.csv",
        "../data/training_set_labels.csv",
        "../data/test_set_features.csv",
    )
)
# Display (rows, columns) of the training features as the cell output.
X_train.shape

(26707, 36)

## First look at the data

In [None]:
# Preview the first three rows and the summary statistics of the features,
# followed by the same two views of the labels.
print(
    X_train.head(3),
    X_train.describe(),
    y_train.head(3),
    y_train.describe(),
    sep="\n",
)

In [None]:
# Percentage for each label column: column sum divided by the number of rows, times 100.
# The column mean would have shown the same information.
label_row_count = len(y_train)
h1n1_percentage = y_train['h1n1_vaccine'].sum() / label_row_count * 100
seasonal_percentage = y_train['seasonal_vaccine'].sum() / label_row_count * 100

print( f"percentage h1n1_vaccine:   {h1n1_percentage}")
print( f"percentage seasonal_vaccine:   {seasonal_percentage}")

In [None]:
# Count the missing values in each feature column.
X_train.isnull().sum()

## Preprocessing 

In [22]:
# The same preprocessing has to be applied to both X_train and X_test, so I build an sklearn Pipeline for it.

In [29]:
# NaNs in some columns (e.g. employment industry and employment occupation) can be
# read as a category of their own ("unemployed").
# Rather than dropping the observations that contain NaNs, a sensible value is filled
# in instead. This is risky, but people usually do not want to share bad information.
# Only a few columns are treated this way, so they are handled one by one; the other
# columns keep their NaNs so they can be imputed later.
def fill_missing_values_with_adhoc_values(df : pd.DataFrame) -> pd.DataFrame:
    """Fill NaNs in a fixed set of columns with hand-picked default values.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns ``employment_occupation``,
        ``employment_industry``, ``education``, ``health_insurance``,
        ``income_poverty`` and ``employment_status``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` in which the NaNs of those six columns are replaced by
        the defaults below. All other columns are returned unchanged.

    Raises
    ------
    KeyError
        If one of the six columns is missing from ``df``.
    """
    adhoc_fill_values = {
        "employment_occupation": "unemployed",
        "employment_industry": "unemployed",
        "education": "< 12 Years",
        "health_insurance": 0,
        "income_poverty": "Below Poverty",
        "employment_status": "Unemployed",
    }
    # assign() builds a new frame, so the caller's frame keeps its NaNs and the
    # function can safely be called again on the same raw data.
    return df.assign(
        **{
            column: df[column].fillna(fill_value)
            for column, fill_value in adhoc_fill_values.items()
        }
    )

In [30]:
# The remaining missing values are filled later with KNNImputer. Before that, the
# categorical columns have to become numeric: unordered ones are one-hot encoded
# here, ordered ones are mapped to integers in a separate step.

def convert_columns_to_one_hot_encoded(df : pd.DataFrame) -> pd.DataFrame:
    """Replace the unordered categorical columns by one-hot encoded columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column listed in ``categorical_columns`` below.

    Returns
    -------
    pd.DataFrame
        New frame holding the non-categorical columns of ``df`` followed by the
        one-hot columns, with the same index as ``df``.

    Notes
    -----
    The encoder is fitted on the frame passed in, so the categories (and hence
    the output columns) are derived separately on every call.
    """
    categorical_columns = ["race", "sex","marital_status","rent_or_own","employment_status","hhs_geo_region","census_msa", "employment_industry", "employment_occupation"]
    # drop="if_binary" keeps a single indicator column for two-category features.
    encoder = OneHotEncoder(sparse_output=False, drop="if_binary")
    one_hot_encoded = encoder.fit_transform(df[categorical_columns])
    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and create NaN rows) whenever df is
    # not indexed 0..n-1.
    encoded_frame = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=df.index,
    )
    return pd.concat([df.drop(columns=categorical_columns), encoded_frame], axis=1)

In [31]:
# Convert the ordered categorical variables to numeric codes.

def map_categorical_ordered_variables(df : pd.DataFrame) -> pd.DataFrame:
    """Map the ordered categorical columns to increasing integer codes.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns ``income_poverty``, ``education`` and
        ``age_group`` with the string labels listed below.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with those three columns replaced by their numeric
        codes. Values that are not keys of the corresponding mapping
        (including NaN) become NaN.

    Raises
    ------
    KeyError
        If one of the three columns is missing from ``df``.
    """
    ordinal_mappings = {
        'income_poverty': {'Below Poverty': 1, '<= $75,000, Above Poverty': 2, '> $75,000': 3},
        'education': {'< 12 Years': 1, '12 Years': 2, 'Some College': 3, 'College Graduate': 4},
        'age_group': {'18 - 34 Years': 1, '35 - 44 Years': 2, '45 - 54 Years': 3, '55 - 64 Years' : 4, '65+ Years' : 5 },
    }
    # assign() returns a new frame, so the caller's frame keeps its string labels.
    # Mapping an already-mapped frame in place would turn every value into NaN.
    return df.assign(
        **{
            column: df[column].map(codes)
            for column, codes in ordinal_mappings.items()
        }
    )

In [32]:
# Impute the remaining missing values. Both data sets could be pooled for this, but
# each one should be large enough on its own for the imputation to be robust.
def imputing_missing_values(df : pd.DataFrame) -> pd.DataFrame:
    """Fill the remaining NaNs of ``df`` with a 10-nearest-neighbours imputer.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to impute. KNNImputer is fitted on this same frame.

    Returns
    -------
    pd.DataFrame
        The imputed values, carrying the index of ``df`` and the column names
        reported by the fitted imputer.
    """
    knn_imputer = KNNImputer(n_neighbors=10)
    imputed_data = knn_imputer.fit_transform(df)
    # Return the imputed values: the previous version discarded them and handed
    # back the un-imputed input, which made this step a no-op.
    # Column names come from the fitted imputer rather than from df.columns so
    # they stay in sync with the columns the imputer actually outputs.
    return pd.DataFrame(
        imputed_data,
        columns=knn_imputer.get_feature_names_out(),
        index=df.index,
    )

In [33]:
# Chain the preprocessing functions in the order they must run, wrapping each one
# in a FunctionTransformer so it can act as a Pipeline step.
preprocessing_pipeline = Pipeline(
    steps=[
        (step_name, FunctionTransformer(step_function))
        for step_name, step_function in (
            ("fill_known_missing_values", fill_missing_values_with_adhoc_values),
            ("convert_columns_to_one_hot_encoded", convert_columns_to_one_hot_encoded),
            ("map_categorical_ordered_variables", map_categorical_ordered_variables),
            ("impute_missing_values", imputing_missing_values),
        )
    ]
)

In [34]:
# Fit the preprocessing pipeline on the training features only.
X_train = preprocessing_pipeline.fit_transform(X_train)
# Apply the already-fitted pipeline to the test features with transform(), so the
# pipeline is never re-fitted on test data.
X_test = preprocessing_pipeline.transform(X_test)

In [35]:
# X_test is only used for the submission, so 20% of the training data is held out
# as a validation set to check for overfitting.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

## Start training and get the first predictions