Importing required libraries

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Loading Datasets

In [52]:
# Read the four competition CSV files from the working directory, in the same
# order as the names on the left-hand side.
train_features, train_labels, test_features, submission_format = [
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
]

In [32]:
# Separate model inputs (X) from targets (y). respondent_id is an identifier,
# not a feature, so it is excluded from both.
#
# Labels are looked up by respondent_id rather than paired with features by
# row position, so a differently ordered labels file cannot silently attach
# the wrong targets to a respondent.
labels_by_id = train_labels.set_index('respondent_id')
if not labels_by_id.index.is_unique:
    raise ValueError('training_set_labels.csv contains duplicate respondent_id values')

X = train_features.drop(columns=['respondent_id'])
# .loc with a list of ids raises KeyError if any training respondent has no
# label row; reset_index restores a positional index for y.
y = labels_by_id.loc[train_features['respondent_id'].to_numpy()].reset_index(drop=True)

Selecting numeric and categorical feature columns for preprocessing

In [33]:
# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object-dtype columns as categorical.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object']

numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(include=categorical_dtypes).columns

Pipeline creation

In [34]:
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal 'missing', then
# one-hot encode. Categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own transformer.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes)

# One random forest per target column, all created with the same fixed seed.
forest = RandomForestClassifier(random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(forest)),
])

Splitting dataset

In [35]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Fit the model

In [36]:
# Fit the preprocessing steps and the multi-output classifier on the training split.
model.fit(X=X_train, y=y_train)

Validating the model on the hold-out validation set

In [38]:
# predict_proba yields one probability array per target. Keep column index 1
# of each (presumably the positive class -- confirm against classes_) and
# place the columns side by side, one per target.
val_proba_by_label = model.predict_proba(X_val)
positive_columns = []
for label_proba in val_proba_by_label:
    positive_columns.append(label_proba[:, 1])
y_val_pred = np.column_stack(positive_columns)

Evaluate the model using ROC AUC score

In [39]:
# Score each target separately, then average the two AUCs.
#
# The prediction matrix has one column per label, in the column order of the
# label frame the model was fitted on. Each label's column is therefore looked
# up by name instead of assuming 'xyz_vaccine' is column 0 and
# 'seasonal_vaccine' is column 1.
val_label_position = {label: position for position, label in enumerate(y_val.columns)}

roc_auc = roc_auc_score(
    y_val['xyz_vaccine'],
    y_val_pred[:, val_label_position['xyz_vaccine']],
)
roc_auc_seasonal = roc_auc_score(
    y_val['seasonal_vaccine'],
    y_val_pred[:, val_label_position['seasonal_vaccine']],
)
roc_auc_mean = (roc_auc + roc_auc_seasonal) / 2

In [42]:
# Report the mean of the two per-target validation ROC AUC scores.
print(roc_auc_mean)

0.8410126143691476


Predicting probabilities for the test data

In [43]:
# Drop the identifier column so the test frame has the same feature columns
# the model was fitted on, then collect column index 1 of each target's
# probability array into one matrix with a column per target.
test_processed = test_features.drop('respondent_id', axis=1)
test_proba_by_label = model.predict_proba(test_processed)
test_prob = np.column_stack(
    [label_proba[:, 1] for label_proba in test_proba_by_label]
)

In [44]:
# Show the stacked test-set probabilities (one row per test row, one column per target).
print(test_prob)

[[0.25 0.3 ]
 [0.06 0.03]
 [0.49 0.81]
 ...
 [0.15 0.37]
 [0.09 0.33]
 [0.45 0.56]]


Submission file

In [49]:
# Build the submission table: one row per test respondent, with the predicted
# probability for each vaccine.
#
# test_prob has one column per label, in the column order of the label frame
# `y` used for training. Each label's column is looked up by name so that a
# different label column order cannot silently swap the two vaccine columns.
submission_label_position = {label: position for position, label in enumerate(y.columns)}

submission_df = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_prob[:, submission_label_position['xyz_vaccine']],
    'seasonal_vaccine': test_prob[:, submission_label_position['seasonal_vaccine']],
})
# index=False keeps the pandas row index out of the CSV.
submission_df.to_csv('submission.csv', index=False)
print(submission_df.head())

   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707         0.25              0.30
1          26708         0.06              0.03
2          26709         0.49              0.81
3          26710         0.44              0.89
4          26711         0.23              0.49
