In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


In [2]:

train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')
submission_format = pd.read_csv('submission_format.csv')

print("Training Features:")
print(train_features.head())
print("\nTraining Labels:")
print(train_labels.head())
print("\nTest Features:")
print(test_features.head())
print("\nSubmission Format:")
print(submission_format.head())


Training Features:
   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0        

In [3]:

train_features = train_features.drop('respondent_id', axis=1)
test_features_ids = test_features['respondent_id']
test_features = test_features.drop('respondent_id', axis=1)

print(train_features.isnull().sum())
print(test_features.isnull().sum())


train_features = train_features.fillna(train_features.mean())
test_features = test_features.fillna(test_features.mean())


train_features = pd.get_dummies(train_features, drop_first=True)
test_features = pd.get_dummies(test_features, drop_first=True)

train_features, test_features = train_features.align(test_features, join='left', axis=1, fill_value=0)

X = train_features
y = train_labels[['xyz_vaccine', 'seasonal_vaccine']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_features_scaled = scaler.transform(test_features)


xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
marital_status                  1408
r

  train_features = train_features.fillna(train_features.mean())
  test_features = test_features.fillna(test_features.mean())


In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [5]:

rf = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_rf = MultiOutputClassifier(rf, n_jobs=-1)

multi_target_rf.fit(X_train, y_train)


In [6]:

y_val_pred_proba = multi_target_rf.predict_proba(X_val)

y_val_pred_proba_xyz = y_val_pred_proba[0][:, 1]
y_val_pred_proba_seasonal = y_val_pred_proba[1][:, 1]

roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_val_pred_proba_xyz)
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_val_pred_proba_seasonal)


print(f'ROC AUC score for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC score for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC score: {(roc_auc_xyz + roc_auc_seasonal) / 2}')


ROC AUC score for xyz_vaccine: 0.8612691089092269
ROC AUC score for seasonal_vaccine: 0.8550798698418438
Mean ROC AUC score: 0.8581744893755354


In [7]:

multi_target_rf.fit(X_scaled, y)


In [10]:

y_test_pred_proba = multi_target_rf.predict_proba(test_features_scaled)

y_test_pred_proba_xyz = y_test_pred_proba[0][:, 1]
y_test_pred_proba_seasonal = y_test_pred_proba[1][:, 1]

submission = pd.DataFrame({
    'respondent_id': test_features_ids,
    'h1n1_vaccine': y_test_pred_proba_xyz,
    'seasonal_vaccine': y_test_pred_proba_seasonal
})

submission.to_csv('sub.csv', index=False)
