In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# Read the three competition CSVs (training features, training labels, test features)
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

# Attach the label columns to their feature rows, matching on the respondent key
train_data = pd.merge(left=train_features, right=train_labels, on='respondent_id')

# Feature groups, one list per preprocessing branch.

# Columns routed through the categorical (impute + one-hot) branch
categorical_cols = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

# Columns routed through the numeric (impute + scale) branch
numeric_cols = [
    'xyz_concern',
    'xyz_knowledge',
    'opinion_xyz_vacc_effective',
    'opinion_xyz_risk',
    'opinion_xyz_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    'household_adults',
    'household_children',
]

# Columns routed through the binary (impute only) branch
binary_cols = [
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    'doctor_recc_xyz',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
]

# Ensure all categorical features are strings, while keeping missing values missing.
#
# Fixes relative to the previous version of this step:
#   * it referenced `categorical_features`, a name this notebook never defines
#     (NameError on Restart & Run All); the list is called `categorical_cols`.
#   * it cast `train_features` only, but `train_data` was already merged from it,
#     and `train_data` is the frame the model is fitted on, so it is cast here too.
#   * a bare `.astype(str)` rewrites NaN as the literal string 'nan', which hides
#     those cells from SimpleImputer(strategy='most_frequent'); `.where(...)`
#     restores NaN wherever the original value was missing so train and test rows
#     are imputed the same way.
for frame in (train_features, train_data, test_features):
    is_present = frame[categorical_cols].notna()
    frame[categorical_cols] = frame[categorical_cols].astype(str).where(is_present)

# Numeric branch: fill gaps with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the modal value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Binary branch: only fill gaps with the modal value; no scaling or encoding
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Route each column group to its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
    ('bin', binary_transformer, binary_cols),
])

# Full model: preprocessing followed by one logistic regression per target column
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(LogisticRegression(max_iter=2000))),
])

# Targets are the two vaccine columns; predictors are everything else except the id
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]
X = train_data.drop(columns=['respondent_id', *y.columns])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing and both classifiers on the training portion
model.fit(X_train, y_train)

# predict_proba yields one array per target; column 1 is the positive-class probability
y_pred = model.predict_proba(X_val)
xyz_vaccine_pred, seasonal_vaccine_pred = y_pred[0][:, 1], y_pred[1][:, 1]

# Score each target separately, then average the two AUCs
roc_auc_xyz = roc_auc_score(y_true=y_val['xyz_vaccine'], y_score=xyz_vaccine_pred)
roc_auc_seasonal = roc_auc_score(y_true=y_val['seasonal_vaccine'], y_score=seasonal_vaccine_pred)
mean_roc_auc = np.mean((roc_auc_xyz, roc_auc_seasonal))

print(f'Mean ROC AUC: {mean_roc_auc:.4f}')

# Score the test rows (the id column is not a model input)
test_inputs = test_features.drop(columns=['respondent_id'])
test_pred = model.predict_proba(test_inputs)
xyz_vaccine_test_pred, seasonal_vaccine_test_pred = test_pred[0][:, 1], test_pred[1][:, 1]

# Submission layout: respondent id plus the positive-class probability for each target
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=xyz_vaccine_test_pred,
    seasonal_vaccine=seasonal_vaccine_test_pred,
)
submission.to_csv('submission_datahack.csv', index=False)

Mean ROC AUC: 0.8437


In [9]:
# Validation ROC AUC for the xyz_vaccine target on its own
roc_auc_xyz

0.8313409222701258

In [10]:
# Validation ROC AUC for the seasonal_vaccine target on its own
roc_auc_seasonal

0.8560597111902455