In [1]:
from google.colab import files
uploaded = files.upload()

Saving test_set_features.csv to test_set_features.csv
Saving training_set_features.csv to training_set_features.csv
Saving training_set_labels.csv to training_set_labels.csv
Saving submission_format.csv to submission_format.csv


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
import nltk

# Ensure nltk stopwords are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Load the datasets
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')
submission_format = pd.read_csv('submission_format.csv')

# Merge training features and labels
train_data = train_features.merge(train_labels, on='respondent_id')

# Identify feature columns
binary_features = [col for col in train_features.columns if train_features[col].nunique() == 2 and col != 'respondent_id']
categorical_features = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own',
                        'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']
numerical_features = ['xyz_concern', 'xyz_knowledge', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk', 'opinion_xyz_sick_from_vacc',
                      'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults', 'household_children']

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', MultiOutputClassifier(model))])

# Separate features and targets
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

# Split data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Get predictions
y_pred = clf.predict_proba(X_valid)

# Convert predicted probabilities to the correct format for ROC AUC calculation
y_pred_xyz = np.array([pred[1] for pred in y_pred[0]])
y_pred_seasonal = np.array([pred[1] for pred in y_pred[1]])

# Calculate ROC AUC for each label using average='macro'
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_xyz, average='macro')
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_seasonal, average='macro')
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')

# Make predictions on test set
test_preds = clf.predict_proba(test_features.drop(columns=['respondent_id']))
test_preds_xyz = [pred[1] for pred in test_preds[0]]
test_preds_seasonal = [pred[1] for pred in test_preds[1]]

# Prepare the submission file
submission_format['xyz_vaccine'] = test_preds_xyz
submission_format['seasonal_vaccine'] = test_preds_seasonal
submission_format.to_csv('submission_nandiniruhela.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_nandiniruhela.csv')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


ROC AUC for xyz_vaccine: 0.7813549992016069
ROC AUC for seasonal_vaccine: 0.8246298498653865
Mean ROC AUC: 0.8029924245334967


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>