In [31]:
import pandas as pd

# Load the training features and labels and join them into one frame.
# Both files live in the same directory, so the location is defined once here.
DATA_DIR = '/content/drive/MyDrive/dataset and all'

# One row of survey features per respondent
features = pd.read_csv(f'{DATA_DIR}/training_set_features.csv')

# The target labels, keyed by the same 'respondent_id'
labels = pd.read_csv(f'{DATA_DIR}/training_set_labels.csv')

# Join features and labels on 'respondent_id' (inner join, the pandas default).
# validate='one_to_one' makes pandas raise a MergeError if the key is duplicated
# in either file, instead of silently multiplying rows in the merged frame.
data = features.merge(labels, on='respondent_id', validate='one_to_one')

data

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,xyz_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,,0,0
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,,0,1
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg,0,0


In [32]:
# Split the merged frame into model inputs (X) and the two targets (y).
# 'respondent_id' is an identifier and the two vaccine columns are the labels,
# so none of them belong in X.
# NOTE(review): the displayed frame shows an 'Unnamed: 0' header ahead of
# 'respondent_id'; if that is a real CSV column rather than the row index, it is
# ID-like and should be dropped here as well — confirm.
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
X = data.drop(columns=['respondent_id', *target_columns])
y = data[target_columns]

# Partition the columns of X by dtype so that every column is routed to exactly
# one preprocessing branch. Selecting on 'number' (rather than only int64/float64)
# and treating everything else as categorical means a column with another dtype
# (e.g. int32, float32, bool, category, string) is not silently left out of both
# lists. For frames read straight from CSV (int64/float64/object columns) the
# resulting lists are the same as with the narrower dtype filters.
numerical_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()



In [33]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric branch: median-impute missing values, then standardize.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical branch: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; the outputs are concatenated into a
# single feature matrix by the ColumnTransformer.
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)



In [34]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Base estimator: a 100-tree random forest with a fixed seed for repeatability.
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Wrap it so one forest is fitted per target column of y.
model = MultiOutputClassifier(base_forest)

# End-to-end estimator: preprocessing first, then the multi-output model.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit preprocessing and model together on the training portion only.
clf.fit(X_train, y_train)

# predict_proba on the multi-output model returns one probability array per
# target, in the column order of y ('xyz_vaccine', then 'seasonal_vaccine').
y_pred_prob = clf.predict_proba(X_test)

# Keep column 1 of each array: the probability of the second class
# (label 1 for these 0/1 targets).
xyz_vaccine_pred_prob, seasonal_vaccine_pred_prob = (
    target_proba[:, 1] for target_proba in y_pred_prob
)

In [36]:
from sklearn.metrics import roc_auc_score

# Score each target on the held-out split using its positive-class probabilities.
xyz_vaccine_roc_auc, seasonal_vaccine_roc_auc = (
    roc_auc_score(y_test[target], scores)
    for target, scores in (
        ('xyz_vaccine', xyz_vaccine_pred_prob),
        ('seasonal_vaccine', seasonal_vaccine_pred_prob),
    )
)

# Headline metric: the unweighted mean of the two per-target AUCs.
mean_roc_auc = (seasonal_vaccine_roc_auc + xyz_vaccine_roc_auc) / 2

# Report the per-target and mean scores.
print(f'XYZ Vaccine ROC AUC: {xyz_vaccine_roc_auc}')
print(f'Seasonal Vaccine ROC AUC: {seasonal_vaccine_roc_auc}')
print(f'Mean ROC AUC: {mean_roc_auc}')

# Predict probabilities for every row of X.
# NOTE(review): X is the labeled training frame, including the rows clf was
# fitted on, so the file written below holds in-sample predictions. If it is
# meant as a submission for a separate unlabeled test set, predict on that
# set's features instead — confirm.
submission_pred_prob = clf.predict_proba(X)

# Positive-class probability for each target, in the column order of y.
xyz_vaccine_submission_prob, seasonal_vaccine_submission_prob = (
    target_proba[:, 1] for target_proba in submission_pred_prob
)

# Assemble the output table: the respondent id plus one probability column per target.
submission = pd.DataFrame({'respondent_id': data['respondent_id']}).assign(
    xyz_vaccine=xyz_vaccine_submission_prob,
    seasonal_vaccine=seasonal_vaccine_submission_prob,
)

# Write the table to disk without the index column.
submission.to_csv('submission.csv', index=False)


XYZ Vaccine ROC AUC: 0.8294325525888947
Seasonal Vaccine ROC AUC: 0.8518072872366175
Mean ROC AUC: 0.8406199199127561
