In [25]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer






In [26]:
# Step 1: Load the data
# The data directory is named once so that moving the dataset means editing a
# single line instead of every read_csv call.
DATA_DIR = Path(r"C:\Users\DIVYANSH RAI\Downloads\dataset and all")

features = pd.read_csv(DATA_DIR / "training_set_features.csv")
labels = pd.read_csv(DATA_DIR / "training_set_labels.csv")

# Merge the features and labels on respondent_id.
# validate="one_to_one" makes pandas raise a MergeError if respondent_id is
# duplicated in either file, instead of silently multiplying rows.
data = pd.merge(features, labels, on='respondent_id', validate='one_to_one')

# Separate features and target variables.
# respondent_id is only a join key, so it is removed from the model inputs
# together with the two target columns.
X = data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_xyz = data['xyz_vaccine']
y_seasonal = data['seasonal_vaccine']





In [27]:
# Step 2: Preprocess the data
# Columns are routed by dtype: int64/float64 go to the numeric branch and
# object columns go to the categorical branch. Columns of any other dtype are
# not listed in either group.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Numeric branch: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps categories unseen during fit from
# raising at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both branches into one transformer applied column-wise.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])





In [28]:
# Define model (fixed seed so repeated runs build the same forest)
model = RandomForestClassifier(random_state=0)

# Bundle preprocessing and modeling code in a pipeline.
# The step names 'preprocessor' and 'classifier' are the prefixes used by the
# hyperparameter grid (e.g. 'classifier__n_estimators'), so they must not change.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Step 3: Split the data into training and testing sets.
# All three arrays are split in ONE call so that the features and both targets
# are guaranteed to share the same row partition, rather than relying on two
# separate calls happening to use the same random_state.
(
    X_train, X_test,
    y_train_xyz, y_test_xyz,
    y_train_seasonal, y_test_seasonal,
) = train_test_split(X, y_xyz, y_seasonal, test_size=0.2, random_state=0)

In [29]:
# Step 4: Tune hyperparameters with GridSearchCV, one search per target.
# Keys are prefixed with 'classifier__' so they reach the random forest step
# inside the pipeline.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
}

# Settings shared by both searches: 3-fold CV, ROC AUC scoring, all CPU cores.
search_kwargs = dict(cv=3, scoring='roc_auc', n_jobs=-1)

# fit() returns the fitted search object, so construction and fitting chain.
grid_search_xyz = GridSearchCV(clf, param_grid, **search_kwargs).fit(X_train, y_train_xyz)
grid_search_seasonal = GridSearchCV(clf, param_grid, **search_kwargs).fit(X_train, y_train_seasonal)

# Best pipeline found for each target.
best_model_xyz, best_model_seasonal = (
    grid_search_xyz.best_estimator_,
    grid_search_seasonal.best_estimator_,
)



In [31]:
# Step 5: Score the held-out split.
# predict_proba returns one column per class; column index 1 is taken as the
# score for each target (presumably the positive class of a 0/1 label - confirm).
proba_xyz = best_model_xyz.predict_proba(X_test)
proba_seasonal = best_model_seasonal.predict_proba(X_test)
y_pred_prob_xyz = proba_xyz[:, 1]
y_pred_prob_seasonal = proba_seasonal[:, 1]

# ROC AUC per target, plus their unweighted mean.
roc_auc_xyz = roc_auc_score(y_test_xyz, y_pred_prob_xyz)
roc_auc_seasonal = roc_auc_score(y_test_seasonal, y_pred_prob_seasonal)
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

# One print call, one line per metric.
print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)



ROC AUC for xyz_vaccine: 0.8299536593160755
ROC AUC for seasonal_vaccine: 0.8590536979071596
Mean ROC AUC: 0.8445036786116176


In [34]:
# Prepare the submission
# NOTE(review): the predictions below are made on X, the training features
# loaded earlier in this notebook (the tuned models were fit on a subset of these
# rows). If this file is meant to be scored on unseen respondents, a separate
# test-features file presumably needs to be loaded and used here - confirm.
#
# respondent_id is taken from `data` (the merged frame X was derived from), not
# from the raw `features` frame, so each id is guaranteed to sit on the same
# row as its predictions even if the merge dropped or reordered rows.
submission_Divyansh = pd.DataFrame({
    'respondent_id': data['respondent_id'],
    'xyz_vaccine': best_model_xyz.predict_proba(X)[:, 1],
    'seasonal_vaccine': best_model_seasonal.predict_proba(X)[:, 1]
})

# Save the submission (written to the current working directory, without the
# DataFrame index column).
submission_Divyansh.to_csv('submission_Divyansh.csv', index=False)