<a href="https://colab.research.google.com/github/dev180920/Devanshu_Priy_Datahack/blob/main/week3%20analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Step 1: Load the data
# Read the three competition CSVs in one pass; the tuple order below fixes
# which frame is bound to which name.
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/training_set_features[1].csv',
        '/content/training_set_labels[1].csv',
        '/content/test_set_features[1].csv',
    )
)

# Step 2: Preprocess the data

# Strip the identifier column so X holds only predictors and y only targets
X = train_features.drop('respondent_id', axis=1)
y = train_labels.drop('respondent_id', axis=1)

# Partition the predictor columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Numerical branch: fill gaps with the column median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Step 3: Train the model

# Hold out 20% of the labelled rows for validation (fixed seed so the split
# is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing followed by a multi-output wrapper around a seeded random forest
base_forest = RandomForestClassifier(random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(base_forest)),
])

# Fit on the training portion only; the validation rows stay unseen
model.fit(X_train, y_train)

# Predict on the validation set
# predict_proba on the multi-output pipeline yields one probability array per
# target; each is indexed below as [rows, classes] and column 1 is taken as
# the positive-class probability.
y_pred_val = model.predict_proba(X_val)

# Look up each target's output position by name from the label columns the
# model was fit on, rather than hard-coding 0 and 1. If the labels file ever
# lists its columns in a different order, hard-coded positions would silently
# score each target against the other target's probabilities.
xyz_pos = y_train.columns.get_loc('xyz_vaccine')
seasonal_pos = y_train.columns.get_loc('seasonal_vaccine')

# Calculate ROC AUC score for validation set
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_val[xyz_pos][:, 1])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_val[seasonal_pos][:, 1])
# Unweighted mean of the two per-target scores
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'Validation ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'Validation ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')

# Step 4: Predict probabilities on the test set
# NOTE(review): `model` was fit on the training split only (test_size=0.2 was
# held out for validation) — consider refitting on all labelled rows before
# producing a final submission.
X_test = test_features.drop(columns=['respondent_id'])
y_pred_test = model.predict_proba(X_test)

# Resolve each target's output position by name from the label columns the
# model was fit on, so the submission columns cannot be swapped if the labels
# file lists its targets in a different order.
xyz_pos = y_train.columns.get_loc('xyz_vaccine')
seasonal_pos = y_train.columns.get_loc('seasonal_vaccine')

# Create the submission DataFrame
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_pred_test[xyz_pos][:, 1],  # Probability for xyz_vaccine
    'seasonal_vaccine': y_pred_test[seasonal_pos][:, 1]  # Probability for seasonal_vaccine
})

# Step 5: Save the predictions
# index=False keeps the DataFrame's row index out of the CSV
submission.to_csv('/content/submission_format[1].csv', index=False)


Validation ROC AUC for xyz_vaccine: 0.8294325525888947
Validation ROC AUC for seasonal_vaccine: 0.8518072872366175
Mean ROC AUC: 0.8406199199127561
