# <font color = 'blue'>Importing Libraries</font>


In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

### <font color = 'blue'>Reading the data</font>

In [15]:
# Load the three CSV files (training features, training labels, test features)
# from the current working directory, in that order.
features_train, labels_train, features_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

### <font color = 'blue'>Data preprocessing</font>

In [16]:
# Separating target and predictors.
# respondent_id is dropped so it is never fed to the model as a feature.
X = features_train.drop(columns=['respondent_id'])
X_test = features_test.drop(columns=['respondent_id'])

# Align the labels to the feature rows by respondent_id instead of trusting
# that both CSVs list respondents in the same order. .loc raises KeyError if
# a respondent present in the features has no label row.
y = (
    labels_train
    .set_index('respondent_id')
    .loc[features_train['respondent_id']]
    .reset_index(drop=True)
)
assert len(y) == len(X), "labels and features must pair up one-to-one on respondent_id"

# Splitting the columns into numerical and categorical.
# select_dtypes(include='number') matches every numeric dtype, not only
# int64/float64. All remaining columns are treated as categorical, so no
# column falls outside both lists and gets silently dropped by
# ColumnTransformer (whose default is remainder='drop').
num_cols = X.select_dtypes(include='number').columns.tolist()
numeric_names = set(num_cols)
cat_cols = [col for col in X.columns if col not in numeric_names]

# Numerical columns: fill gaps with the most frequent value, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on categories
# that were not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
column_routes = [
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
]
data_preprocessor = ColumnTransformer(transformers=column_routes)


### <font color = 'blue'>Building Model</font>

In [17]:
# Model definition (RandomForestClassifier)
classifier = RandomForestClassifier(n_estimators=50, random_state=0)

# Chain preprocessing and the classifier so both are fitted together
full_pipeline = Pipeline(steps=[('preprocessor', data_preprocessor),
                                ('classifier', classifier)])

# Hold out 20% of the labelled rows for validation and fit on the remainder
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
full_pipeline.fit(X_train, y_train)

# Predict on validation. With a multi-column y, predict_proba returns a list
# holding one probability array per target, in the column order of y_train.
val_predictions = full_pipeline.predict_proba(X_val)

# Calculation of ROC AUC score per target, using column 1 of each probability
# array (the second class; presumably label 1 - confirm labels are 0/1).
roc_auc_results = {target: roc_auc_score(y_val[target], val_predictions[i][:, 1]) for i, target in enumerate(y_train.columns)}

# Refit on ALL labelled rows before predicting the test set: the validation
# score above is already recorded, and the submission should not discard the
# 20% of labelled data that was only needed for that estimate.
full_pipeline.fit(X, y)

# Predict on test data
test_predictions = full_pipeline.predict_proba(X_test)

### <font color = 'blue'>Creating the submission file (.csv)</font>

In [18]:
# Extract probabilities for each vaccine.
# test_predictions holds one array per label column, in the column order of y,
# so each target is looked up by name rather than by a hard-coded position;
# a reordered label file can then no longer swap the two columns silently.
target_names = list(y.columns)
xyz_vaccine_probs = test_predictions[target_names.index('xyz_vaccine')][:, 1]
seasonal_vaccine_probs = test_predictions[target_names.index('seasonal_vaccine')][:, 1]

# Create a submission DataFrame keyed by the test respondents' ids
submission_df = pd.DataFrame({
    "respondent_id": features_test['respondent_id'],
    "xyz_vaccine": xyz_vaccine_probs,
    "seasonal_vaccine": seasonal_vaccine_probs
})

# Save the submission DataFrame to a CSV file (without the DataFrame index)
submission_file_path = 'submission.csv'
submission_df.to_csv(submission_file_path, index=False)

print("File saved to:", submission_file_path)
print("ROC AUC scores:", roc_auc_results)


File saved to: submission.csv
ROC AUC scores: {'xyz_vaccine': 0.8093893150101205, 'seasonal_vaccine': 0.8509388570391954}
