In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import cross_val_score

In [6]:
train_df = pd.read_csv("C:\\Users\\riyat\\Downloads\\training_set_features.csv")
labels_df = pd.read_csv("C:\\Users\\riyat\\Downloads\\training_set_labels.csv")
test_df = pd.read_csv("C:\\Users\\riyat\\Downloads\\test_set_features.csv")

In [7]:
#MERGE LABELS
train_df = train_df.merge(labels_df, on='respondent_id')
X = train_df.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])

In [8]:
y = train_df[['xyz_vaccine', 'seasonal_vaccine']]
X_test = test_df.drop(columns=['respondent_id'])
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [9]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [11]:
X_preprocessed = preprocessor.fit_transform(X)
X_test_preprocessed = preprocessor.transform(X_test)
# MODEL
model = MultiOutputClassifier(RandomForestClassifier(random_state=42))
#TRAINING
model.fit(X_preprocessed, y)


In [12]:
def multi_label_roc_auc_score(y_true, y_pred, average="macro"):
    return roc_auc_score(y_true, y_pred, average=average)
scorer = make_scorer(multi_label_roc_auc_score, needs_proba=True)
cv_scores = cross_val_score(model, X_preprocessed, y, cv=5, scoring=scorer)

Traceback (most recent call last):
  File "C:\Users\riyat\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\riyat\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\riyat\AppData\Local\Temp\ipykernel_17208\3768864481.py", line 2, in multi_label_roc_auc_score
    return roc_auc_score(y_true, y_pred, average=average)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\riyat\anaconda3\Lib\site-packages\sklearn\metrics\_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\riyat\anaconda3\Lib\site-packages\skl

In [16]:
print(f'Cross-validated ROC AUC score: {np.mean(cv_scores)}')
print(f'Cross-validated ROC AUC score: {np.mean(cv_scores)}')
predictions = model.predict_proba(X_test_preprocessed)
xyz_vaccine_prob = predictions[0][:, 1]
seasonal_vaccine_prob = predictions[1][:, 1]

Cross-validated ROC AUC score: nan
Cross-validated ROC AUC score: nan


In [17]:
#FILE CREATION
submission_df = pd.DataFrame({
    'respondent_id': test_df['respondent_id'],
    'xyz_vaccine': xyz_vaccine_prob,
    'seasonal_vaccine': seasonal_vaccine_prob
})

submission_df.to_csv('submission.csv', index=False)
print("Submission file created.")

Submission file created.
