In [38]:
import pandas as pd

# Read the training inputs and their target labels from the mounted Drive folder.
features = pd.read_csv('/content/drive/MyDrive/dataset and all/training_set_features.csv')
labels = pd.read_csv('/content/drive/MyDrive/dataset and all/training_set_labels.csv')

# Join the two tables on their shared 'respondent_id' key (pandas' default inner join).
data = pd.merge(features, labels, on='respondent_id')

# Targets (y) are the two vaccine columns; predictors (X) are every remaining
# column except the 'respondent_id' join key.
y = data[['xyz_vaccine', 'seasonal_vaccine']]
X = data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])

# Partition the predictor columns by dtype so each group can get its own
# preprocessing. A column whose dtype is none of int64/float64/object ends up
# in neither list.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)


In [39]:
# Build the preprocessing and model pipelines, starting with the required imports
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns: replace missing values with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Wrap a random forest in MultiOutputClassifier so that one forest is fitted per
# target column; the fixed random_state makes the forests reproducible.
model = MultiOutputClassifier(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42)
)

# End-to-end estimator: preprocessing first, then the multi-output model.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [40]:
# Dividing the data into train and test sets

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the whole pipeline (preprocessing + model) on the training portion only.
clf.fit(X_train, y_train)

# For a multi-output model, predict_proba yields one probability array per target.
y_pred_prob = clf.predict_proba(X_test)

# Keep column 1 of each target's array -- presumably the probability of the
# positive class (label 1); confirm against the fitted classes_ if in doubt.
xyz_vaccine_pred_prob, seasonal_vaccine_pred_prob = (
    target_proba[:, 1] for target_proba in y_pred_prob
)


In [41]:
from sklearn.metrics import roc_auc_score

# Score each target separately: ROC AUC of the predicted probabilities against
# the held-out labels.
xyz_vaccine_roc_auc = roc_auc_score(
    y_true=y_test['xyz_vaccine'],
    y_score=xyz_vaccine_pred_prob,
)
seasonal_vaccine_roc_auc = roc_auc_score(
    y_true=y_test['seasonal_vaccine'],
    y_score=seasonal_vaccine_pred_prob,
)

# Headline metric: the unweighted average of the two per-target scores.
mean_roc_auc = (xyz_vaccine_roc_auc + seasonal_vaccine_roc_auc) / 2

# Report all three numbers, one per line.
print(
    f'XYZ Vaccine ROC AUC: {xyz_vaccine_roc_auc}',
    f'Seasonal Vaccine ROC AUC: {seasonal_vaccine_roc_auc}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)


XYZ Vaccine ROC AUC: 0.8294325525888947
Seasonal Vaccine ROC AUC: 0.8518072872366175
Mean ROC AUC: 0.8406199199127561


In [43]:
# Load the unlabeled test-set features that the final predictions are made for.
features_test_new = pd.read_csv('/content/drive/MyDrive/dataset and all/test_set_features.csv')

# Keep the ids aside: they are not a model input but are needed in the submission.
respondent_ids_new = features_test_new['respondent_id'].copy()

# Drop the id column so the frame has the same predictor columns the pipeline
# was fitted on; the pipeline applies its own preprocessing inside predict_proba.
X_test_new = features_test_new.drop(columns=['respondent_id'])

# Predict probabilities on the new test data (one array per target).
y_pred_prob_new = clf.predict_proba(X_test_new)

# Keep column 1 of each target's array -- presumably the probability of the
# positive class (label 1); confirm against the fitted classes_ if in doubt.
xyz_vaccine_pred_prob_new = y_pred_prob_new[0][:, 1]
seasonal_vaccine_pred_prob_new = y_pred_prob_new[1][:, 1]

# Assemble the submission table: one row per respondent, one probability per target.
submission_new = pd.DataFrame({
    'respondent_id': respondent_ids_new,
    'xyz_vaccine': xyz_vaccine_pred_prob_new,
    'seasonal_vaccine': seasonal_vaccine_pred_prob_new
})

# Write the submission to Drive without the DataFrame index.
# NOTE(review): this path uses 'My Drive' while the input files above are read
# from 'MyDrive' -- confirm both resolve to the same mounted folder.
submission_new.to_csv('/content/drive/My Drive/submission_new.csv', index=False)

# Display the frame that was just built and saved. The previous last line
# referenced `submission`, a name no cell in this notebook defines, so it raised
# NameError on a fresh kernel (or showed a stale, unrelated frame).
submission_new

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.18,0.37
1,26708,0.07,0.06
2,26709,0.59,0.83
3,26710,0.50,0.91
4,26711,0.19,0.46
...,...,...,...
26703,53410,0.34,0.51
26704,53411,0.21,0.30
26705,53412,0.11,0.38
26706,53413,0.06,0.40
