In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Directory that holds the competition CSV files. It is defined once so that
# relocating the data only requires changing this single line.
DATA_DIR = 'D:\\D drive\\Summer analytics\\Hackathon1\\dataset and all'

# Load the datasets
train_features = pd.read_csv(f'{DATA_DIR}\\training_set_features.csv')
train_labels = pd.read_csv(f'{DATA_DIR}\\training_set_labels.csv')
test_features = pd.read_csv(f'{DATA_DIR}\\test_set_features.csv')
submission_format = pd.read_csv(f'{DATA_DIR}\\submission_format.csv')

# Display the first few rows of each dataset
print(train_features.head())
print(train_labels.head())
print(test_features.head())
print(submission_format.head())

# Report the number of missing values per column
print(train_features.isnull().sum())
print(train_labels.isnull().sum())
print(test_features.isnull().sum())

# Impute missing values with each column's most frequent value (the mode).
# The fill values are computed on the training set only and reused for the
# test set, so both sets are imputed identically and no test-set statistics
# enter the preprocessing.
fill_values = train_features.mode().iloc[0]
train_features = train_features.fillna(fill_values)
test_features = test_features.fillna(fill_values)

# Convert categorical variables to numerical using one-hot encoding.
# The training set defines the column layout (first level of each category
# dropped). The test set is encoded with all levels and then reindexed to the
# training columns: levels unseen in training are discarded and levels missing
# from the test set become all-zero columns, so both frames end up with the
# same columns in the same order.
train_features = pd.get_dummies(train_features, drop_first=True)
test_features = pd.get_dummies(test_features).reindex(
    columns=train_features.columns, fill_value=0
)

# The feature set and target variables.
# 'respondent_id' is only a row identifier, so it is excluded from the model
# inputs; it stays in test_features because the submission file needs it.
X = train_features.drop(columns=['respondent_id'])
X_test = test_features.drop(columns=['respondent_id'])
# NOTE(review): assumes train_labels rows are in the same order as
# train_features rows -- confirm, since the two frames are never joined.
y = train_labels[['xyz_vaccine', 'seasonal_vaccine']]

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a random forest classifier; MultiOutputClassifier fits one copy of it
# per target column.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_model = MultiOutputClassifier(rf_model, n_jobs=-1)

# Train the model
multi_target_model.fit(X_train, y_train)

# Predict probabilities on the validation set.
# predict_proba returns one array per target; column 1 is taken as the
# probability of the positive class (assumes the labels are encoded as 0/1).
val_proba = multi_target_model.predict_proba(X_val)
y_val_pred = pd.DataFrame(
    {
        'xyz_vaccine': val_proba[0][:, 1],
        'seasonal_vaccine': val_proba[1][:, 1],
    },
    index=X_val.index,  # keep predictions aligned with y_val row labels
)

# Calculate ROC AUC score
xyz_auc = roc_auc_score(y_val['xyz_vaccine'], y_val_pred['xyz_vaccine'])
seasonal_auc = roc_auc_score(y_val['seasonal_vaccine'], y_val_pred['seasonal_vaccine'])
average_auc = (xyz_auc + seasonal_auc) / 2

print(f'XYZ Vaccine ROC AUC: {xyz_auc}')
print(f'Seasonal Vaccine ROC AUC: {seasonal_auc}')
print(f'Average ROC AUC: {average_auc}')

# Predict probabilities on the test set (same columns as used for training)
test_pred = multi_target_model.predict_proba(X_test)
test_pred_df = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred[0][:, 1],
    'seasonal_vaccine': test_pred[1][:, 1],
})

# Save the predictions as a CSV file (no index column)
test_pred_df.to_csv('D:\\D drive\\Summer analytics\\Hackathon1\\dataset and all\\submission.csv', index=False)
