In [1]:
import os

import pandas as pd

# Directory that holds the competition CSV files. Set the VACCINE_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset, the original hard-coded directory is used, so the paths below resolve
# to exactly the same strings as before.
DATA_DIR = os.environ.get("VACCINE_DATA_DIR", "C:/Users/DELL/OneDrive/Desktop/Programming")

# Load the datasets
train_features = pd.read_csv(f"{DATA_DIR}/training_set_features.csv")
train_labels = pd.read_csv(f"{DATA_DIR}/training_set_labels.csv")
test_features = pd.read_csv(f"{DATA_DIR}/test_set_features.csv")

# Display the first few rows of each dataset to understand their structure
train_features.head(), train_labels.head(), test_features.head()


(   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
 0              0          1.0            0.0                        0.0   
 1              1          3.0            2.0                        0.0   
 2              2          1.0            1.0                        0.0   
 3              3          1.0            1.0                        0.0   
 4              4          2.0            1.0                        0.0   
 
    behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
 0                   0.0                   0.0                    0.0   
 1                   1.0                   0.0                    1.0   
 2                   1.0                   0.0                    0.0   
 3                   1.0                   0.0                    1.0   
 4                   1.0                   0.0                    1.0   
 
    behavioral_large_gatherings  behavioral_outside_home  \
 0                          0.0           

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Join features and labels on respondent_id so that each feature row is paired
# with its own labels by key instead of by row position. validate= makes pandas
# raise if an id appears more than once on either side, which would make the
# pairing ambiguous.
train_data = train_features.merge(train_labels, on='respondent_id', validate='one_to_one')

# Define numerical and categorical columns.
# Only the two xyz_* scores go through the numeric branch; every other feature
# column (respondent_id excluded) is handled by the categorical branch, i.e.
# imputed with its most frequent value and one-hot encoded.
numerical_cols = ['xyz_concern', 'xyz_knowledge']
categorical_cols = [col for col in train_features.columns if col not in numerical_cols + ['respondent_id']]

# Preprocessing for numerical data: fill gaps with the column mean, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories that were
# not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Define the model: one random forest per label column
model = MultiOutputClassifier(RandomForestClassifier(random_state=42))

# Create the pipeline (preprocessing followed by the classifier)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Build X and y from the merged frame so their rows are guaranteed to refer to
# the same respondent. Column order matches the source frames, minus the id.
X = train_data[train_features.columns.drop('respondent_id')]
y = train_data[train_labels.columns.drop('respondent_id')]

# Split data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
pipeline.fit(X_train, y_train)

# Predict probabilities for the validation set.
# predict_proba on the multi-output wrapper returns one array per label column.
y_pred = pipeline.predict_proba(X_valid)

# Column 1 of each array is taken as the probability for that label
y_pred_xyz = y_pred[0][:, 1]
y_pred_seasonal = y_pred[1][:, 1]

# Display first few predictions
y_pred_xyz[:5], y_pred_seasonal[:5]


(array([0.26, 0.14, 0.11, 0.25, 0.25]), array([0.25, 0.25, 0.63, 0.3 , 0.17]))

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Replace the previous estimator with a per-label logistic regression.
model = MultiOutputClassifier(
    LogisticRegression(max_iter=1000, random_state=42)
)

# Rebuild the pipeline around the new estimator; the `preprocessor` object
# defined earlier in the notebook is reused as the first step.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Train on the training split
pipeline.fit(X_train, y_train)

# Validation-set probabilities: one array per label column
y_pred = pipeline.predict_proba(X_valid)

# Keep column 1 of each per-label array
y_pred_xyz, y_pred_seasonal = y_pred[0][:, 1], y_pred[1][:, 1]

# ROC AUC for each label on the validation split
roc_auc_xyz = roc_auc_score(y_true=y_valid['xyz_vaccine'], y_score=y_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_true=y_valid['seasonal_vaccine'], y_score=y_pred_seasonal)

(roc_auc_xyz, roc_auc_seasonal)


(0.8334610762339376, 0.8591524986236637)

In [4]:
# Predict probabilities for the test set (respondent_id is an identifier, not a feature)
test_pred = pipeline.predict_proba(test_features.drop(columns=['respondent_id']))

# Extract column 1 of each per-label probability array
test_pred_xyz = test_pred[0][:, 1]
test_pred_seasonal = test_pred[1][:, 1]

# Create the submission DataFrame: one row per test respondent
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_xyz,
    'seasonal_vaccine': test_pred_seasonal
})

# Output directory. Set the VACCINE_DATA_DIR environment variable to write
# elsewhere; when it is unset the original hard-coded directory is used, so the
# resulting path string is unchanged.
SUBMISSION_DIR = os.environ.get("VACCINE_DATA_DIR", "C:/Users/DELL/OneDrive/Desktop/Programming")

# Save the submission DataFrame to a CSV file.
# NOTE(review): the file is named submission_format.csv; if a template with that
# name already lives in this directory it will be overwritten. Confirm this is intended.
submission_file = f"{SUBMISSION_DIR}/submission_format.csv"
submission.to_csv(submission_file, index=False)

submission.head()


Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.049127,0.309242
1,26708,0.034713,0.033473
2,26709,0.362362,0.64136
3,26710,0.448479,0.846827
4,26711,0.193543,0.351911
