In [8]:
import pandas as pd

# Loading the training features, labels and test features.
# The data directory defaults to the original local folder, but it can be
# redirected by setting the DATAHACK_DATA_DIR environment variable so the
# notebook runs on another machine without editing the code.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get('DATAHACK_DATA_DIR', 'C:/Users/dell/Desktop/Coding Projects'))

train_features = pd.read_csv(DATA_DIR / 'training_set_features.csv')
train_labels = pd.read_csv(DATA_DIR / 'training_set_labels.csv')
test_features = pd.read_csv(DATA_DIR / 'test_set_features.csv')

# Displaying the first few rows of each dataframe to understand their structure
print(train_features.head())
print(train_labels.head())
print(test_features.head())


   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [9]:
# Merging the training features and labels on the respondent key.
# validate='one_to_one' makes pandas raise a MergeError when respondent_id is
# duplicated on either side, instead of silently multiplying rows.
train_data = pd.merge(train_features, train_labels, on='respondent_id', validate='one_to_one')

# pd.merge defaults to an inner join, which drops feature rows that have no
# matching label row; fail loudly rather than train on a silently reduced set.
assert len(train_data) == len(train_features), (
    f'merge dropped rows: {len(train_features)} feature rows -> {len(train_data)} merged rows'
)

# Display the first few rows of the merged dataframe
print(train_data.head())


   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Model inputs are every merged column except the id and the two label
# columns; each label column becomes its own target series.
label_columns = ['xyz_vaccine', 'seasonal_vaccine']
X = train_data.drop(columns=['respondent_id'] + label_columns)
y_xyz = train_data['xyz_vaccine']
y_seasonal = train_data['seasonal_vaccine']

# Columns are routed by dtype: object columns go down the categorical branch,
# every other column goes down the numerical branch.
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(exclude=['object']).columns

# Numerical branch: fill missing values with the column median, then standardise
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets transform() accept
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Combine both branches into a single transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns),
    ]
)

# Fit on the training features and transform them in one step.
# NOTE(review): the fit uses every training row and the train/validation split
# happens only afterwards, so validation rows contribute to the imputation and
# scaling statistics. Consider splitting first and fitting on the training part only.
X_preprocessed = preprocessor.fit_transform(X)

# Reuse the already-fitted preprocessor (transform only, no refit) on the test features
test_inputs = test_features.drop(columns=['respondent_id'])
X_test_preprocessed = preprocessor.transform(test_inputs)


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The feature matrix and both target
# series are split in a single call so the same rows land in the same subset
# for all three; the fixed random_state makes the split repeatable.
(
    X_train, X_val,
    y_train_xyz, y_val_xyz,
    y_train_seasonal, y_val_seasonal,
) = train_test_split(
    X_preprocessed,
    y_xyz,
    y_seasonal,
    test_size=0.2,
    random_state=42,
)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def _fit_and_score(features_train, target_train, features_val, target_val):
    """Fit a logistic regression and score it on the validation rows.

    Returns a tuple of (fitted model, validation scores, ROC AUC), where the
    validation scores are column 1 of ``predict_proba`` on ``features_val``
    and the ROC AUC compares those scores against ``target_val``.
    """
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(features_train, target_train)
    val_scores = classifier.predict_proba(features_val)[:, 1]
    return classifier, val_scores, roc_auc_score(target_val, val_scores)


# One independent model per target, both trained on the same feature matrix
model_xyz, y_pred_xyz, roc_auc_xyz = _fit_and_score(
    X_train, y_train_xyz, X_val, y_val_xyz
)
model_seasonal, y_pred_seasonal, roc_auc_seasonal = _fit_and_score(
    X_train, y_train_seasonal, X_val, y_val_seasonal
)

# Overall score is the plain average of the two per-target ROC AUC values
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')


ROC AUC for xyz vaccine: 0.8313484859945037
ROC AUC for seasonal vaccine: 0.8560609813288217
Mean ROC AUC: 0.8437047336616628


In [7]:
# Generating predictions for the test set: column 1 of predict_proba from each fitted model
test_predictions_xyz = model_xyz.predict_proba(X_test_preprocessed)[:, 1]
test_predictions_seasonal = model_seasonal.predict_proba(X_test_preprocessed)[:, 1]

# Creating the submission dataframe: one row per test respondent with both predicted scores
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_predictions_xyz,
    'seasonal_vaccine': test_predictions_seasonal
})

# Saving the submission file.
# The output directory defaults to the original local folder, but it can be
# redirected by setting the DATAHACK_DATA_DIR environment variable so the
# notebook runs on another machine without editing the code.
import os
from pathlib import Path

SUBMISSION_DIR = Path(os.environ.get('DATAHACK_DATA_DIR', 'C:/Users/dell/Desktop/Coding Projects'))
submission.to_csv(SUBMISSION_DIR / 'DishaArora_Datahack.csv', index=False)
