In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from scipy import stats
# Load the three competition files. Paths are relative to the notebook's working directory.
training_set_features = pd.read_csv('dataset and all/training_set_features.csv')
training_set_labels = pd.read_csv('dataset and all/training_set_labels.csv')
test_set_features = pd.read_csv('dataset and all/test_set_features.csv')


def preview_frame(label, frame, trailing_gap=True):
    """Print a labelled head() and info() report for one DataFrame.

    Parameters
    ----------
    label : str
        Heading printed above the preview.
    frame : pandas.DataFrame
        Frame to summarise.
    trailing_gap : bool, default True
        Whether to print the blank-line separator after the info() report.
    """
    gap = '\n\n\n'
    print(label, gap)
    print(frame.head(), gap)
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() appended a stray "None" line.
    frame.info()
    if trailing_gap:
        print(gap)


# Display the first few rows of each dataframe to understand their structure
preview_frame('training_set_features', training_set_features)
preview_frame('training_set_labels', training_set_labels)
preview_frame('test_set_features', test_set_features, trailing_gap=False)

training_set_features 



   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0 

In [3]:
# Join features and labels on respondent_id so each row carries both targets.
training_data = pd.merge(training_set_features, training_set_labels, on='respondent_id')

# Identifier and target columns: these are dropped from the model inputs below
# and must not be altered by the feature imputers.
non_feature_columns = ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine']

numerical_columns = training_data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = training_data.select_dtypes(include=['object']).columns

# Impute only genuine numeric features. Passing the ID and targets through the
# median imputer cast them to float and would fabricate labels if any were missing.
numeric_feature_columns = [col for col in numerical_columns if col not in non_feature_columns]

imputer_num = SimpleImputer(strategy='median')
training_data[numeric_feature_columns] = imputer_num.fit_transform(training_data[numeric_feature_columns])

imputer_cat = SimpleImputer(strategy='most_frequent')
training_data[categorical_columns] = imputer_cat.fit_transform(training_data[categorical_columns])

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(training_data[categorical_columns])
training_encoded = encoder.transform(training_data[categorical_columns])
training_encoded = pd.DataFrame(training_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Both frames are positionally aligned (0..n-1) before the column-wise concat.
training_final = pd.concat([training_data[numerical_columns].reset_index(drop=True), training_encoded], axis=1)

# The two targets share the same feature matrix; X_seasonal is an independent copy.
X_xyz = training_final.drop(non_feature_columns, axis=1)
X_seasonal = X_xyz.copy()
y_xyz = training_final['xyz_vaccine']
y_seasonal = training_final['seasonal_vaccine']





In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_xyz, X_val_xyz, y_train_xyz, y_val_xyz = train_test_split(X_xyz, y_xyz, test_size=0.2, random_state=42)

# random_state is pinned to match model_seasonal, so both models are configured identically.
model_xyz = LogisticRegression(max_iter=1000, random_state=42)
model_xyz.fit(X_train_xyz, y_train_xyz)

# Column 1 of predict_proba is the probability of the second entry in
# model_xyz.classes_ (the positive label, assuming 0/1 targets).
y_pred_proba_xyz = model_xyz.predict_proba(X_val_xyz)[:, 1]
auc_xyz = roc_auc_score(y_val_xyz, y_pred_proba_xyz)
print(f"ROC AUC Score for xyz_vaccine: {auc_xyz}")

ROC AUC Score for xyz_vaccine: 0.8313562598223365


In [5]:
# Carve out a 20% validation slice for the seasonal_vaccine target (seed fixed at 42).
(
    X_train_seasonal,
    X_val_seasonal,
    y_train_seasonal,
    y_val_seasonal,
) = train_test_split(X_seasonal, y_seasonal, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so construction and training are chained.
model_seasonal = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_seasonal, y_train_seasonal
)

# Score the held-out rows using the probability in column 1 of predict_proba.
y_pred_proba_seasonal = model_seasonal.predict_proba(X_val_seasonal)[:, 1]
auc_seasonal = roc_auc_score(y_true=y_val_seasonal, y_score=y_pred_proba_seasonal)
print(f"ROC AUC Score for seasonal_vaccine using Logistic Regression: {auc_seasonal}")

ROC AUC Score for seasonal_vaccine using Logistic Regression: 0.8560587233046861


In [6]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(...) around it only appended a stray "None" line.
test_set_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  int64  
 1   xyz_concern                  26623 non-null  float64
 2   xyz_knowledge                26586 non-null  float64
 3   behavioral_antiviral_meds    26629 non-null  float64
 4   behavioral_avoidance         26495 non-null  float64
 5   behavioral_face_mask         26689 non-null  float64
 6   behavioral_wash_hands        26668 non-null  float64
 7   behavioral_large_gatherings  26636 non-null  float64
 8   behavioral_outside_home      26626 non-null  float64
 9   behavioral_touch_face        26580 non-null  float64
 10  doctor_recc_xyz              24548 non-null  float64
 11  doctor_recc_seasonal         24548 non-null  float64
 12  chronic_med_condition        25776 non-null  float64
 13  child_under_6_mo

In [7]:
# respondent_id is an identifier, not a feature; it is re-attached to the submission below.
test_data = test_set_features.drop('respondent_id', axis=1)

numerical_columns = test_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_columns = test_data.select_dtypes(include=['object']).columns.tolist()

# Learn fill values from the TRAINING features and only apply them here.
# Re-fitting on the test set would fill test rows with different statistics
# than the training rows the models learned from.
imputer_num = SimpleImputer(strategy='median')
imputer_num.fit(training_set_features[numerical_columns])
test_data[numerical_columns] = imputer_num.transform(test_data[numerical_columns])

imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat.fit(training_set_features[categorical_columns])
test_data[categorical_columns] = imputer_cat.transform(test_data[categorical_columns])

# Reuse the encoder fitted on the training data; it was built with
# handle_unknown='ignore', so unseen categories do not raise.
test_encoded = encoder.transform(test_data[categorical_columns])
test_encoded = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(categorical_columns))

test_final = pd.concat([test_data[numerical_columns].reset_index(drop=True), test_encoded], axis=1)

# Select the training columns in training order; a missing column raises KeyError here.
columns_used_for_training = X_xyz.columns.tolist()
test_final = test_final[columns_used_for_training]

# Column 1 of predict_proba is the probability of the second entry in each
# model's classes_ (the positive label, assuming 0/1 targets).
test_predictions_xyz = model_xyz.predict_proba(test_final)[:, 1]

test_predictions_seasonal = model_seasonal.predict_proba(test_final)[:, 1]

submission = pd.DataFrame({
    'respondent_id': test_set_features['respondent_id'],
    'xyz_vaccine': test_predictions_xyz,
    'seasonal_vaccine': test_predictions_seasonal
})

submission.to_csv('submission.csv', index=False)


In [8]:
# Read the file back to check what was actually written to disk.
pdf = pd.read_csv('submission.csv')
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() added a stray "None" line to the output.
pdf.info()
print(pdf)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   respondent_id     26708 non-null  int64  
 1   xyz_vaccine       26708 non-null  float64
 2   seasonal_vaccine  26708 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 626.1 KB
None
       respondent_id  xyz_vaccine  seasonal_vaccine
0              26707     0.050175          0.297060
1              26708     0.046431          0.046511
2              26709     0.365327          0.517272
3              26710     0.513616          0.881449
4              26711     0.149680          0.456914
...              ...          ...               ...
26703          53410     0.344343          0.539522
26704          53411     0.093654          0.285505
26705          53412     0.136086          0.196312
26706          53413     0.059901          0.361347
26707          53414     0.5