In [1]:
import pandas as pd
import numpy as np
# Load the training inputs and their targets; both files are read from the
# current working directory.
features = pd.read_csv('training_set_features.csv')
labels = pd.read_csv('training_set_labels.csv')

In [2]:
# Attach the target columns to the feature rows by matching respondent_id
# (inner join, the pandas default).
df = features.merge(labels, on='respondent_id')

In [3]:
# Impute missing values with each column's most frequent value (mode).
# Every column except the respondent_id key is visited, which includes the
# merged label columns.
#
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df[col]: the chained in-place form operates on a
# temporary object, emits a FutureWarning on pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
for col in df.columns:
    if col == 'respondent_id':
        continue
    col_modes = df[col].mode()
    if col_modes.empty:
        # A column with no non-null values has no mode; leave it untouched
        # rather than failing on the first-element lookup.
        continue
    df[col] = df[col].fillna(col_modes.iloc[0])

In [4]:
# Columns handed to pd.get_dummies for one-hot encoding.
categorical_features = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

In [5]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column so the remaining indicators are not collinear.
df_enc = pd.get_dummies(
    df,
    columns=categorical_features,
    drop_first=True,
)

In [6]:
from sklearn.model_selection import train_test_split
# Separate the two target columns from the predictors, then hold out 20% of
# the rows for evaluation (fixed seed for a repeatable split).
# NOTE(review): only the targets are dropped, so respondent_id remains in X
# and is passed to the models as a feature — confirm this is intended.
y = df_enc[['xyz_vaccine', 'seasonal_vaccine']]
X = df_enc.drop(columns=['xyz_vaccine', 'seasonal_vaccine'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
from sklearn.linear_model import LogisticRegression
# One independent logistic-regression classifier per target; fit() returns
# the estimator itself, so construction and fitting are chained.
model_xyz = LogisticRegression(max_iter=1000).fit(X_train, y_train['xyz_vaccine'])
model_seasonal = LogisticRegression(max_iter=1000).fit(X_train, y_train['seasonal_vaccine'])

# Held-out probabilities of class 1 (vaccinated): second column of predict_proba.
y_pred_xyz = model_xyz.predict_proba(X_test)[:, 1]
y_pred_seasonal = model_seasonal.predict_proba(X_test)[:, 1]


In [8]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve on the held-out split, computed per target from
# the predicted class-1 probabilities.
roc_auc_xyz = roc_auc_score(
    y_true=y_test['xyz_vaccine'],
    y_score=y_pred_xyz,
)
roc_auc_seasonal = roc_auc_score(
    y_true=y_test['seasonal_vaccine'],
    y_score=y_pred_seasonal,
)

In [9]:
# Report both held-out scores, one per line, to four decimal places.
print(
    f"ROC AUC (xyz_vaccine): {roc_auc_xyz:.4f}",
    f"ROC AUC (seasonal_vaccine): {roc_auc_seasonal:.4f}",
    sep="\n",
)

ROC AUC (xyz_vaccine): 0.7915
ROC AUC (seasonal_vaccine): 0.7753


In [10]:
# Load the unlabeled test features from the current working directory.
test_features = pd.read_csv('test_set_features.csv')

# Fill each column's missing values with that column's most frequent value,
# skipping the respondent_id key.
#
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on test_features[col]: the chained in-place form acts
# on a temporary object, emits a FutureWarning on pandas 2.x and leaves the
# frame unchanged once Copy-on-Write is enabled.
#
# NOTE(review): the fill values are computed from the test file itself, not
# from the training data the models were fitted on — confirm that is intended.
for col in test_features.columns:
    if col == 'respondent_id':
        continue
    col_modes = test_features[col].mode()
    if col_modes.empty:
        # A column with no non-null values has no mode; leave it untouched
        # rather than failing on the first-element lookup.
        continue
    test_features[col] = test_features[col].fillna(col_modes.iloc[0])

In [11]:
# One-hot encode the test features so they line up with the training design
# matrix X.
#
# Encoding the test frame on its own derives the category levels (and the
# level removed by drop_first=True) from whatever values happen to occur in
# the test file. If a level is missing or extra there, the indicator columns
# no longer mean the same thing as the ones the models were fitted on. Pinning
# each column to the training levels makes get_dummies emit the same indicator
# set and drop the same reference level as it did for the training frame.
test_features_cat = test_features.copy()
for col in categorical_features:
    # Same level discovery get_dummies applied to the training column.
    train_levels = pd.Categorical(df[col]).categories
    # Values not seen in training become NaN and encode as all-zero indicators.
    test_features_cat[col] = pd.Categorical(test_features_cat[col], categories=train_levels)

test_features_enc = pd.get_dummies(
    test_features_cat,
    columns=categorical_features,
    drop_first=True,
)
# Enforce the exact column set and order of X; an indicator that is absent
# here is filled with 0.
test_features_enc = test_features_enc.reindex(columns=X.columns, fill_value=0)
test_features_enc.shape

(26708, 94)

In [12]:
# Score the test set with both fitted models, keeping the probability of
# class 1 (vaccinated) for each target.
y_pred_xyz = model_xyz.predict_proba(test_features_enc)[:, 1]
y_pred_seasonal = model_seasonal.predict_proba(test_features_enc)[:, 1]

# Assemble the output table: the respondent identifier followed by the two
# probability columns.
predictions_df = test_features_enc[['respondent_id']].assign(
    xyz_vaccine_prob=y_pred_xyz,
    seasonal_vaccine_prob=y_pred_seasonal,
)


In [15]:
# Preview the first five rows of the predictions table.
predictions_df.head(n=5)

Unnamed: 0,respondent_id,xyz_vaccine_prob,seasonal_vaccine_prob
0,26707,0.045612,0.267245
1,26708,0.051111,0.184491
2,26709,0.247184,0.461784
3,26710,0.700302,0.778481
4,26711,0.16397,0.430234


In [17]:
# Write the predictions to disk; index=False keeps the row index out of the file.
predictions_df.to_csv(
    'predictions_using_logisticRegression.csv',
    index=False,
)