In [1]:
import pandas as pd
import numpy as np
# Read the training feature table and the matching label table from the
# working directory.
features, labels = (
    pd.read_csv(csv_name)
    for csv_name in ('training_set_features.csv', 'training_set_labels.csv')
)

In [2]:
# Display the (rows, columns) dimensions of the feature table.
features.shape

(26707, 36)

In [3]:
# Display the (rows, columns) dimensions of the label table.
labels.shape

(26707, 3)

In [6]:
# Display the first row of DataFrame.mode(), i.e. one most-frequent value per column.
features.mode().iloc[0]

respondent_id                                          0
xyz_concern                                          2.0
xyz_knowledge                                        1.0
behavioral_antiviral_meds                            0.0
behavioral_avoidance                                 1.0
behavioral_face_mask                                 0.0
behavioral_wash_hands                                1.0
behavioral_large_gatherings                          0.0
behavioral_outside_home                              0.0
behavioral_touch_face                                1.0
doctor_recc_xyz                                      0.0
doctor_recc_seasonal                                 0.0
chronic_med_condition                                0.0
child_under_6_months                                 0.0
health_worker                                        0.0
health_insurance                                     1.0
opinion_xyz_vacc_effective                           4.0
opinion_xyz_risk               

In [7]:
# Join every respondent's features with their labels (default inner join on
# the shared respondent_id key).
df = features.merge(labels, on='respondent_id')

In [19]:

# Impute missing values column by column with that column's most frequent
# value (the first mode). respondent_id is skipped and left untouched.
#
# The filled Series is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`: that chained in-place call operates on
# a temporary Series, emits a FutureWarning in pandas 2.x, and does not modify
# `df` at all once Copy-on-Write is enabled.
for col in df.columns:
    if col == 'respondent_id':
        continue
    col_modes = df[col].mode()
    # A column that is entirely NaN has no mode; leave it as-is instead of
    # failing with a KeyError on an empty result.
    if col_modes.empty:
        continue
    df[col] = df[col].fillna(col_modes.iloc[0])

In [21]:
# Display the (rows, columns) dimensions of the merged frame.
df.shape

(26707, 38)

In [22]:
# Names of the columns that are passed to pd.get_dummies for one-hot encoding.
categorical_features = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

In [23]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so every category is encoded with k-1 indicators.
df_enc = pd.get_dummies(
    data=df,
    columns=categorical_features,
    drop_first=True,
)
df_enc.shape

(26707, 96)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Split the encoded frame into the feature matrix and the two target columns.
# NOTE(review): respondent_id stays in X and is therefore used as a model
# feature; confirm this is intended (the test-set prediction cell currently
# relies on the same column layout).
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
X = df_enc.drop(columns=target_columns)
y = df_enc[target_columns]

In [27]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

In [28]:
# Train one random forest per target on the same training features.
# `fit` returns the estimator itself, so construction and fitting are chained.
model_xyz = RandomForestClassifier(random_state=42).fit(
    X_train, y_train['xyz_vaccine']
)
model_seasonal = RandomForestClassifier(random_state=42).fit(
    X_train, y_train['seasonal_vaccine']
)

# Keep only column 1 of predict_proba: the probability of class 1 (vaccinated).
proba_xyz = model_xyz.predict_proba(X_test)
proba_seasonal = model_seasonal.predict_proba(X_test)
y_pred_xyz = proba_xyz[:, 1]
y_pred_seasonal = proba_seasonal[:, 1]


In [29]:
from sklearn.metrics import roc_auc_score
# Score the hold-out probabilities of each model against the true labels.
roc_auc_xyz = roc_auc_score(
    y_true=y_test['xyz_vaccine'],
    y_score=y_pred_xyz,
)
roc_auc_seasonal = roc_auc_score(
    y_true=y_test['seasonal_vaccine'],
    y_score=y_pred_seasonal,
)

In [30]:
# Report both hold-out ROC AUC values, one per line, with four decimals.
print(
    f"ROC AUC (xyz_vaccine): {roc_auc_xyz:.4f}",
    f"ROC AUC (seasonal_vaccine): {roc_auc_seasonal:.4f}",
    sep="\n",
)

ROC AUC (xyz_vaccine): 0.8311
ROC AUC (seasonal_vaccine): 0.8521


In [31]:
# Load the unlabeled test features and preview the first five rows.
test_features = pd.read_csv('test_set_features.csv')
test_features.head(n=5)

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


In [32]:
# Impute missing values in the test features column by column with that
# column's most frequent value (the first mode). respondent_id is skipped.
#
# The filled Series is assigned back to the frame rather than calling
# `test_features[col].fillna(..., inplace=True)`: that chained in-place call
# operates on a temporary Series, emits a FutureWarning in pandas 2.x, and
# does not modify `test_features` at all once Copy-on-Write is enabled.
#
# NOTE(review): the fill values are computed from the test set itself, while
# the training frame was imputed with its own modes; consider reusing the
# training modes so both sets are preprocessed identically.
for col in test_features.columns:
    if col == 'respondent_id':
        continue
    col_modes = test_features[col].mode()
    # A column that is entirely NaN has no mode; leave it as-is instead of
    # failing with a KeyError on an empty result.
    if col_modes.empty:
        continue
    test_features[col] = test_features[col].fillna(col_modes.iloc[0])

In [34]:
# One-hot encode the test features with the same settings used for training.
test_features_enc = pd.get_dummies(
    test_features,
    columns=categorical_features,
    drop_first=True,
)
# The dummies above are derived from the categories present in the test set
# only. Align them to the training feature matrix so the fitted models always
# receive exactly the columns (and column order) they were trained on:
# indicator columns missing from the test set are added as all-zero, and
# columns unseen during training are dropped. When the two layouts already
# match this is a no-op.
test_features_enc = test_features_enc.reindex(columns=X.columns, fill_value=0)
test_features_enc.shape

(26708, 94)

In [35]:
# Score the encoded test set with both fitted models, keeping column 1 of
# predict_proba (probability of class 1, i.e. vaccinated) for each target.
y_pred_xyz, y_pred_seasonal = (
    fitted_model.predict_proba(test_features_enc)[:, 1]
    for fitted_model in (model_xyz, model_seasonal)
)

# Assemble the predictions: one row per respondent with both probabilities.
prediction_columns = {
    'respondent_id': test_features_enc['respondent_id'],
    'xyz_vaccine_prob': y_pred_xyz,
    'seasonal_vaccine_prob': y_pred_seasonal,
}
predictions_df = pd.DataFrame(prediction_columns)


In [37]:
# Display the last rows of the predictions table as a sanity check.
predictions_df.tail()

Unnamed: 0,respondent_id,xyz_vaccine_prob,seasonal_vaccine_prob
26703,53410,0.32,0.43
26704,53411,0.14,0.47
26705,53412,0.12,0.35
26706,53413,0.22,0.42
26707,53414,0.4,0.55


In [None]:
# Write the predictions to CSV; index=False keeps the DataFrame index out of the file.
# NOTE(review): 'path_to_save_predictions.csv' looks like a placeholder name — confirm the intended output path.
predictions_df.to_csv('path_to_save_predictions.csv', index=False)