In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
training_features_data=pd.read_csv("training_set_features.csv")
training_set_labels=pd.read_csv("training_set_labels.csv")
test_features_data=pd.read_csv("test_set_features.csv")

In [5]:
print(test_features_data.shape)
print(training_set_labels.shape)

(26708, 36)
(26707, 3)


# **Reading the data**

In [6]:
numeric_cols = training_features_data.select_dtypes(include=['number']).columns
string_cols = training_features_data.select_dtypes(include=['object']).columns

training_features_data[numeric_cols] = training_features_data[numeric_cols].fillna(training_features_data[numeric_cols].mean())


training_features_data[string_cols] = training_features_data[string_cols].fillna('out-of-category')

In [8]:
training_features_data.isna().sum()

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

In [9]:
from sklearn.preprocessing import OrdinalEncoder
enc=OrdinalEncoder()
enc.fit(training_features_data)
training_features_data_arr=enc.transform(training_features_data)
col_name_list=training_features_data.columns
encoded_categorical_df=pd.DataFrame(training_features_data_arr,columns=col_name_list)

In [11]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(encoded_categorical_df)
normalized_arr=scaler.transform(encoded_categorical_df)

normalized_df=pd.DataFrame(normalized_arr,columns=col_name_list)

In [13]:
test_features_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  int64  
 1   xyz_concern                  26623 non-null  float64
 2   xyz_knowledge                26586 non-null  float64
 3   behavioral_antiviral_meds    26629 non-null  float64
 4   behavioral_avoidance         26495 non-null  float64
 5   behavioral_face_mask         26689 non-null  float64
 6   behavioral_wash_hands        26668 non-null  float64
 7   behavioral_large_gatherings  26636 non-null  float64
 8   behavioral_outside_home      26626 non-null  float64
 9   behavioral_touch_face        26580 non-null  float64
 10  doctor_recc_xyz              24548 non-null  float64
 11  doctor_recc_seasonal         24548 non-null  float64
 12  chronic_med_condition        25776 non-null  float64
 13  child_under_6_mo

# **Data processing for Test**

In [15]:
numeric_cols_test = test_features_data.select_dtypes(include=['number']).columns
string_cols_test = test_features_data.select_dtypes(include=['object']).columns
test_features_data[numeric_cols_test] = test_features_data[numeric_cols].fillna(test_features_data[numeric_cols_test].mean())


test_features_data[string_cols_test] = test_features_data[string_cols_test].fillna('out-of-category')

In [17]:
test_features_data.isna().sum()

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

In [19]:
from sklearn.preprocessing import OrdinalEncoder
enc=OrdinalEncoder()
enc.fit(test_features_data)
test_features_data_arr=enc.transform(test_features_data)
col_name_list_test=test_features_data.columns
test_encoded_categorical_df=pd.DataFrame(test_features_data_arr,columns=col_name_list_test)

In [21]:
test_encoded_categorical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  float64
 1   xyz_concern                  26708 non-null  float64
 2   xyz_knowledge                26708 non-null  float64
 3   behavioral_antiviral_meds    26708 non-null  float64
 4   behavioral_avoidance         26708 non-null  float64
 5   behavioral_face_mask         26708 non-null  float64
 6   behavioral_wash_hands        26708 non-null  float64
 7   behavioral_large_gatherings  26708 non-null  float64
 8   behavioral_outside_home      26708 non-null  float64
 9   behavioral_touch_face        26708 non-null  float64
 10  doctor_recc_xyz              26708 non-null  float64
 11  doctor_recc_seasonal         26708 non-null  float64
 12  chronic_med_condition        26708 non-null  float64
 13  child_under_6_mo

In [22]:
test_normalized_arr=scaler.transform(test_encoded_categorical_df)

test_normalized_df=pd.DataFrame(test_normalized_arr,columns=col_name_list_test)

# **Regression**

In [23]:
from sklearn.metrics import roc_curve, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor

In [26]:
y=training_set_labels.loc[:,'seasonal_vaccine'].values
X=normalized_df.values

In [25]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,stratify=y)
cv=StratifiedShuffleSplit(n_splits=5,random_state=42)

In [27]:
regressor=DecisionTreeRegressor(random_state=0)
parameters={
    "criterion": ["squared_error", "friedman_mse", "absolute_error"],
    "splitter":['best','random'],

}
grid=GridSearchCV(estimator=regressor,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))
detailed_grid_results=pd.DataFrame(grid.cv_results_)
detailed_grid_results

the best Parameters are {'criterion': 'friedman_mse', 'splitter': 'best'} with score of -0.3486


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.427297,0.069621,0.007087,0.004589,squared_error,best,"{'criterion': 'squared_error', 'splitter': 'be...",-0.440362,-0.395613,-0.300521,-0.292131,-0.325693,-0.350864,0.057671,2
1,0.188818,0.034018,0.001596,6.9e-05,squared_error,random,"{'criterion': 'squared_error', 'splitter': 'ra...",-0.415191,-0.423582,-0.297725,-0.308912,-0.39002,-0.367086,0.053342,3
2,0.687887,0.154671,0.00654,0.007797,friedman_mse,best,"{'criterion': 'friedman_mse', 'splitter': 'best'}",-0.440362,-0.395613,-0.300521,-0.280944,-0.325693,-0.348627,0.060074,1
3,0.323453,0.140382,0.009801,0.00962,friedman_mse,random,"{'criterion': 'friedman_mse', 'splitter': 'ran...",-0.415191,-0.423582,-0.297725,-0.356458,-0.370442,-0.372679,0.045353,4
4,72.192452,12.559566,0.003971,0.001588,absolute_error,best,"{'criterion': 'absolute_error', 'splitter': 'b...",-0.658514,-0.658514,-0.524267,-0.622156,-0.571813,-0.607053,0.052192,6
5,30.66255,5.316272,0.00207,0.000143,absolute_error,random,"{'criterion': 'absolute_error', 'splitter': 'r...",-0.518673,-0.552235,-0.465534,-0.485112,-0.594188,-0.523148,0.046214,5


In [28]:
def display_test_scores(test, pred):
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    #print AUC score
    auc = roc_auc_score(test, pred)
    str_out += ("AUC: {:.4f}\n".format(auc))
    str_out += ("\n")

    false_indexes = np.where(test != pred)
    return str_out, false_indexes

In [29]:
y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)

TEST SCORES

AUC: 0.6907




# **Regressor 2:Bayesian Ridge**

In [30]:
clf_ridge=linear_model.BayesianRidge()

parameters={
    'alpha_init':[None,1],
    'lambda_init':[1,1e-3],
}

grid=GridSearchCV(estimator=clf_ridge,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))

y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)

the best Parameters are {'alpha_init': None, 'lambda_init': 0.001} with score of 0.2846
TEST SCORES

AUC: 0.8384




# **Regressor-3:SVR**

In [31]:
regr = SVR(C=1.0, epsilon=0.2)
parameters={
    'kernel':['linear','poly', 'rbf', 'sigmoid'],
    'C':[0.01,0.1,1,10],
    'max_iter':[100,1000]
}

grid=GridSearchCV(estimator=regr,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))

y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)



the best Parameters are {'C': 0.1, 'kernel': 'poly', 'max_iter': 1000} with score of 0.0701
TEST SCORES

AUC: 0.7408




# **Regressor_4:SGDRegressor**

In [32]:
reg=SGDRegressor( tol=1e-3)
parameters = {
                'alpha': [0.0001, 0.001, 0.01, 1],
                'max_iter': [10,100,1000],
                'learning_rate': ['invscaling', 'optimal', 'adaptive'],
            }
grid=GridSearchCV(estimator=reg,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))

y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)

the best Parameters are {'alpha': 0.001, 'learning_rate': 'adaptive', 'max_iter': 100} with score of 0.2848
TEST SCORES

AUC: 0.8383




# **Regressor-5 : RandomForestRegressor**

In [33]:
rfr=RandomForestRegressor(random_state=0)
parameters={
    'n_estimators':[20,50,100],
}
grid=GridSearchCV(estimator=rfr,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))

y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)

the best Parameters are {'n_estimators': 100} with score of 0.3342
TEST SCORES

AUC: 0.8465




T-test

In [34]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_absolute_error
from scipy import stats


In [35]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_absolute_error
import numpy as np

model_1 = RandomForestRegressor(n_estimators=100, random_state=0)
model_2 = SGDRegressor(alpha=0.001, learning_rate='adaptive', max_iter=100)
model_3 = BayesianRidge(lambda_init=0.001)


n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


cv_mae_1 = []
cv_mae_2 = []
cv_mae_3 = []

# Cross-validation loop
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Model 1: RandomForestRegressor
    model_1.fit(X_train, y_train)
    pred_1 = model_1.predict(X_test)
    err_1 = mean_absolute_error(y_test, pred_1)
    cv_mae_1.append(err_1)

    # Model 2: SGDRegressor
    model_2.fit(X_train, y_train)
    pred_2 = model_2.predict(X_test)
    err_2 = mean_absolute_error(y_test, pred_2)
    cv_mae_2.append(err_2)

    # Model 3: BayesianRidge
    model_3.fit(X_train, y_train)
    pred_3 = model_3.predict(X_test)
    err_3 = mean_absolute_error(y_test, pred_3)
    cv_mae_3.append(err_3)


print("CV MAE for RandomForestRegressor: ", cv_mae_1)
print("CV MAE for SGDRegressor: ", cv_mae_2)
print("CV MAE for BayesianRidge: ", cv_mae_3)


CV MAE for RandomForestRegressor:  [0.3163365780606514, 0.31943466866342196, 0.3223759595581352, 0.32551394869874556, 0.31786369593709046]
CV MAE for SGDRegressor:  [0.33401445324035356, 0.33738354330462955, 0.3371355217581412, 0.34069049226830567, 0.33352955328555967]
CV MAE for BayesianRidge:  [0.3342494504007649, 0.3376250133554988, 0.3372509730816385, 0.3409070484994727, 0.33376171057338405]


In [36]:
from scipy import stats
print(stats.ttest_rel(cv_mae_1, cv_mae_2))
print(stats.ttest_rel(cv_mae_3, cv_mae_2))
print(stats.ttest_rel(cv_mae_3, cv_mae_1))

TtestResult(statistic=-24.71732745671027, pvalue=1.590087316605629e-05, df=4)
TtestResult(statistic=8.846016443967768, pvalue=0.0009016493826691848, df=4)
TtestResult(statistic=24.43176069771258, pvalue=1.6653181938285653e-05, df=4)


In [41]:
rfr = RandomForestRegressor(random_state=0, n_estimators=100)
rfr.fit(X,y)

# prediction results
y_pred = rfr.predict(test_normalized_df)



In [39]:
import numpy as np

np.sum(np.logical_or(np.array(y_pred) > 1, np.array(y_pred) < 0), axis=0)

0

# **Converting to csv**

In [42]:
y_pred[:10]

array([0.29, 0.2 , 0.78, 0.93, 0.54, 0.75, 0.53, 0.04, 0.06, 0.9 ])

In [47]:
df_pred_seasonal_vaccine=pd.DataFrame(y_pred,columns=['seasonal_vaccine'])
df_pred_seasonal_vaccine["respondent_id"]=df_pred_seasonal_vaccine.index
df_pred_seasonal_vaccine=df_pred_seasonal_vaccine[['respondent_id','seasonal_vaccine']]

df_pred_seasonal_vaccine.to_csv('pred_seasonal_vaccine.csv',index=False)

In [46]:
df_pred_xyz_vaccine = pd.read_csv("pred_xyz_vaccine.csv")
df_pred_xyz_vaccine.head()

Unnamed: 0,respondent_id,xyz_vaccine
0,0,0.13
1,1,0.83
2,2,0.08
3,3,0.72
4,4,0.16


In [49]:
df_final = df_pred_xyz_vaccine.merge(df_pred_seasonal_vaccine, on="respondent_id", how = 'inner')

In [51]:
df_final.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0.13,0.29
1,1,0.83,0.2
2,2,0.08,0.78
3,3,0.72,0.93
4,4,0.16,0.54


In [59]:
df_final2 = df_pred_xyz_vaccine.merge(df_pred_seasonal_vaccine, on="respondent_id", how = 'inner')

In [60]:
df_final2.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0.13,0.29
1,1,0.83,0.2
2,2,0.08,0.78
3,3,0.72,0.93
4,4,0.16,0.54


In [61]:
df_final2['respondent_id'] = df_final['respondent_id'].astype(int) + 26707

In [63]:
df_final2.to_csv('faizan_ahmed_<Data_HACK>.csv', columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine' ],
                            index=False, sep=',')

In [64]:
df_final2['respondent_id'] = df_final['respondent_id'].astype(int) -80121

In [65]:
df_final2.to_csv('faizan_ahmed_<Data_HACK>_.csv', columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine' ],
                            index=False, sep=',')

In [68]:
df_final2.to_csv('faizan_ahmed_<Data_HACK>_submit.csv', columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine' ],
                            index=False, sep=',')

In [69]:
df_final2['respondent_id'] = df_final['respondent_id'].astype(int) + 26707

In [70]:
df_final2.to_csv('faizan_ahmed_<Data_HACK_IIT>_submit.csv', columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine' ],
                            index=False, sep=',')

In [71]:
df_final.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,80121,0.13,0.29
1,80122,0.83,0.2
2,80123,0.08,0.78
3,80124,0.72,0.93
4,80125,0.16,0.54


In [72]:
df_final['respondent_id'] = df_final['respondent_id'].astype(int) -80121

In [73]:
df_final.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0.13,0.29
1,1,0.83,0.2
2,2,0.08,0.78
3,3,0.72,0.93
4,4,0.16,0.54


In [76]:
df_final['respondent_id'] = df_final['respondent_id'].astype(int) + 26707

In [77]:
df_final.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.13,0.29
1,26708,0.83,0.2
2,26709,0.08,0.78
3,26710,0.72,0.93
4,26711,0.16,0.54


In [78]:
df_final.to_csv('submit.csv', columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine' ],
                            index=False, sep=',')