In [2]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# **Reading the data**

In [4]:
training_features_data=pd.read_csv("training_set_features.csv")
training_set_labels=pd.read_csv("training_set_labels.csv")
test_features_data=pd.read_csv("test_set_features.csv")

In [6]:
print(test_features_data.shape)
print(training_set_labels.shape)

(26708, 36)
(26707, 3)


# **Data PreProcessing For Train**

In [10]:
numeric_cols = training_features_data.select_dtypes(include=['number']).columns
string_cols = training_features_data.select_dtypes(include=['object']).columns

training_features_data[numeric_cols] = training_features_data[numeric_cols].fillna(training_features_data[numeric_cols].mean())


training_features_data[string_cols] = training_features_data[string_cols].fillna('out-of-category')

In [12]:
training_features_data.isna().sum()

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

In [14]:
from sklearn.preprocessing import OrdinalEncoder
enc=OrdinalEncoder()
enc.fit(training_features_data)
training_features_data_arr=enc.transform(training_features_data)
col_name_list=training_features_data.columns
encoded_categorical_df=pd.DataFrame(training_features_data_arr,columns=col_name_list)

In [16]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(encoded_categorical_df)
normalized_arr=scaler.transform(encoded_categorical_df)

normalized_df=pd.DataFrame(normalized_arr,columns=col_name_list)

# **Data processing for Test**

In [18]:
test_features_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  int64  
 1   xyz_concern                  26623 non-null  float64
 2   xyz_knowledge                26586 non-null  float64
 3   behavioral_antiviral_meds    26629 non-null  float64
 4   behavioral_avoidance         26495 non-null  float64
 5   behavioral_face_mask         26689 non-null  float64
 6   behavioral_wash_hands        26668 non-null  float64
 7   behavioral_large_gatherings  26636 non-null  float64
 8   behavioral_outside_home      26626 non-null  float64
 9   behavioral_touch_face        26580 non-null  float64
 10  doctor_recc_xyz              24548 non-null  float64
 11  doctor_recc_seasonal         24548 non-null  float64
 12  chronic_med_condition        25776 non-null  float64
 13  child_under_6_mo

In [20]:
numeric_cols_test = test_features_data.select_dtypes(include=['number']).columns
string_cols_test = test_features_data.select_dtypes(include=['object']).columns
test_features_data[numeric_cols_test] = test_features_data[numeric_cols].fillna(test_features_data[numeric_cols_test].mean())


test_features_data[string_cols_test] = test_features_data[string_cols_test].fillna('out-of-category')

In [22]:
test_features_data.isna().sum()

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

In [23]:
from sklearn.preprocessing import OrdinalEncoder
enc=OrdinalEncoder()
enc.fit(test_features_data)
test_features_data_arr=enc.transform(test_features_data)
col_name_list_test=test_features_data.columns
test_encoded_categorical_df=pd.DataFrame(test_features_data_arr,columns=col_name_list_test)

In [24]:
test_encoded_categorical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  float64
 1   xyz_concern                  26708 non-null  float64
 2   xyz_knowledge                26708 non-null  float64
 3   behavioral_antiviral_meds    26708 non-null  float64
 4   behavioral_avoidance         26708 non-null  float64
 5   behavioral_face_mask         26708 non-null  float64
 6   behavioral_wash_hands        26708 non-null  float64
 7   behavioral_large_gatherings  26708 non-null  float64
 8   behavioral_outside_home      26708 non-null  float64
 9   behavioral_touch_face        26708 non-null  float64
 10  doctor_recc_xyz              26708 non-null  float64
 11  doctor_recc_seasonal         26708 non-null  float64
 12  chronic_med_condition        26708 non-null  float64
 13  child_under_6_mo

In [25]:
test_normalized_arr=scaler.transform(test_encoded_categorical_df)

test_normalized_df=pd.DataFrame(test_normalized_arr,columns=col_name_list_test)

# **Regression**

In [27]:
from sklearn.metrics import roc_curve, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor

In [30]:
y=training_set_labels.loc[:,'xyz_vaccine'].values
X=normalized_df.values

In [31]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,stratify=y)
cv=StratifiedShuffleSplit(n_splits=5,random_state=42)

# Regressor 1: Decision Tree **Regressor**

In [36]:
regressor=DecisionTreeRegressor(random_state=0)
parameters={
    "criterion": ["squared_error", "friedman_mse", "absolute_error"],
    "splitter":['best','random'],

}
grid=GridSearchCV(estimator=regressor,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))
detailed_grid_results=pd.DataFrame(grid.cv_results_)
detailed_grid_results

the best Parameters are {'criterion': 'squared_error', 'splitter': 'random'} with score of -0.3369


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.339717,0.07684,0.001678,9.1e-05,squared_error,best,"{'criterion': 'squared_error', 'splitter': 'be...",-0.431972,-0.395613,-0.317302,-0.376036,-0.252975,-0.35478,0.063002,3
1,0.133731,0.003536,0.001723,0.000122,squared_error,random,"{'criterion': 'squared_error', 'splitter': 'ra...",-0.303318,-0.373239,-0.32849,-0.39002,-0.289334,-0.33688,0.038995,1
2,0.255094,0.008236,0.001725,0.000163,friedman_mse,best,"{'criterion': 'friedman_mse', 'splitter': 'best'}",-0.431972,-0.395613,-0.297725,-0.339677,-0.278147,-0.348627,0.057953,2
3,0.132274,0.003102,0.001724,8.3e-05,friedman_mse,random,"{'criterion': 'friedman_mse', 'splitter': 'ran...",-0.356458,-0.434769,-0.308912,-0.392816,-0.367645,-0.37212,0.041506,4
4,58.462951,6.804427,0.002267,0.000206,absolute_error,best,"{'criterion': 'absolute_error', 'splitter': 'b...",-0.624953,-0.655718,-0.577407,-0.569016,-0.627749,-0.610968,0.03276,6
5,31.310907,4.2906,0.003597,0.002726,absolute_error,random,"{'criterion': 'absolute_error', 'splitter': 'r...",-0.57461,-0.426378,-0.518673,-0.588594,-0.45994,-0.513639,0.063022,5


In [39]:
def display_test_scores(test, pred):
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    #print AUC score
    auc = roc_auc_score(test, pred)
    str_out += ("AUC: {:.4f}\n".format(auc))
    str_out += ("\n")

    false_indexes = np.where(test != pred)
    return str_out, false_indexes

In [40]:
y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)

TEST SCORES

AUC: 0.6767




# **Regressor_2 : Bayesian_Ridge**

In [42]:
clf_ridge=linear_model.BayesianRidge()

parameters={
    'alpha_init':[None,1],
    'lambda_init':[1,1e-3],
}

grid=GridSearchCV(estimator=clf_ridge,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))

y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)

the best Parameters are {'alpha_init': None, 'lambda_init': 0.001} with score of 0.2750
TEST SCORES

AUC: 0.8517




# **Regressor-3:SVR**

In [43]:
regr = SVR(C=1.0, epsilon=0.2)
parameters={
    'kernel':['linear','poly', 'rbf', 'sigmoid'],
    'C':[0.01,0.1,1,10],
    'max_iter':[100,1000]
}

grid=GridSearchCV(estimator=regr,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))

y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)



the best Parameters are {'C': 0.1, 'kernel': 'poly', 'max_iter': 1000} with score of 0.0214
TEST SCORES

AUC: 0.7211




# **Regressor_4:SGDRegressor**

In [45]:
reg=SGDRegressor( tol=1e-3)
parameters = {
                'alpha': [0.0001, 0.001, 0.01, 1],
                'max_iter': [10,100,1000],
                'learning_rate': ['invscaling', 'optimal', 'adaptive'],
            }
grid=GridSearchCV(estimator=reg,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))

y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)

the best Parameters are {'alpha': 0.0001, 'learning_rate': 'adaptive', 'max_iter': 1000} with score of 0.2751
TEST SCORES

AUC: 0.8518




# **Regressor-5 : RandomForestRegressor**

In [46]:
rfr=RandomForestRegressor(random_state=0)
parameters={
    'n_estimators':[20,50,100],
}
grid=GridSearchCV(estimator=rfr,param_grid=parameters,cv=cv,n_jobs=-1)
grid.fit(X_train,y_train)

print("the best Parameters are %s with score of %0.4f"
      %(grid.best_params_,grid.best_score_))

y_pred=grid.predict(X_test)
result,false=display_test_scores(y_test,y_pred)
print(result)

the best Parameters are {'n_estimators': 100} with score of 0.3224
TEST SCORES

AUC: 0.8612




# **T-Test**

In [51]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_absolute_error
from scipy import stats


In [58]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_absolute_error
import numpy as np

model_1 = RandomForestRegressor(n_estimators=100, random_state=0)
model_2 = SGDRegressor(alpha=0.001, learning_rate='adaptive', max_iter=100)
model_3 = BayesianRidge(lambda_init=0.001)


n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


cv_mae_1 = []
cv_mae_2 = []
cv_mae_3 = []

# Cross-validation loop
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Model 1: RandomForestRegressor
    model_1.fit(X_train, y_train)
    pred_1 = model_1.predict(X_test)
    err_1 = mean_absolute_error(y_test, pred_1)
    cv_mae_1.append(err_1)

    # Model 2: SGDRegressor
    model_2.fit(X_train, y_train)
    pred_2 = model_2.predict(X_test)
    err_2 = mean_absolute_error(y_test, pred_2)
    cv_mae_2.append(err_2)

    # Model 3: BayesianRidge
    model_3.fit(X_train, y_train)
    pred_3 = model_3.predict(X_test)
    err_3 = mean_absolute_error(y_test, pred_3)
    cv_mae_3.append(err_3)


print("CV MAE for RandomForestRegressor: ", cv_mae_1)
print("CV MAE for SGDRegressor: ", cv_mae_2)
print("CV MAE for BayesianRidge: ", cv_mae_3)


CV MAE for RandomForestRegressor:  [0.2261194309247473, 0.2308629726694122, 0.22760344504774388, 0.2322299194907321, 0.22618423516195468]
CV MAE for SGDRegressor:  [0.26358235818860143, 0.2720081510500753, 0.2653004603844399, 0.2681791943640235, 0.2618986056252351]
CV MAE for BayesianRidge:  [0.2637352565503448, 0.27200114709726925, 0.26532253123474414, 0.26826235933346115, 0.26195495933259155]


In [60]:
from scipy import stats
print(stats.ttest_rel(cv_mae_1, cv_mae_2))
print(stats.ttest_rel(cv_mae_3, cv_mae_2))
print(stats.ttest_rel(cv_mae_3, cv_mae_1))

TtestResult(statistic=-38.6960502428155, pvalue=2.6641188692193364e-06, df=4)
TtestResult(statistic=2.238322187322946, pvalue=0.08878701591868589, df=4)
TtestResult(statistic=39.342494168737694, pvalue=2.493649765252259e-06, df=4)


In [61]:
rfr=RandomForestRegressor(n_estimators=100,random_state=0)
rfr.fit(X,y)
y_pred=rfr.predict(X_test)


In [63]:
import numpy as np
np.sum(np.logical_or(np.array(y_pred) > 1, np.array(y_pred) < 0), axis=0)

0

In [65]:
y_pred[:10]

array([0.13, 0.83, 0.08, 0.72, 0.16, 0.05, 0.16, 0.02, 0.13, 0.71])

# **Converting to csv**

In [67]:
df_pred_xyz=pd.DataFrame(y_pred,columns=['xyz_vaccine'])
df_pred_xyz["respondent_id"]=df_pred_xyz.index
df_pred_xyz=df_pred_xyz[['respondent_id','xyz_vaccine']]

df_pred_xyz.to_csv('pred_xyz_vaccine.csv',index=False)