In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import warnings
warnings.filterwarnings('ignore')

In [4]:
train = pd.read_csv("/content/training_set_features.csv", index_col='respondent_id')
test = pd.read_csv("/content/test_set_features.csv", index_col='respondent_id')
labels = pd.read_csv("/content/training_set_labels.csv", index_col='respondent_id')

In [5]:
df_train = train.copy()
df_test = train.copy()

In [6]:
print(df_train.info)


<bound method DataFrame.info of                xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
respondent_id                                                          
0                      1.0            0.0                        0.0   
1                      3.0            2.0                        0.0   
2                      1.0            1.0                        0.0   
3                      1.0            1.0                        0.0   
4                      2.0            1.0                        0.0   
...                    ...            ...                        ...   
26702                  2.0            0.0                        0.0   
26703                  1.0            2.0                        0.0   
26704                  2.0            2.0                        0.0   
26705                  1.0            1.0                        0.0   
26706                  0.0            0.0                        0.0   

               behavioral_avoid

In [7]:
sex_map={
    'Female':0,
    "Male":1
}
df_train["sex"]=df_train["sex"].map(sex_map)
df_test["sex"]=df_test["sex"].map(sex_map)

In [11]:
num = df_train.select_dtypes('float64').columns
cat = df_train.select_dtypes('object').columns

df_train_num = df_train.loc[:, num]
df_train_cat = df_train.loc[:, cat]

df_test_num = df_test.loc[:, num]
df_test_cat = df_test.loc[:, cat]

In [12]:
from sklearn.impute import SimpleImputer
df_train_num_imputed = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(df_train_num),
                                            columns = df_train_num.columns)
df_test_num_imputed = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(df_test_num),
                                           columns = df_test_num.columns)

In [13]:
from sklearn.preprocessing import StandardScaler
scaler_train = StandardScaler()
scaler_test = StandardScaler()

df_train_num_imputed = pd.DataFrame(scaler_train.fit_transform(df_train_num_imputed),
                                    columns = df_train_num_imputed.columns)

df_test_num_imputed = pd.DataFrame(scaler_test.fit_transform(df_test_num_imputed),
                                   columns = df_test_num_imputed.columns)

In [14]:
df_train_cat_imputed = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 'no_response').fit_transform(df_train_cat),
                                            columns = df_train_cat.columns)
df_test_cat_imputed = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 'no_response').fit_transform(df_test_cat),
                                           columns = df_test_cat.columns)

In [21]:
df_train = pd.concat([df_train_num_imputed, df_train_cat_imputed], axis = 1)
df_test = pd.concat([df_test_num_imputed, df_test_cat_imputed], axis = 1)

In [22]:
dummy_columns_train = pd.get_dummies(df_train[cat])
df_train = pd.concat((df_train, dummy_columns_train), axis=1)
df_train = df_train.drop(df_train[cat], axis=1)

dummy_columns_test = pd.get_dummies(df_test[cat])
df_test = pd.concat((df_test, dummy_columns_test), axis=1)
df_test = df_test.drop(df_test[cat], axis=1)

In [23]:
labels.columns

Index(['xyz_vaccine', 'seasonal_vaccine'], dtype='object')

In [24]:
X = df_train
X_test = df_test
y_xyz = labels['xyz_vaccine']
y_seas = labels['seasonal_vaccine']

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_xyz_train, X_xyz_val, y_xyz_train, y_xyz_val = train_test_split(X, y_xyz, test_size=0.30, random_state=7, shuffle=True)
X_seas_train, X_seas_val, y_seas_train, y_seas_val = train_test_split(X, y_seas, test_size=0.30, random_state=7, shuffle=True)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [28]:
X_xyz_train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 18694 entries, 17737 to 25796
Columns: 110 entries, xyz_concern to employment_occupation_xzmlyyjv
dtypes: bool(87), float64(23)
memory usage: 5.0 MB


In [29]:
logistic_clf_xyz = LogisticRegression(max_iter=5000, random_state=7)
logistic_clf_xyz.fit(X_xyz_train, y_xyz_train)
y_pred = logistic_clf_xyz.predict_proba(X_xyz_val)
print('XYZ Logistic Regression ROC AUC score: {:.3}'.format(roc_auc_score(y_xyz_val, y_pred[:, 1])))

logistic_clf_seas = LogisticRegression(max_iter=5000, random_state=7)
logistic_clf_seas.fit(X_seas_train, y_seas_train)
y_seas_pred = logistic_clf_seas.predict_proba(X_seas_val)
print('Seasonal Logistic Regression ROC AUC score: {:.3}'.format(roc_auc_score(y_seas_val, y_pred[:, 1])))

XYZ Logistic Regression ROC AUC score: 0.833
Seasonal Logistic Regression ROC AUC score: 0.715


In [31]:
y_xyz_final = logistic_clf_xyz.predict_proba(X_test)[:, 1]
y_seas_final = logistic_clf_seas.predict_proba(X_test)[:, 1]

In [44]:
id_column_test = df_test.index.name = 'respondent_id'



y_preds = pd.DataFrame({
    'respondent_id': id_column_test,
    'xyz_vaccine': y_xyz_final,
    'seasonal_vaccine': y_seas_final
})


y_preds['respondent_id'] = y_preds['respondent_id']
y_preds.set_index('respondent_id', inplace=True)


y_preds.to_csv('submission.csv')


In [45]:
sub = pd.read_csv("submission.csv")
sub.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,respondent_id,0.028141,0.042938
1,respondent_id,0.10468,0.110102
2,respondent_id,0.023741,0.046209
3,respondent_id,0.095995,0.937697
4,respondent_id,0.038078,0.050788
