### IMPORTING LIBRARIES

In [29]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from category_encoders import BinaryEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import RFE, RFECV, SelectKBest, chi2, VarianceThreshold
from warnings import filterwarnings
filterwarnings('ignore')

### IMPORTING DATA

In [30]:
# Load the raw survey table; every column except the last is treated as a
# feature and the last column is used as the prediction target.
data = pd.read_csv("h1n1_vaccine_prediction.csv")
# Take an explicit copy: later cells write into `x` in place (imputation),
# and a plain iloc slice of `data` would make those writes ambiguous
# (SettingWithCopyWarning, or silently touching `data` itself).
x = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].values

### CLEANING DATA

#### Filling columns that contain NaN with their mode

In [31]:
# Replace NaN with the most frequent value, one group of column positions at
# a time. The same imputer object is re-fitted on each group, so after this
# cell it holds the statistics of the last group only.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
mode_filled_groups = (slice(1, 10), slice(12, 23), [29, 31, 32])
for column_group in mode_filled_groups:
    x.iloc[:, column_group] = imp.fit_transform(x.iloc[:, column_group])

#### Filling columns with many NaNs using their least frequent value to reduce skewness

In [32]:
# Column positions whose NaNs are filled with the column's rarest observed
# value rather than its mode.
indices = [10, 11, 23, 26, 27, 28]
for index in indices:
    col = x.iloc[:, index]
    # value_counts(dropna=True) ignores NaN, so idxmin() is the least
    # frequent non-missing value of the column.
    val_counts = col.value_counts(dropna=True)
    min_item = val_counts.idxmin()
    # Assign the filled column back explicitly. Calling
    # `x.iloc[:, index].fillna(..., inplace=True)` mutates a temporary
    # Series; under pandas Copy-on-Write that leaves `x` unchanged.
    x.iloc[:, index] = col.fillna(min_item)

#### Checking the number of NaNs in each column

In [33]:
# Count the missing values of every column once, then print one
# "<column> <count>" line per column in column order.
nan_counts = x.isna().sum()
for col in x.columns:
    print(col, nan_counts[col])

unique_id 0
h1n1_worry 0
h1n1_awareness 0
antiviral_medication 0
contact_avoidance 0
bought_face_mask 0
wash_hands_frequently 0
avoid_large_gatherings 0
reduced_outside_home_cont 0
avoid_touch_face 0
dr_recc_h1n1_vacc 0
dr_recc_seasonal_vacc 0
chronic_medic_condition 0
cont_child_undr_6_mnths 0
is_health_worker 0
has_health_insur 0
is_h1n1_vacc_effective 0
is_h1n1_risky 0
sick_from_h1n1_vacc 0
is_seas_vacc_effective 0
is_seas_risky 0
sick_from_seas_vacc 0
age_bracket 0
qualification 0
race 0
sex 0
income_level 0
marital_status 0
housing_status 0
employment 0
census_msa 0
no_of_adults 0
no_of_children 0


### APPLYING BINARY ENCODING

In [34]:
# Binary-encode the feature frame and keep the result as a plain NumPy array.
# NOTE(review): `x` still contains the `unique_id` column at this point, so it
# is passed to the model as a feature — confirm this is intended.
binEncoder = BinaryEncoder()
encoded_frame = binEncoder.fit_transform(x)
x = np.array(encoded_frame)

### SPLITTING THE DATA INTO TRAIN AND TEST SETS

In [35]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

### FEATURE SCALING

In [36]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test rows.
sc = StandardScaler()
sc.fit(x_train)
x_train, x_test = sc.transform(x_train), sc.transform(x_test)

### TRAINING THE LOGISTIC REGRESSION MODEL

In [37]:
# Logistic regression with a fixed seed and a raised iteration cap,
# fitted on the scaled training split.
classifier = LogisticRegression(
    max_iter=1000,
    random_state=0,
)
classifier.fit(x_train, y_train)

### CONFUSION MATRIX

In [38]:
# Predict on the held-out split and print the confusion matrix
# (rows: true labels, columns: predicted labels).
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[3931  241]
 [ 745  425]]


#### ACCURACY SCORE

In [39]:
# Hold-out accuracy expressed as a percentage, printed to two decimals.
acc = 100 * accuracy_score(y_test, y_pred)
acc_rounded = round(acc, 2)
print("Accuracy score is {}%".format(acc_rounded))

Accuracy score is 81.54%


### STRATIFIED K-FOLD VALIDATION

In [40]:
# 5-fold stratified cross-validation of the classifier on the full data set.
stfKF = StratifiedKFold(n_splits=5)
# Keep the per-fold scores under their own name. Binding them to
# `accuracy_score` would shadow the sklearn.metrics function imported at the
# top of the notebook and break the hold-out accuracy cell on any re-run.
# NOTE(review): `x` here is the encoded but unscaled matrix, whereas the
# hold-out evaluation used the StandardScaler-transformed split — confirm
# that evaluating on unscaled features is intended.
cv_scores = cross_val_score(classifier, x, y, cv=stfKF)
acc_mean = cv_scores.mean() * 100
acc_std = cv_scores.std() * 100
print("Accuracy Score: {}%".format(round(acc_mean, 2)))
print("Accuracy Deviation: {}%".format(round(acc_std, 2)))

Accuracy Score: 80.43%
Accuracy Deviation: 1.02%


## TRYING VARIOUS FEATURE SELECTION METHODS

### RFE (Recursive Feature Elimination)

In [57]:
# Keep the 25 features chosen by recursive feature elimination, then
# cross-validate the classifier on the reduced matrix.
# NOTE(review): the selector is fitted on all rows (including the ones later
# used as validation folds), so the score below may be optimistic — consider
# running the selection inside each fold.
rfe = RFE(classifier, n_features_to_select=25)
rfe_x = rfe.fit_transform(x, y)
stfKF = StratifiedKFold(n_splits=5)
# Use a dedicated name for the fold scores so the imported
# sklearn.metrics.accuracy_score function is not overwritten.
rfe_scores = cross_val_score(classifier, rfe_x, y, cv=stfKF)
acc_mean = rfe_scores.mean() * 100
print("Accuracy Score: {}%".format(round(acc_mean, 2)))

Accuracy Score: 82.62%


### RFECV (Recursive Feature Elimination with Cross-Validation)

In [42]:
# Define the splitter first so this cell does not depend on `stfKF` left
# over from an earlier cell.
stfKF = StratifiedKFold(n_splits=5)
# Let RFECV choose the number of features using the same stratified folds.
# NOTE(review): the selector is fitted on all rows before the final
# cross-validation, so the reported score may be optimistic.
rfecv = RFECV(classifier, cv=stfKF, n_jobs=2)
rfecv_x = rfecv.fit_transform(x, y)
# Use a dedicated name for the fold scores so the imported
# sklearn.metrics.accuracy_score function is not overwritten.
rfecv_scores = cross_val_score(classifier, rfecv_x, y, cv=stfKF)
acc_mean = rfecv_scores.mean() * 100
print("Accuracy Score: {}%".format(round(acc_mean, 2)))

Accuracy Score: 82.57%


### SELECTKBEST

In [56]:
# Keep the 30 features with the highest chi-squared score and report the
# matrix shape before and after the selection.
# NOTE(review): the scores are computed on all rows before cross-validation,
# so the reported accuracy may be optimistic.
print(x.shape)
x_best = SelectKBest(chi2, k=30).fit_transform(x, y)
print(x_best.shape)
stfKF = StratifiedKFold(n_splits=5)
# Use a dedicated name for the fold scores so the imported
# sklearn.metrics.accuracy_score function is not overwritten.
kbest_scores = cross_val_score(classifier, x_best, y, cv=stfKF)
acc_mean = kbest_scores.mean() * 100
print("Accuracy Score: {}%".format(round(acc_mean, 2)))

(26707, 45)
(26707, 30)
Accuracy Score: 81.57%


### VARIANCE THRESHOLD

In [44]:
# Drop features whose variance does not exceed 0.1, then cross-validate the
# classifier on the remaining columns.
varThresh = VarianceThreshold(0.1)
x_threshold = varThresh.fit_transform(x)
stfKF = StratifiedKFold(n_splits=5)
# Use a dedicated name for the fold scores so the imported
# sklearn.metrics.accuracy_score function is not overwritten.
threshold_scores = cross_val_score(classifier, x_threshold, y, cv=stfKF)
acc_mean = threshold_scores.mean() * 100
print("Accuracy Score: {}%".format(round(acc_mean, 2)))

Accuracy Score: 80.54%
