# Read the CSV and Perform Basic Data Cleaning

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn import preprocessing
from sklearn import utils

In [3]:
# Load the vaccination dataset.
# index_col is a read_csv option (False = do not use the first column as the
# index), so it must be passed to read_csv rather than to Path().
df = pd.read_csv(Path('./Machine_Learning/vaccine.csv'), index_col=False)
df.head()

Unnamed: 0,state_id,date,Vaccination_pct,new_cases_per_100k,hospital_beds_per_100k,deaths_per_100k
0,AK,3/1/2021,0,52.87448,3.451337,1.380535
1,AK,3/2/2021,0,21.260235,3.727444,0.138053
2,AK,3/3/2021,0,26.368213,4.693818,0.138053
3,AK,3/4/2021,0,20.431914,4.417711,0.0
4,AK,3/5/2021,15,20.431914,4.969925,0.138053


In [4]:
df.dtypes

state_id                   object
date                       object
Vaccination_pct             int64
new_cases_per_100k        float64
hospital_beds_per_100k    float64
deaths_per_100k           float64
dtype: object

In [5]:
# Two-stage null cleanup in a single chain:
#   1. discard columns in which every value is null
#   2. discard any row that still contains a null
df = df.dropna(axis='columns', how='all').dropna()

In [6]:
# Remove the state identifier column.
# The column is named via the `columns=` keyword: passing the axis as a second
# positional argument is rejected by pandas 2.x.
df = df.drop(columns='state_id')
df.head()

Unnamed: 0,date,Vaccination_pct,new_cases_per_100k,hospital_beds_per_100k,deaths_per_100k
0,3/1/2021,0,52.87448,3.451337,1.380535
1,3/2/2021,0,21.260235,3.727444,0.138053
2,3/3/2021,0,26.368213,4.693818,0.138053
3,3/4/2021,0,20.431914,4.417711,0.0
4,3/5/2021,15,20.431914,4.969925,0.138053


# Split the Data into Training and Testing

In [9]:
# Target: the vaccination percentage column
y = df["Vaccination_pct"]

# Features: every other column, with non-numeric columns one-hot encoded.
# NOTE(review): `date` is still a string column here, so get_dummies emits one
# indicator column per distinct date — confirm this is the intended encoding.
X = pd.get_dummies(df.drop("Vaccination_pct", axis=1))

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) for every feature
# column, including the one-hot encoded date indicators
X.describe()

Unnamed: 0,new_cases_per_100k,hospital_beds_per_100k,deaths_per_100k,date_1/1/2022,date_1/10/2022,date_1/11/2022,date_1/12/2022,date_1/13/2022,date_1/14/2022,date_1/15/2022,...,date_9/28/2021,date_9/29/2021,date_9/3/2021,date_9/30/2021,date_9/4/2021,date_9/5/2021,date_9/6/2021,date_9/7/2021,date_9/8/2021,date_9/9/2021
count,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0,...,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0,18200.0
mean,42.178086,17.563201,0.343863,0.002747,0.002747,0.002747,0.002747,0.002747,0.002747,0.002747,...,0.002747,0.002747,0.002747,0.002747,0.002747,0.002747,0.002747,0.002747,0.002747,0.002747
std,82.97521,13.861234,0.62861,0.052344,0.052344,0.052344,0.052344,0.052344,0.052344,0.052344,...,0.052344,0.052344,0.052344,0.052344,0.052344,0.052344,0.052344,0.052344,0.052344,0.052344
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.239575,7.143247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,16.006992,13.171732,0.131842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,43.436851,24.749799,0.41416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1528.767067,79.199522,13.939681,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Check the balance of our target values:
# number of rows for each distinct Vaccination_pct value, most frequent first
y.value_counts()

0     1278
53     591
54     586
52     532
50     501
      ... 
8       38
79      32
81       6
7        5
6        1
Name: Vaccination_pct, Length: 77, dtype: int64

In [12]:
# Split features and target into training and testing sets.
# train_test_split is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here. random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Ensemble Learners

In [13]:
# Fit a BalancedRandomForestClassifier on the training split.
# random_state is fixed (same seed as the EasyEnsembleClassifier cell) so that
# re-running the notebook produces the same forest and the same scores.
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_forest = random_forest.fit(X_train, y_train)

In [14]:
# Calculate the balanced accuracy score of the random forest on the test split.
# balanced_accuracy_score is already imported in the imports cell at the top of
# the notebook, so it is not re-imported here.
y_predicted = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_predicted)

0.04250444071528667

In [15]:
# Display the confusion matrix for the random forest predictions.
# confusion_matrix is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
confusion_matrix(y_test, y_predicted)

array([[ 6,  3, 36, ..., 15,  6,  5],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  2,  0,  0],
       [ 0,  0,  0, ...,  2,  0,  2],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [16]:
# Per-class report (precision, recall, specificity, f1, geometric mean, iba,
# support) for the random forest predictions; print() is used so the returned
# text renders as a table instead of an escaped string
print(classification_report_imbalanced(y_true=y_test, y_pred=y_predicted))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.12      0.02      0.99      0.03      0.14      0.02       302
          6       0.00      0.00      0.99      0.00      0.00      0.00         0
          7       0.00      0.00      0.93      0.00      0.00      0.00         2
          8       0.01      0.20      0.98      0.02      0.44      0.18         5
          9       0.12      0.21      0.99      0.16      0.46      0.19        33
         10       0.03      0.13      0.97      0.05      0.35      0.11        39
         11       0.03      0.07      0.99      0.04      0.26      0.06        29
         12       0.14      0.26      0.99      0.18      0.50      0.24        35
         13       0.06      0.20      0.97      0.10      0.44      0.18        41
         14       0.02      0.07      0.97      0.03      0.27      0.07        40
         15       0.07      0.03      1.00      0.04      0.18      0.03        31
   

In [17]:
# Pair each feature name with its importance from the fitted forest and rank
# the pairs from most to least important
feature_names = X.columns
ranked_features = sorted(
    zip(random_forest.feature_importances_, feature_names),
    reverse=True,
)
ranked_features

[(0.1119510732100899, 'hospital_beds_per_100k'),
 (0.10286883137433778, 'new_cases_per_100k'),
 (0.09045432704563136, 'deaths_per_100k'),
 (0.014024143495649332, 'date_3/5/2021'),
 (0.008279184358441452, 'date_3/6/2021'),
 (0.006426658585259785, 'date_4/12/2021'),
 (0.006000580014895001, 'date_3/9/2021'),
 (0.005406570219336223, 'date_3/14/2021'),
 (0.005017187104359234, 'date_4/10/2021'),
 (0.004878617825024179, 'date_2/26/2022'),
 (0.004857483505718967, 'date_4/14/2021'),
 (0.004769390955341864, 'date_3/7/2021'),
 (0.004701030324633909, 'date_4/13/2021'),
 (0.004695583842480251, 'date_2/25/2022'),
 (0.004586652945578735, 'date_3/26/2021'),
 (0.004529845494455996, 'date_4/18/2021'),
 (0.004391962957927534, 'date_4/7/2021'),
 (0.004279037581647624, 'date_5/7/2021'),
 (0.0041815227295926745, 'date_3/11/2021'),
 (0.0041236219236206635, 'date_4/24/2021'),
 (0.004104790190199222, 'date_3/25/2021'),
 (0.003964946115237974, 'date_4/25/2021'),
 (0.0038609940450795447, 'date_4/16/2021'),
 (0.0

# Easy Ensemble AdaBoost Classifier

In [18]:
# Build a seeded EasyEnsembleClassifier with 100 estimators and fit it on the
# training split in one step (fit returns the fitted estimator itself)
easy = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

In [19]:
# Predict on the test split with the Easy Ensemble model, then compute its
# balanced accuracy score
y_predicted = easy.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_predicted)

0.023292894501138976

In [20]:
# Confusion matrix of true vs. predicted classes for the Easy Ensemble model
confusion_matrix(y_true=y_test, y_pred=y_predicted)

array([[ 0, 10,  3, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [21]:
# Per-class imbalanced classification report for the Easy Ensemble predictions;
# print() is used so the returned text renders as a table
print(classification_report_imbalanced(y_true=y_test, y_pred=y_predicted))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       302
          6       0.00      0.00      0.98      0.00      0.00      0.00         0
          7       0.00      0.00      1.00      0.00      0.00      0.00         2
          8       0.10      0.20      1.00      0.13      0.45      0.18         5
          9       0.00      0.00      1.00      0.00      0.00      0.00        33
         10       0.00      0.00      1.00      0.00      0.00      0.00        39
         11       0.00      0.00      1.00      0.00      0.00      0.00        29
         12       0.00      0.00      1.00      0.00      0.00      0.00        35
         13       0.00      0.00      1.00      0.00      0.00      0.00        41
         14       0.00      0.00      1.00      0.00      0.00      0.00        40
         15       0.00      0.00      1.00      0.00      0.00      0.00        31
   