# Read the CSV and Perform Basic Data Cleaning

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn import preprocessing
from sklearn import utils

In [22]:
# Load the modelling table into a DataFrame and preview the first rows.
# `index_col=False` is a `pd.read_csv` option (it tells pandas not to use the
# first column as the index). It was previously passed to `Path(...)`, where
# it never reached the CSV reader.
df = pd.read_csv(Path('./Machine_Learning/ml_table.csv'), index_col=False)
df.head()

Unnamed: 0,state_id,date,new_cases,vaccinated_ratio,hospital_beds,new_deaths,infection_rate,population,requirements
0,AK,2021-03-01,383,0.0,25,10,0.99,724357,f
1,AK,2021-03-02,154,0.0,27,1,0.99,724357,f
2,AK,2021-03-03,191,0.0,34,1,0.99,724357,f
3,AK,2021-03-04,148,0.0,32,0,0.99,724357,f
4,AK,2021-03-05,148,0.146,36,1,0.99,724357,f


In [23]:
# Inspect the dtype pandas inferred for each column of the loaded table
df.dtypes

state_id             object
date                 object
new_cases             int64
vaccinated_ratio    float64
hospital_beds         int64
new_deaths            int64
infection_rate      float64
population            int64
requirements         object
dtype: object

In [5]:
# Discard columns that are entirely null first, then remove every row
# that still contains at least one missing value.
df = df.dropna(axis='columns', how='all').dropna()

In [6]:
# Remove the `States` column and preview the result.
# `columns=` replaces the positional axis argument (`drop('States', 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
# NOTE(review): the table loaded from ml_table.csv earlier in this notebook
# lists a `state_id` column rather than `States` -- confirm which input file
# this cell is meant to run against.
df = df.drop(columns='States')
df.head()

Unnamed: 0,Date,Req,vaccinated_pct,monthly_new_cases_per_100k,infection_rate,hospital_beds_per_100k,new_deaths_per_100K
0,2021-Mar-01,0,0,650.646021,1.08,6.212406,2.899123
1,2021-Apr-01,0,23,695.651454,1.07,8.283208,1.794695
2,2021-May-01,0,36,280.110498,0.85,8.697369,3.037176
3,2021-Jun-01,0,39,143.85172,1.17,3.17523,1.104428
4,2021-Jul-01,0,43,577.615734,1.4,15.600043,1.656642


# Split the Data into Training and Testing

In [9]:
# Target: the `Req` column
y = df["Req"]

# Features: every remaining column, with non-numeric columns expanded
# into one-hot indicator columns by get_dummies
X = pd.get_dummies(df.drop(columns="Req"))

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column
X.describe()

Unnamed: 0,vaccinated_pct,monthly_new_cases_per_100k,infection_rate,hospital_beds_per_100k,new_deaths_per_100K,Date_2021-Apr-01,Date_2021-Aug-01,Date_2021-Dec-01,Date_2021-Jul-01,Date_2021-Jun-01,Date_2021-Mar-01,Date_2021-May-01,Date_2021-Nov-01,Date_2021-Oct-01,Date_2021-Sep-01,Date_2022-Feb-01,Date_2022-Jan-01
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,41.135,1279.401952,1.115467,22.943839,10.430525,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333
std,21.601647,1598.200066,0.210117,16.398961,8.354288,0.276616,0.276616,0.276616,0.276616,0.276616,0.276616,0.276616,0.276616,0.276616,0.276616,0.276616,0.276616
min,0.0,30.48531,0.6,0.0,0.160449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,332.114629,0.98,9.997997,3.860593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,47.0,768.965565,1.09,17.73416,7.622336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,56.0,1403.495785,1.29,32.834167,15.07746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,79.0,10615.35983,1.77,79.199522,47.610852,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Check the balance of our target values:
# count how many rows fall into each class of `Req`
y.value_counts()

0    444
1    156
Name: Req, dtype: int64

In [12]:
# Split the features and target into training and testing sets.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state=42 keeps the split identical from run to run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Ensemble Learners

In [13]:
# Fit a BalancedRandomForestClassifier (100 trees) on the training split.
# A fixed random_state makes the fitted forest -- and therefore the scores,
# confusion matrix and feature importances derived from it -- reproducible
# across runs, in line with the seeded EasyEnsembleClassifier used later.
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_forest = random_forest.fit(X_train, y_train)

In [14]:
# Predict on the held-out test set with the random forest and
# calculate the balanced accuracy score of those predictions
from sklearn.metrics import balanced_accuracy_score
y_predicted = random_forest.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_predicted)

0.501177394034537

In [15]:
# Display the confusion matrix for the random forest predictions
# (rows are the true classes, columns are the predicted classes)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_predicted)

array([[53, 45],
       [28, 24]], dtype=int64)

In [16]:
# Print the imbalanced classification report for the random forest predictions
# (the report is a formatted string, so print() is used to render its layout)
print(classification_report_imbalanced(y_test, y_predicted))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.65      0.54      0.46      0.59      0.50      0.25        98
          1       0.35      0.46      0.54      0.40      0.50      0.25        52

avg / total       0.55      0.51      0.49      0.52      0.50      0.25       150



In [17]:
# Pair each feature's importance with its column name and list the pairs
# from most to least important
feature_names = X.columns
importance_by_feature = zip(random_forest.feature_importances_, feature_names)
sorted(importance_by_feature, reverse=True)

[(0.19547926361960866, 'new_deaths_per_100K'),
 (0.19374792921937092, 'monthly_new_cases_per_100k'),
 (0.189008851845379, 'hospital_beds_per_100k'),
 (0.15942982693912686, ' infection_rate'),
 (0.14918904685251608, 'vaccinated_pct'),
 (0.01271865667522483, 'Date_2021-Nov-01'),
 (0.012039257397881297, 'Date_2021-Oct-01'),
 (0.010615285148235596, 'Date_2021-May-01'),
 (0.010406796435296427, 'Date_2021-Aug-01'),
 (0.009700900615472791, 'Date_2021-Dec-01'),
 (0.009086629362460717, 'Date_2021-Mar-01'),
 (0.009057702303335533, 'Date_2021-Sep-01'),
 (0.008624903778255278, 'Date_2021-Apr-01'),
 (0.008544321637480047, 'Date_2021-Jul-01'),
 (0.007963303970129882, 'Date_2021-Jun-01'),
 (0.0077659587745195395, 'Date_2022-Feb-01'),
 (0.006621365425706461, 'Date_2022-Jan-01')]

# Easy Ensemble AdaBoost Classifier

In [18]:
# Build the EasyEnsembleClassifier (100 estimators, seeded) and fit it
# on the training split in a single expression
easy = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

In [19]:
# Predict on the held-out test set with the easy ensemble and
# calculate the balanced accuracy score of those predictions
y_predicted = easy.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_predicted)

0.5622056514913658

In [20]:
# Display the confusion matrix for the easy ensemble predictions
# (rows are the true classes, columns are the predicted classes)
confusion_matrix(y_test, y_predicted)

array([[48, 50],
       [19, 33]], dtype=int64)

In [21]:
# Print the imbalanced classification report for the easy ensemble predictions
# (the report is a formatted string, so print() is used to render its layout)
print(classification_report_imbalanced(y_test, y_predicted))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.72      0.49      0.63      0.58      0.56      0.31        98
          1       0.40      0.63      0.49      0.49      0.56      0.32        52

avg / total       0.61      0.54      0.58      0.55      0.56      0.31       150

