In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the cleaned cases CSV into a DataFrame and preview its first five rows.
# The location is relative to the notebook's working directory.
file_path = Path("Cases_Cleaned/ML_cases.csv")
df_cases = pd.read_csv(filepath_or_buffer=file_path)
df_cases.head(5)

Unnamed: 0,Year_submitted,tot_cases,conf_cases,prob_cases,new_case,pnew_case,state_AL,state_AZ,state_CA,state_CO,...,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_cases_Agree,consent_cases_Not agree,consent_cases_nan,2020_mean_cases
0,2020,7,6,1,7,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [4]:
# Define the features set: every column except the target "2020_mean_cases".
# DataFrame.drop already returns a new frame and leaves df_cases untouched,
# so a separate .copy() beforehand would only duplicate the data a second time.
X = df_cases.drop(columns=["2020_mean_cases"])
X.head()

Unnamed: 0,Year_submitted,tot_cases,conf_cases,prob_cases,new_case,pnew_case,state_AL,state_AZ,state_CA,state_CO,...,state_SD,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_cases_Agree,consent_cases_Not agree,consent_cases_nan
0,2020,7,6,1,7,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
# Define the target set as a 1-D NumPy array of the "2020_mean_cases" column.
# Series.ravel() is deprecated (pandas 2.2+); to_numpy() is the supported way
# to obtain the same flat array from a Series.
y = df_cases["2020_mean_cases"].to_numpy()


In [6]:
# Partition features and labels into train and test subsets.
# No test_size is passed, so scikit-learn's default proportions apply;
# the fixed random_state keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [7]:
# Build the scaler and learn its statistics from the training partition only.
scaler = StandardScaler()
# fit() hands back the fitted estimator, so X_scaler refers to the fitted scaler.
X_scaler = scaler.fit(X_train)

# Apply the training-derived scaling to both partitions, train first, then test.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [8]:
# Instantiate the random forest classifier: 128 trees, seeded for repeatable results.
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=128,
)

In [9]:
# Train the forest on the scaled training features and their labels.
# The result is assigned back to rf_model, which also keeps the cell from echoing the estimator.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [10]:
# Predict class labels for the scaled, held-out test features.
predictions = rf_model.predict(X=X_test_scaled)

In [11]:
# Calculating the confusion matrix.
# labels=[0, 1] pins the matrix to the 2x2 layout that the hard-coded row and
# column names below assume. Without it, a test split in which only one class
# appears would produce a smaller matrix and the DataFrame constructor would
# raise on the shape mismatch.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Create a DataFrame from the confusion matrix: rows are the true classes,
# columns are the predicted classes.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1971,0
Actual 1,0,2610


In [12]:
# Score the predictions against the true test labels.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [13]:
# Show the evaluation summary: confusion matrix, accuracy, then the per-class report.
# NOTE(review): the recorded run reports an accuracy of 1.0 on the test split.
# A perfect score is unusual; confirm that "2020_mean_cases" is not derived
# from the case-count feature columns (possible target leakage).
print("Confusion Matrix")
display(cm_df)
print("Accuracy Score : {}".format(acc_score))
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1971,0
Actual 1,0,2610


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1971
           1       1.00      1.00      1.00      2610

    accuracy                           1.00      4581
   macro avg       1.00      1.00      1.00      4581
weighted avg       1.00      1.00      1.00      4581



In [14]:
# Retrieve the fitted forest's feature importance scores and echo them.
# The array holds one score per column of the training matrix, in column order
# (the next cell pairs these scores with X.columns).
importances = rf_model.feature_importances_
importances

array([1.04893579e-01, 4.79231873e-01, 2.36289455e-01, 7.13905897e-02,
       5.34091948e-02, 5.15583743e-02, 7.66645230e-05, 9.64509237e-05,
       5.17070785e-05, 5.88738843e-05, 8.11125832e-05, 9.11976139e-05,
       4.87970669e-05, 9.10511807e-05, 5.89970026e-05, 4.88937082e-05,
       6.47504311e-05, 8.37808731e-05, 2.54696259e-05, 4.58365103e-05,
       3.46945493e-05, 7.38540203e-05, 7.68470480e-05, 5.77578353e-05,
       5.87552912e-05, 7.18227394e-05, 6.01533777e-05, 1.03832429e-04,
       7.06113008e-05, 3.42062428e-05, 3.14124353e-05, 7.52939071e-05,
       5.15225103e-05, 9.82700253e-05, 5.71532884e-05, 9.03704297e-05,
       4.82211630e-05, 1.47113364e-04, 9.56497205e-05, 5.14883435e-05,
       6.54118847e-05, 8.68800966e-05, 6.85096813e-05, 6.60915874e-05,
       1.27046158e-04, 6.36481825e-05, 6.20799473e-05, 1.07927566e-04,
       1.01089627e-04, 8.67919361e-05, 7.88445799e-05])

In [15]:
# Rank the features: pair each importance score with its column name and list
# the pairs from the highest score to the lowest.
sorted(
    ((score, column) for score, column in zip(rf_model.feature_importances_, X.columns)),
    reverse=True,
)


[(0.4792318732910242, 'tot_cases'),
 (0.23628945515545272, 'conf_cases'),
 (0.10489357852140795, 'Year_submitted'),
 (0.07139058972806532, 'prob_cases'),
 (0.053409194763867136, 'new_case'),
 (0.05155837426690614, 'pnew_case'),
 (0.00014711336446174568, 'state_PA'),
 (0.0001270461579345981, 'state_VA'),
 (0.0001079275662316783, 'state_WY'),
 (0.00010383242883228383, 'state_MT'),
 (0.00010108962656808775, 'consent_cases_Agree'),
 (9.827002525628206e-05, 'state_NYC'),
 (9.645092366491074e-05, 'state_AZ'),
 (9.564972050781565e-05, 'state_PR'),
 (9.119761391628234e-05, 'state_DE'),
 (9.105118067065791e-05, 'state_GA'),
 (9.037042971618824e-05, 'state_OK'),
 (8.688009663511904e-05, 'state_SD'),
 (8.679193607040499e-05, 'consent_cases_Not agree'),
 (8.37808730631229e-05, 'state_KS'),
 (8.111258319202853e-05, 'state_CT'),
 (7.884457987320679e-05, 'consent_cases_nan'),
 (7.684704798748041e-05, 'state_ME'),
 (7.666452304962755e-05, 'state_AL'),
 (7.52939071416236e-05, 'state_NJ'),
 (7.385402032