In [16]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [17]:
# Read the prepared deaths dataset from its CSV file into a DataFrame,
# then preview the first five rows.
file_path = Path("Deaths_Cleaned/ML_deaths.csv")
df_deaths = pd.read_csv(filepath_or_buffer=file_path)
df_deaths.head(n=5)

Unnamed: 0,Year_submitted,tot_death,conf_death,prob_death,new_death,pnew_death,state_AL,state_AZ,state_CA,state_CO,...,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_deaths_Agree,consent_deaths_Not agree,consent_deaths_nan,2020_mean_deaths
0,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [18]:
# Feature matrix: every column of df_deaths except the target "2020_mean_deaths".
# drop() returns a new DataFrame, so df_deaths itself is left untouched.
# NOTE(review): the recorded run scores 1.0 accuracy with tot_death dominating the
# importances — the target may be derivable from these columns; confirm there is no leakage.
X = df_deaths.drop(columns="2020_mean_deaths")
X.head()

Unnamed: 0,Year_submitted,tot_death,conf_death,prob_death,new_death,pnew_death,state_AL,state_AZ,state_CA,state_CO,...,state_SD,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_deaths_Agree,consent_deaths_Not agree,consent_deaths_nan
0,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [20]:
# Define the target set.
# Series.ravel() is deprecated since pandas 2.2 (it emits a FutureWarning and is slated
# for removal); the column is already 1-D, so to_numpy() yields the same flat label array.
y = df_deaths["2020_mean_deaths"].to_numpy()


In [21]:
# Partition features and target into training and testing subsets.
# No test_size is passed, so train_test_split falls back to its default proportions;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [22]:
# Standardize the features. The scaler is fitted on the training partition only,
# so the test partition has no influence on the scaling parameters.
scaler = StandardScaler()
# X_scaler is whatever fit() returns; it performs both transforms below.
X_scaler = scaler.fit(X_train)

# Apply the training-derived scaling to both partitions (train first, then test).
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [23]:
# Build the (not yet fitted) random forest: 128 trees, with a fixed seed so
# repeated runs grow the same forest.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [24]:
# Train the forest on the scaled training features and their labels.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [25]:
# Predict a class label for every row of the scaled test features.
predictions = rf_model.predict(X=X_test_scaled)

In [26]:
# Calculating the confusion matrix.
# Pin the label order so the matrix is always 2x2, even if one class happens to be
# absent from y_test and predictions. Without this, confusion_matrix infers its shape
# from the data and the fixed 2x2 row/column labels below would raise a shape mismatch.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix; row and column captions are
# generated from the same label list so they cannot drift out of sync with it.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2338,0
Actual 1,0,2243


In [27]:
# Score the predictions against the held-out labels.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [28]:
# Report the evaluation in order: confusion-matrix table, accuracy score,
# then the per-class classification report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2338,0
Actual 1,0,2243


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2338
           1       1.00      1.00      1.00      2243

    accuracy                           1.00      4581
   macro avg       1.00      1.00      1.00      4581
weighted avg       1.00      1.00      1.00      4581



In [29]:
# Calculate feature importance in the Random Forest model.
# The values are read from the fitted rf_model; a later cell pairs them
# positionally with X.columns, so the order is the column order of X.
importances = rf_model.feature_importances_
# Bare expression so the notebook displays the raw array.
importances

array([3.37562571e-02, 4.77968319e-01, 2.91216266e-01, 1.10588125e-01,
       5.50049914e-02, 2.83080777e-02, 9.14304537e-05, 5.28018866e-05,
       5.43295004e-05, 6.89060933e-05, 4.80115449e-05, 6.47498769e-05,
       8.89208937e-05, 6.55441644e-05, 1.00235956e-04, 5.97194516e-05,
       1.03089930e-04, 4.46310081e-05, 6.37021471e-05, 5.45031156e-05,
       1.15148706e-04, 3.16084902e-05, 8.09498513e-05, 3.65405998e-05,
       5.11906946e-05, 5.15565943e-05, 7.82729456e-05, 4.44663720e-05,
       6.56263346e-05, 8.18938310e-05, 4.52252381e-05, 8.31026484e-05,
       3.25162898e-05, 6.07592797e-05, 1.27842599e-04, 8.95689701e-05,
       6.16208929e-05, 5.51275530e-05, 7.18862149e-05, 1.28350033e-04,
       7.67769358e-05, 6.84676933e-05, 7.52819580e-05, 5.97242798e-05,
       9.40120241e-05, 6.51922655e-05, 4.93802513e-05, 1.33148703e-04,
       6.45668674e-05, 8.47835744e-05, 3.27983935e-05])

In [30]:
# Pair each importance with its feature name and rank the pairs from most to
# least important (tuples compare by importance first, then by name).
feature_ranking = list(zip(rf_model.feature_importances_, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking


[(0.47796831931694783, 'tot_death'),
 (0.2912162658814584, 'conf_death'),
 (0.11058812544671452, 'prob_death'),
 (0.05500499138314066, 'new_death'),
 (0.033756257142697206, 'Year_submitted'),
 (0.02830807772129694, 'pnew_death'),
 (0.00013314870326468236, 'state_WY'),
 (0.0001283500329750323, 'state_RMI'),
 (0.00012784259932615115, 'state_OH'),
 (0.00011514870606111874, 'state_MA'),
 (0.00010308993020514574, 'state_IN'),
 (0.00010023595603815971, 'state_ID'),
 (9.401202405440993e-05, 'state_VA'),
 (9.143045371553511e-05, 'state_AL'),
 (8.956897013690444e-05, 'state_OK'),
 (8.892089370707087e-05, 'state_FSM'),
 (8.478357444132103e-05, 'consent_deaths_Not agree'),
 (8.310264836790435e-05, 'state_NJ'),
 (8.18938309511728e-05, 'state_ND'),
 (8.094985134868767e-05, 'state_ME'),
 (7.827294563234455e-05, 'state_MS'),
 (7.67769357574453e-05, 'state_SC'),
 (7.52819580426819e-05, 'state_TN'),
 (7.188621494766786e-05, 'state_PR'),
 (6.890609327911603e-05, 'state_CO'),
 (6.846769330694103e-05, 'st