## This Notebook:

1) links to the AWS database

2) cleans the AWS database

3) creates a local path for the output Cases_Cleaned/ML_cases.csv

4) creates a local path for the output Deaths_Cleaned/ML_deaths.csv

5) ML model for cases reads ML_cases.csv

6) ML model for cases saves output to user-defined location

5) ML model for deaths reads ML_deaths.csv

6) ML model for deaths saves output to user-defined location


## AWS db cleaner

FILE:  United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv

SOURCE:  AWS download from SQL database

**RELEVANT DATAFRAMES:  df, df_cases, df_deaths**

In [1]:
#Import dependencies

import pandas as pd

import re


### **Step 1:**  

Read AWS file into Pandas

In [2]:
# read the file

file_path = "https://initial-datasets.s3.amazonaws.com/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,1/22/20,AK,-,,,-,,-,,,-,,3/26/20 16:22,,
1,1/23/20,AK,-,,,-,,-,,,-,,3/26/20 16:22,,
2,1/24/20,AK,-,,,-,,-,,,-,,3/26/20 16:22,,
3,1/25/20,AK,-,,,-,,-,,,-,,3/26/20 16:22,,
4,1/26/20,AK,-,,,-,,-,,,-,,3/26/20 16:22,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38755,10/24/21,WY,100174,80401,19773,-,-,1149,1149,-,-,-,10/25/21 17:00,Agree,Agree
38756,10/25/21,WY,101083,81127,19956,909,183,1149,1149,-,-,-,10/26/21 13:58,Agree,Agree
38757,10/26/21,WY,101424,81337,20087,341,131,1174,1174,-,25,-,10/27/21 14:21,Agree,Agree
38758,10/27/21,WY,101912,81641,20271,488,184,1174,1174,-,-,-,10/28/21 14:08,Agree,Agree


In [3]:
df.columns

Index(['submission_date', 'state', 'tot_cases', 'conf_cases', 'prob_cases',
       'new_case', 'pnew_case', 'tot_death', 'conf_death', 'prob_death',
       'new_death', 'pnew_death', 'created_at', 'consent_cases',
       'consent_deaths'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38760 entries, 0 to 38759
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   submission_date  38760 non-null  object
 1   state            38760 non-null  object
 2   tot_cases        38760 non-null  object
 3   conf_cases       20510 non-null  object
 4   prob_cases       20438 non-null  object
 5   new_case         38760 non-null  object
 6   pnew_case        34756 non-null  object
 7   tot_death        38760 non-null  object
 8   conf_death       20379 non-null  object
 9   prob_death       20379 non-null  object
 10  new_death        38760 non-null  object
 11  pnew_death       34695 non-null  object
 12  created_at       38760 non-null  object
 13  consent_cases    32295 non-null  object
 14  consent_deaths   32946 non-null  object
dtypes: object(15)
memory usage: 4.4+ MB


### **Step 2:**  
    
Transform columns into integers

In [5]:
to_num_list = ["tot_cases", "conf_cases", "prob_cases", "new_case", "pnew_case", 
              "tot_death", "conf_death", "prob_death", "new_death", "pnew_death"]

for x in to_num_list:
    df[x] = df[x].str.replace("-", "0", regex = True)

df.info()






<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38760 entries, 0 to 38759
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   submission_date  38760 non-null  object
 1   state            38760 non-null  object
 2   tot_cases        38760 non-null  object
 3   conf_cases       20510 non-null  object
 4   prob_cases       20438 non-null  object
 5   new_case         38760 non-null  object
 6   pnew_case        34756 non-null  object
 7   tot_death        38760 non-null  object
 8   conf_death       20379 non-null  object
 9   prob_death       20379 non-null  object
 10  new_death        38760 non-null  object
 11  pnew_death       34695 non-null  object
 12  created_at       38760 non-null  object
 13  consent_cases    32295 non-null  object
 14  consent_deaths   32946 non-null  object
dtypes: object(15)
memory usage: 4.4+ MB


In [6]:

for x in to_num_list:
    df[x] = df[x].str.replace( '    ', "0", regex = True)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38760 entries, 0 to 38759
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   submission_date  38760 non-null  object
 1   state            38760 non-null  object
 2   tot_cases        38760 non-null  object
 3   conf_cases       20510 non-null  object
 4   prob_cases       20438 non-null  object
 5   new_case         38760 non-null  object
 6   pnew_case        34756 non-null  object
 7   tot_death        38760 non-null  object
 8   conf_death       20379 non-null  object
 9   prob_death       20379 non-null  object
 10  new_death        38760 non-null  object
 11  pnew_death       34695 non-null  object
 12  created_at       38760 non-null  object
 13  consent_cases    32295 non-null  object
 14  consent_deaths   32946 non-null  object
dtypes: object(15)
memory usage: 4.4+ MB


In [7]:
for x in to_num_list:
    df[x] = df[x].str.replace(",", "", regex = True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38760 entries, 0 to 38759
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   submission_date  38760 non-null  object
 1   state            38760 non-null  object
 2   tot_cases        38760 non-null  object
 3   conf_cases       20510 non-null  object
 4   prob_cases       20438 non-null  object
 5   new_case         38760 non-null  object
 6   pnew_case        34756 non-null  object
 7   tot_death        38760 non-null  object
 8   conf_death       20379 non-null  object
 9   prob_death       20379 non-null  object
 10  new_death        38760 non-null  object
 11  pnew_death       34695 non-null  object
 12  created_at       38760 non-null  object
 13  consent_cases    32295 non-null  object
 14  consent_deaths   32946 non-null  object
dtypes: object(15)
memory usage: 4.4+ MB


In [8]:
df

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,1/22/20,AK,0,,,0,,0,,,0,,3/26/20 16:22,,
1,1/23/20,AK,0,,,0,,0,,,0,,3/26/20 16:22,,
2,1/24/20,AK,0,,,0,,0,,,0,,3/26/20 16:22,,
3,1/25/20,AK,0,,,0,,0,,,0,,3/26/20 16:22,,
4,1/26/20,AK,0,,,0,,0,,,0,,3/26/20 16:22,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38755,10/24/21,WY,100174,80401,19773,0,0,1149,1149,0,0,0,10/25/21 17:00,Agree,Agree
38756,10/25/21,WY,101083,81127,19956,909,183,1149,1149,0,0,0,10/26/21 13:58,Agree,Agree
38757,10/26/21,WY,101424,81337,20087,341,131,1174,1174,0,25,0,10/27/21 14:21,Agree,Agree
38758,10/27/21,WY,101912,81641,20271,488,184,1174,1174,0,0,0,10/28/21 14:08,Agree,Agree


In [9]:

for x in to_num_list:
    df[x] = df[x].fillna(0)

df

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,1/22/20,AK,0,0,0,0,0,0,0,0,0,0,3/26/20 16:22,,
1,1/23/20,AK,0,0,0,0,0,0,0,0,0,0,3/26/20 16:22,,
2,1/24/20,AK,0,0,0,0,0,0,0,0,0,0,3/26/20 16:22,,
3,1/25/20,AK,0,0,0,0,0,0,0,0,0,0,3/26/20 16:22,,
4,1/26/20,AK,0,0,0,0,0,0,0,0,0,0,3/26/20 16:22,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38755,10/24/21,WY,100174,80401,19773,0,0,1149,1149,0,0,0,10/25/21 17:00,Agree,Agree
38756,10/25/21,WY,101083,81127,19956,909,183,1149,1149,0,0,0,10/26/21 13:58,Agree,Agree
38757,10/26/21,WY,101424,81337,20087,341,131,1174,1174,0,25,0,10/27/21 14:21,Agree,Agree
38758,10/27/21,WY,101912,81641,20271,488,184,1174,1174,0,0,0,10/28/21 14:08,Agree,Agree


In [10]:

for x in to_num_list:
    df.drop(df[df[x] ==' (400)'].index)


In [11]:
for x in to_num_list:
    df[x] = pd.to_numeric(df[x], errors="coerce")

In [12]:

for x in to_num_list:
    df[x] = df[x].fillna(0)

df

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,1/22/20,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
1,1/23/20,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
2,1/24/20,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
3,1/25/20,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
4,1/26/20,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38755,10/24/21,WY,100174,80401,19773,0.0,0.0,1149,1149,0,0.0,0.0,10/25/21 17:00,Agree,Agree
38756,10/25/21,WY,101083,81127,19956,909.0,183.0,1149,1149,0,0.0,0.0,10/26/21 13:58,Agree,Agree
38757,10/26/21,WY,101424,81337,20087,341.0,131.0,1174,1174,0,25.0,0.0,10/27/21 14:21,Agree,Agree
38758,10/27/21,WY,101912,81641,20271,488.0,184.0,1174,1174,0,0.0,0.0,10/28/21 14:08,Agree,Agree


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38760 entries, 0 to 38759
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   submission_date  38760 non-null  object 
 1   state            38760 non-null  object 
 2   tot_cases        38760 non-null  int64  
 3   conf_cases       38760 non-null  int64  
 4   prob_cases       38760 non-null  int64  
 5   new_case         38760 non-null  float64
 6   pnew_case        38760 non-null  float64
 7   tot_death        38760 non-null  int64  
 8   conf_death       38760 non-null  int64  
 9   prob_death       38760 non-null  int64  
 10  new_death        38760 non-null  float64
 11  pnew_death       38760 non-null  float64
 12  created_at       38760 non-null  object 
 13  consent_cases    32295 non-null  object 
 14  consent_deaths   32946 non-null  object 
dtypes: float64(4), int64(6), object(5)
memory usage: 4.4+ MB


In [14]:

df["state"] = df["state"].astype(str)
df["state"].dtypes

dtype('O')

In [15]:
# Change column "submission" dtype to datetime format.

df["submission_date"] = pd.to_datetime(df["submission_date"])
df.head()

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,2020-01-22,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
1,2020-01-23,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
2,2020-01-24,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
3,2020-01-25,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
4,2020-01-26,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,


In [16]:
# Place column "submission" in increasing order AND sort by "state" also.

df = df.sort_values(by = ["submission_date", "state"])
df.head(200)

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,2020-01-22,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
646,2020-01-22,AL,7,6,1,7.0,1.0,0,0,0,0.0,0.0,1/24/20 0:00,Agree,Agree
1292,2020-01-22,AR,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,Not agree,Not agree
1938,2020-01-22,AS,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,,
2584,2020-01-22,AZ,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,Agree,Agree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9693,2020-01-25,IA,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,Not agree,Not agree
10339,2020-01-25,ID,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,Agree,Agree
10985,2020-01-25,IL,1,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,Agree,Agree
11631,2020-01-25,IN,0,0,0,0.0,0.0,0,0,0,0.0,0.0,3/26/20 16:22,Not agree,Agree


In [17]:
# Delete column "created_at".

df.drop(columns=["created_at"], inplace = True)
df.head()

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,consent_cases,consent_deaths
0,2020-01-22,AK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,,
646,2020-01-22,AL,7,6,1,7.0,1.0,0,0,0,0.0,0.0,Agree,Agree
1292,2020-01-22,AR,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Not agree,Not agree
1938,2020-01-22,AS,0,0,0,0.0,0.0,0,0,0,0.0,0.0,,
2584,2020-01-22,AZ,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Agree,Agree


In [18]:
# Keep row if value of either "consent_cases" or "consent_deaths" is Agree or Not Agree.
df = df.loc[(df["consent_cases"]=="Agree")|(df["consent_deaths"]=="Agree") ]
df.head(200)

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,consent_cases,consent_deaths
646,2020-01-22,AL,7,6,1,7.0,1.0,0,0,0,0.0,0.0,Agree,Agree
2584,2020-01-22,AZ,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Agree,Agree
3230,2020-01-22,CA,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Agree,Not agree
3876,2020-01-22,CO,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Agree,Agree
4522,2020-01-22,CT,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Agree,Agree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25198,2020-01-26,NYC,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Agree,Agree
25844,2020-01-26,OH,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Agree,Agree
26490,2020-01-26,OK,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Not agree,Agree
27136,2020-01-26,OR,0,0,0,0.0,0.0,0,0,0,0.0,0.0,Agree,Agree


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26575 entries, 646 to 38759
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   submission_date  26575 non-null  datetime64[ns]
 1   state            26575 non-null  object        
 2   tot_cases        26575 non-null  int64         
 3   conf_cases       26575 non-null  int64         
 4   prob_cases       26575 non-null  int64         
 5   new_case         26575 non-null  float64       
 6   pnew_case        26575 non-null  float64       
 7   tot_death        26575 non-null  int64         
 8   conf_death       26575 non-null  int64         
 9   prob_death       26575 non-null  int64         
 10  new_death        26575 non-null  float64       
 11  pnew_death       26575 non-null  float64       
 12  consent_cases    24637 non-null  object        
 13  consent_deaths   25288 non-null  object        
dtypes: datetime64[ns](1), float64(4), in

In [20]:
# For columns "tot_cases", "conf_cases", "prob_cases", "new_case", "pnew_case", "tot_death", "conf_death",
# "prob_death", "new_death", "pnew_death", change dtype to integer.


col_headers = ["tot_cases", "conf_cases", "prob_cases", "new_case", "pnew_case", "tot_death", "conf_death",
"prob_death", "new_death", "pnew_death"]

for col in col_headers:
    df[col]=df[col].astype("int64")

df.head(200)

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,consent_cases,consent_deaths
646,2020-01-22,AL,7,6,1,7,1,0,0,0,0,0,Agree,Agree
2584,2020-01-22,AZ,0,0,0,0,0,0,0,0,0,0,Agree,Agree
3230,2020-01-22,CA,0,0,0,0,0,0,0,0,0,0,Agree,Not agree
3876,2020-01-22,CO,0,0,0,0,0,0,0,0,0,0,Agree,Agree
4522,2020-01-22,CT,0,0,0,0,0,0,0,0,0,0,Agree,Agree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25198,2020-01-26,NYC,0,0,0,0,0,0,0,0,0,0,Agree,Agree
25844,2020-01-26,OH,0,0,0,0,0,0,0,0,0,0,Agree,Agree
26490,2020-01-26,OK,0,0,0,0,0,0,0,0,0,0,Not agree,Agree
27136,2020-01-26,OR,0,0,0,0,0,0,0,0,0,0,Agree,Agree


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26575 entries, 646 to 38759
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   submission_date  26575 non-null  datetime64[ns]
 1   state            26575 non-null  object        
 2   tot_cases        26575 non-null  int64         
 3   conf_cases       26575 non-null  int64         
 4   prob_cases       26575 non-null  int64         
 5   new_case         26575 non-null  int64         
 6   pnew_case        26575 non-null  int64         
 7   tot_death        26575 non-null  int64         
 8   conf_death       26575 non-null  int64         
 9   prob_death       26575 non-null  int64         
 10  new_death        26575 non-null  int64         
 11  pnew_death       26575 non-null  int64         
 12  consent_cases    24637 non-null  object        
 13  consent_deaths   25288 non-null  object        
dtypes: datetime64[ns](1), int64(10), obj

### **Step 3:**  

Transform datetime to year only

In [22]:
#  Assume that only the year is needed.  Extract the year and place it in a column "Year_submitted."  

df["Year_submitted"] = df["submission_date"].dt.year
df.head()


Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,consent_cases,consent_deaths,Year_submitted
646,2020-01-22,AL,7,6,1,7,1,0,0,0,0,0,Agree,Agree,2020
2584,2020-01-22,AZ,0,0,0,0,0,0,0,0,0,0,Agree,Agree,2020
3230,2020-01-22,CA,0,0,0,0,0,0,0,0,0,0,Agree,Not agree,2020
3876,2020-01-22,CO,0,0,0,0,0,0,0,0,0,0,Agree,Agree,2020
4522,2020-01-22,CT,0,0,0,0,0,0,0,0,0,0,Agree,Agree,2020


In [23]:
df.columns

Index(['submission_date', 'state', 'tot_cases', 'conf_cases', 'prob_cases',
       'new_case', 'pnew_case', 'tot_death', 'conf_death', 'prob_death',
       'new_death', 'pnew_death', 'consent_cases', 'consent_deaths',
       'Year_submitted'],
      dtype='object')

In [24]:
# Delete the column "submission_date."

df.drop(columns = ["submission_date"], inplace = True)

In [25]:
# reorder columns

df_columns_new = [
'Year_submitted',
'state',
'tot_cases', 
'conf_cases', 
'prob_cases',
'new_case', 
'pnew_case', 
'tot_death', 
'conf_death', 
'prob_death',
'new_death', 
'pnew_death', 
'consent_cases',
'consent_deaths'
 ]



In [26]:
len(df_columns_new)

14

In [27]:
df = df.reindex(columns = df_columns_new)
df.head()

Unnamed: 0,Year_submitted,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,consent_cases,consent_deaths
646,2020,AL,7,6,1,7,1,0,0,0,0,0,Agree,Agree
2584,2020,AZ,0,0,0,0,0,0,0,0,0,0,Agree,Agree
3230,2020,CA,0,0,0,0,0,0,0,0,0,0,Agree,Not agree
3876,2020,CO,0,0,0,0,0,0,0,0,0,0,Agree,Agree
4522,2020,CT,0,0,0,0,0,0,0,0,0,0,Agree,Agree


In [28]:
df.shape

(26575, 14)

### **Step 4:** 

Calculate means for number of cases and number of deaths and make the target columns

In [29]:
# Add the target columns to df.

df["2020_mean_cases"] = 0
df["2020_mean_deaths"] = 0


In [30]:
# Perform a describe() on column "tot_cases" over the year 2020 only for all states.  The results apply to 
# the population of states

df_cases_2020 = df.loc[df["Year_submitted"]==2020]
mean_cases = df_cases_2020["tot_cases"].mean()
mean_cases



90135.79939381123

In [31]:
# Perform a describe() on column "tot_death" over the year 2020 only for all states.  The results apply to the
#population of states.

df_deaths_2020 = df.loc[df["Year_submitted"]==2020]
mean_deaths = df_deaths_2020["tot_death"].mean()
mean_deaths



2634.1879890040177

In [32]:
# Populate "2020_mean_cases" with 1 or 0 
# Populate "2020_mean_deaths" with 1 or 0 

# cases

for index, row in df.iterrows():
    x = row["tot_cases"]
    if x >= int(mean_cases):
        df.loc[index, "2020_mean_cases"]=1
    else:
        df.loc[index, "2020_mean_cases"]=0

print(df["2020_mean_cases"].value_counts())

# deaths

for index, row in df.iterrows():
    x = row["tot_death"]
    if x >= int(mean_deaths):
        df.loc[index, "2020_mean_deaths"]=1
    else:
        df.loc[index, "2020_mean_deaths"]=0

print(df["2020_mean_deaths"].value_counts())


1    15156
0    11419
Name: 2020_mean_cases, dtype: int64
0    13647
1    12928
Name: 2020_mean_deaths, dtype: int64


In [33]:
df.head()

Unnamed: 0,Year_submitted,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,consent_cases,consent_deaths,2020_mean_cases,2020_mean_deaths
646,2020,AL,7,6,1,7,1,0,0,0,0,0,Agree,Agree,0,0
2584,2020,AZ,0,0,0,0,0,0,0,0,0,0,Agree,Agree,0,0
3230,2020,CA,0,0,0,0,0,0,0,0,0,0,Agree,Not agree,0,0
3876,2020,CO,0,0,0,0,0,0,0,0,0,0,Agree,Agree,0,0
4522,2020,CT,0,0,0,0,0,0,0,0,0,0,Agree,Agree,0,0


### **Step 5:** 

Perform OneHotEncoding on object columns

In [34]:
# import dependencies

from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26575 entries, 646 to 38759
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Year_submitted    26575 non-null  int64 
 1   state             26575 non-null  object
 2   tot_cases         26575 non-null  int64 
 3   conf_cases        26575 non-null  int64 
 4   prob_cases        26575 non-null  int64 
 5   new_case          26575 non-null  int64 
 6   pnew_case         26575 non-null  int64 
 7   tot_death         26575 non-null  int64 
 8   conf_death        26575 non-null  int64 
 9   prob_death        26575 non-null  int64 
 10  new_death         26575 non-null  int64 
 11  pnew_death        26575 non-null  int64 
 12  consent_cases     24637 non-null  object
 13  consent_deaths    25288 non-null  object
 14  2020_mean_cases   26575 non-null  int64 
 15  2020_mean_deaths  26575 non-null  int64 
dtypes: int64(13), object(3)
memory usage: 4.5+ MB


In [36]:
obj_list = df.dtypes[df.dtypes == "object"].index.to_list()
obj_list

['state', 'consent_cases', 'consent_deaths']

In [37]:
# Apply OneHotEncoder to THREE columns:  "consent_cases", "consent_deaths", and "state."

enc = OneHotEncoder(sparse = False)
encoded_df = pd.DataFrame(enc.fit_transform(df[obj_list]))
encoded_df.columns = enc.get_feature_names(obj_list)
encoded_df.head(200)

Unnamed: 0,state_AL,state_AZ,state_CA,state_CO,state_CT,state_DE,state_FSM,state_GA,state_ID,state_IL,...,state_VA,state_WI,state_WV,state_WY,consent_cases_Agree,consent_cases_Not agree,consent_cases_nan,consent_deaths_Agree,consent_deaths_Not agree,consent_deaths_nan
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [38]:
df = df.merge(encoded_df, left_index = True, right_index = True)
df = df.drop(obj_list, 1)
df.head(200)

Unnamed: 0,Year_submitted,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,...,state_VA,state_WI,state_WV,state_WY,consent_cases_Agree,consent_cases_Not agree,consent_cases_nan,consent_deaths_Agree,consent_deaths_Not agree,consent_deaths_nan
646,2020,7,6,1,7,1,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2584,2020,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3230,2020,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3876,2020,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4522,2020,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14864,2020,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
15510,2020,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
16156,2020,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
16802,2020,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [39]:
df.columns

Index(['Year_submitted', 'tot_cases', 'conf_cases', 'prob_cases', 'new_case',
       'pnew_case', 'tot_death', 'conf_death', 'prob_death', 'new_death',
       'pnew_death', '2020_mean_cases', '2020_mean_deaths', 'state_AL',
       'state_AZ', 'state_CA', 'state_CO', 'state_CT', 'state_DE', 'state_FSM',
       'state_GA', 'state_ID', 'state_IL', 'state_IN', 'state_KS', 'state_KY',
       'state_LA', 'state_MA', 'state_MD', 'state_ME', 'state_MI', 'state_MN',
       'state_MP', 'state_MS', 'state_MT', 'state_NC', 'state_ND', 'state_NE',
       'state_NJ', 'state_NV', 'state_NYC', 'state_OH', 'state_OK', 'state_OR',
       'state_PA', 'state_PR', 'state_RMI', 'state_SC', 'state_SD', 'state_TN',
       'state_UT', 'state_VA', 'state_WI', 'state_WV', 'state_WY',
       'consent_cases_Agree', 'consent_cases_Not agree', 'consent_cases_nan',
       'consent_deaths_Agree', 'consent_deaths_Not agree',
       'consent_deaths_nan'],
      dtype='object')

### **Step 6:** 

Make dataframes for cases and deaths

In [40]:
#. Make a new dataframe for cases only.

columns_cases = [
'Year_submitted',
'tot_cases',
 'conf_cases',
 'prob_cases',
 'new_case',
 'pnew_case',
 'state_AL',
 'state_AZ',
 'state_CA',
 'state_CO',
 'state_CT',
 'state_DE',
 'state_FSM',
 'state_GA',
 'state_ID',
 'state_IL',
 'state_IN',
 'state_KS',
 'state_KY',
 'state_LA',
 'state_MA',
 'state_MD',
 'state_ME',
 'state_MI',
 'state_MN',
 'state_MP',
 'state_MS',
 'state_MT',
 'state_NC',
 'state_ND',
 'state_NE',
 'state_NJ',
 'state_NV',
 'state_NYC',
 'state_OH',
 'state_OK',
 'state_OR',
 'state_PA',
 'state_PR',
 'state_RMI',
 'state_SC',
 'state_SD',
 'state_TN',
 'state_UT',
 'state_VA',
 'state_WI',
 'state_WV',
 'state_WY',
 'consent_cases_Agree',
 'consent_cases_Not agree',
 'consent_cases_nan',
 '2020_mean_cases'
 ]

df_cases = df.copy()
df_cases.drop(columns = ['tot_death','conf_death','prob_death','new_death','pnew_death','consent_deaths_Agree',
 'consent_deaths_Not agree','consent_deaths_nan'], inplace = True)

df_cases = df_cases[columns_cases]
df_cases.head(200)

Unnamed: 0,Year_submitted,tot_cases,conf_cases,prob_cases,new_case,pnew_case,state_AL,state_AZ,state_CA,state_CO,...,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_cases_Agree,consent_cases_Not agree,consent_cases_nan,2020_mean_cases
646,2020,7,6,1,7,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2584,2020,0,0,0,0,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3230,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3876,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4522,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14864,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
15510,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
16156,2020,0,0,0,0,0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
16802,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [41]:
df_cases.shape

(18266, 52)

In [42]:
# Make a dataframe for deaths only

columns_deaths = [
'Year_submitted',
 'tot_death',
 'conf_death',
 'prob_death',
 'new_death',
 'pnew_death',
 'state_AL',
 'state_AZ',
 'state_CA',
 'state_CO',
 'state_CT',
 'state_DE',
 'state_FSM',
 'state_GA',
 'state_ID',
 'state_IL',
 'state_IN',
 'state_KS',
 'state_KY',
 'state_LA',
 'state_MA',
 'state_MD',
 'state_ME',
 'state_MI',
 'state_MN',
 'state_MP',
 'state_MS',
 'state_MT',
 'state_NC',
 'state_ND',
 'state_NE',
 'state_NJ',
 'state_NV',
 'state_NYC',
 'state_OH',
 'state_OK',
 'state_OR',
 'state_PA',
 'state_PR',
 'state_RMI',
 'state_SC',
 'state_SD',
 'state_TN',
 'state_UT',
 'state_VA',
 'state_WI',
 'state_WV',
 'state_WY',
 'consent_deaths_Agree',
 'consent_deaths_Not agree',
 'consent_deaths_nan',
 '2020_mean_deaths'
 ]

df_deaths = df.copy()
df_deaths.drop(columns = ['tot_cases','conf_cases','prob_cases','new_case','pnew_case','consent_cases_Agree',
'consent_cases_Not agree','consent_cases_nan'], inplace = True)

df_deaths = df_deaths[columns_deaths]
df_deaths.head()

Unnamed: 0,Year_submitted,tot_death,conf_death,prob_death,new_death,pnew_death,state_AL,state_AZ,state_CA,state_CO,...,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_deaths_Agree,consent_deaths_Not agree,consent_deaths_nan,2020_mean_deaths
646,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2584,2020,0,0,0,0,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3230,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3876,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4522,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


### **Step 7:** 

Save dataframes as csv files for folders Cases_Cleaned and Deaths_Cleaned

In [43]:
import os

In [44]:
# Save df_cases as csv file.

os.makedirs("Cases_Cleaned/",exist_ok=True)
df_cases.to_csv('Cases_Cleaned/ML_cases.csv', index = False)



In [45]:
# Save df_deaths as csv file.

os.makedirs("Deaths_Cleaned/",exist_ok=True)
df_deaths.to_csv('Deaths_Cleaned/ML_deaths.csv', index = False)



## MACHINE LEARNING

### FIRST MODEL

TITLE: cases

MODEL: RandomForest

FILE:  Cases_Cleaned/ML_cases.csv


In [46]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [47]:
# Loading data
file_path = Path("Cases_Cleaned/ML_cases.csv")
df_cases = pd.read_csv(file_path)
df_cases.head()

Unnamed: 0,Year_submitted,tot_cases,conf_cases,prob_cases,new_case,pnew_case,state_AL,state_AZ,state_CA,state_CO,...,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_cases_Agree,consent_cases_Not agree,consent_cases_nan,2020_mean_cases
0,2020,7,6,1,7,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,2020,0,0,0,0,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [48]:
# Define the features set.
X = df_cases.copy()
X = X.drop("2020_mean_cases", axis=1)
X.head()

Unnamed: 0,Year_submitted,tot_cases,conf_cases,prob_cases,new_case,pnew_case,state_AL,state_AZ,state_CA,state_CO,...,state_SD,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_cases_Agree,consent_cases_Not agree,consent_cases_nan
0,2020,7,6,1,7,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2020,0,0,0,0,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [49]:
# Define the target set.
y = df_cases["2020_mean_cases"].ravel()


In [50]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [51]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [52]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [53]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [54]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [55]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1844,0
Actual 1,0,2723


In [56]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [57]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1844,0
Actual 1,0,2723


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1844
           1       1.00      1.00      1.00      2723

    accuracy                           1.00      4567
   macro avg       1.00      1.00      1.00      4567
weighted avg       1.00      1.00      1.00      4567



In [58]:
# sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.5100915855976518, 'tot_cases'),
 (0.2004442034885411, 'conf_cases'),
 (0.0822030120806861, 'prob_cases'),
 (0.07969413376399635, 'Year_submitted'),
 (0.06376571197443659, 'new_case'),
 (0.061535256289518184, 'pnew_case'),
 (7.955981761725693e-05, 'consent_cases_Agree'),
 (7.223913684697964e-05, 'state_WY'),
 (7.161585491242348e-05, 'state_OK'),
 (7.123794150018815e-05, 'consent_cases_Not agree'),
 (6.966006595683275e-05, 'state_NYC'),
 (6.791359907523095e-05, 'state_MA'),
 (6.188741997501986e-05, 'state_GA'),
 (6.181808146716893e-05, 'state_OH'),
 (5.887149581939785e-05, 'state_MT'),
 (5.6929364655399175e-05, 'state_LA'),
 (5.6906076831460235e-05, 'state_KS'),
 (5.6496752475377034e-05, 'state_KY'),
 (5.643063093553585e-05, 'state_NC'),
 (5.588633723345982e-05, 'state_ND'),
 (5.497924031433463e-05, 'state_PR'),
 (5.3327685182957545e-05, 'state_VA'),
 (5.3003362743701254e-05, 'state_DE'),
 (5.180022878073517e-05, 'state_OR'),
 (5.1508150231214034e-05, 'state_RMI'),
 (5.08936889669023

In [None]:
# create a datframe with desired information and output as a csv file
# To Be Done Later

## MACHINE LEARNING

### FIRST MODEL

TITLE: deaths

MODEL: RandomForest

FILE:  Cases_Cleaned/ML_deaths.csv

In [59]:
# Loading data
file_path = Path("Deaths_Cleaned/ML_deaths.csv")
df_deaths = pd.read_csv(file_path)
df_deaths.head()

Unnamed: 0,Year_submitted,tot_death,conf_death,prob_death,new_death,pnew_death,state_AL,state_AZ,state_CA,state_CO,...,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_deaths_Agree,consent_deaths_Not agree,consent_deaths_nan,2020_mean_deaths
0,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,2020,0,0,0,0,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [60]:
# Define the features set.
X = df_deaths.copy()
X = X.drop("2020_mean_deaths", axis=1)
X.head()

Unnamed: 0,Year_submitted,tot_death,conf_death,prob_death,new_death,pnew_death,state_AL,state_AZ,state_CA,state_CO,...,state_SD,state_TN,state_UT,state_VA,state_WI,state_WV,state_WY,consent_deaths_Agree,consent_deaths_Not agree,consent_deaths_nan
0,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2020,0,0,0,0,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2020,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [61]:
# Define the target set.
y = df_deaths["2020_mean_deaths"].ravel()


In [62]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [63]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [64]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [65]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [66]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [67]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2111,0
Actual 1,0,2456


In [68]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [69]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2111,0
Actual 1,0,2456


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2111
           1       1.00      1.00      1.00      2456

    accuracy                           1.00      4567
   macro avg       1.00      1.00      1.00      4567
weighted avg       1.00      1.00      1.00      4567



In [70]:
# sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.4664729510385972, 'tot_death'),
 (0.30623891787133856, 'conf_death'),
 (0.1069268440492987, 'prob_death'),
 (0.05687303374258207, 'new_death'),
 (0.038663480034305295, 'Year_submitted'),
 (0.02245576207544727, 'pnew_death'),
 (0.00012278061047137504, 'state_NE'),
 (7.70756045577898e-05, 'state_RMI'),
 (7.296378049457676e-05, 'state_TN'),
 (6.800300191664122e-05, 'state_CA'),
 (6.725536633501493e-05, 'state_WY'),
 (6.717326030209969e-05, 'state_WV'),
 (6.48472556690074e-05, 'state_VA'),
 (6.439751331815952e-05, 'state_DE'),
 (6.249868519135787e-05, 'consent_deaths_Agree'),
 (5.873099696045994e-05, 'state_IN'),
 (5.797386368937827e-05, 'state_NYC'),
 (5.7260364463365545e-05, 'state_WI'),
 (5.7130032332695044e-05, 'state_MI'),
 (5.7038270764128314e-05, 'state_OK'),
 (5.575582023818411e-05, 'state_ME'),
 (5.4801725375901596e-05, 'state_IL'),
 (5.446443832823349e-05, 'state_CT'),
 (5.441464860762554e-05, 'state_MS'),
 (5.408396413230889e-05, 'state_KS'),
 (5.2320665674166594e-05, 'state

In [None]:
# create a datframe with desired information and output as a csv file
# To Be Done Later