In [1]:
#Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from scipy.stats import norm
import pickle

In [2]:
# Read the COVID-19 deaths CSV into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded output of this cell ends with the
# tail of a warning that is presumably pandas' mixed-type DtypeWarning, which
# this option avoids.
# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is
# needed; `data` is kept as a second name for the same frame.
data = pd.read_csv("covid19.csv", low_memory=False)
df = data
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Sex,Age Group,COVID-19 Deaths,Total Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,Footnote
0,11/24/2021,01-01-2020,11/20/2021,By Total,,,United States,All Sexes,All Ages,773812,6294875,695560,399005,9451,
1,11/24/2021,01-01-2020,11/20/2021,By Total,,,United States,All Sexes,Under 1 year,151,35550,391,19,23,
2,11/24/2021,01-01-2020,11/20/2021,By Total,,,United States,All Sexes,0-17 years,621,62999,1132,170,189,
3,11/24/2021,01-01-2020,11/20/2021,By Total,,,United States,All Sexes,1-4 years,68,6606,224,19,65,
4,11/24/2021,01-01-2020,11/20/2021,By Total,,,United States,All Sexes,5-14 years,199,10500,330,65,80,


In [3]:
# Keep only the candidate feature columns plus the target ("COVID-19 Deaths")
df = df.loc[:, ["Group", "Year", "Month", "State", "Sex", "Age Group", "COVID-19 Deaths"]]

In [4]:
# Shorten the two verbose column names in a single rename
df = df.rename(columns={"COVID-19 Deaths": "Deaths", "Age Group": "Age"})

In [5]:
# Discard rows whose Deaths value is missing, then summarise what is left
# (the remaining null counts per column show up in the info() report)
df = df.dropna(subset=["Deaths"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53601 entries, 0 to 71597
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Group   53601 non-null  object 
 1   Year    51345 non-null  float64
 2   Month   46949 non-null  float64
 3   State   53601 non-null  object 
 4   Sex     53601 non-null  object 
 5   Age     53601 non-null  object 
 6   Deaths  53601 non-null  object 
dtypes: float64(2), object(5)
memory usage: 3.3+ MB


In [6]:
# Drop every row that still has a null in any column, then verify that
# no column reports missing values
df = df.dropna(axis="index", how="any")
df.isna().sum()

Group     0
Year      0
Month     0
State     0
Sex       0
Age       0
Deaths    0
dtype: int64

In [7]:
# Keep only the rows aggregated "By Month"; once filtered, the Group column
# is constant, so it is removed as well
df = df.loc[df["Group"] == "By Month"].drop(columns="Group")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46949 entries, 8262 to 71597
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    46949 non-null  float64
 1   Month   46949 non-null  float64
 2   State   46949 non-null  object 
 3   Sex     46949 non-null  object 
 4   Age     46949 non-null  object 
 5   Deaths  46949 non-null  object 
dtypes: float64(2), object(4)
memory usage: 2.5+ MB


In [8]:
# Inspect the distinct Sex categories to spot aggregate rows (e.g. a combined total)
df["Sex"].unique()

array(['All Sexes', 'Male', 'Female'], dtype=object)

In [9]:
# Remove the "All Sexes" aggregate rows by index label
df = df.drop(index=df.index[df["Sex"] == "All Sexes"])

In [10]:
# Re-check Sex after the filter: the aggregate category should be gone
df["Sex"].unique()

array(['Male', 'Female'], dtype=object)

In [11]:
# Inspect the distinct Age groups (includes an "All Ages" aggregate and overlapping ranges)
df["Age"].unique()

array(['All Ages', 'Under 1 year', '0-17 years', '1-4 years',
       '5-14 years', '15-24 years', '18-29 years', '25-34 years',
       '30-39 years', '35-44 years', '40-49 years', '45-54 years',
       '50-64 years', '55-64 years', '65-74 years', '75-84 years',
       '85 years and over'], dtype=object)

In [12]:
# Remove the "All Ages" aggregate rows by index label
df = df.drop(index=df.index[df["Age"] == "All Ages"])

In [13]:
# Confirm which columns remain after cleaning
df.columns

Index(['Year', 'Month', 'State', 'Sex', 'Age', 'Deaths'], dtype='object')

In [14]:
# Lookup table from the dataset's age-group labels to a coarse numeric code.
# Overlapping groups intentionally share a code (e.g. "0-17", "1-4" and
# "5-14 years" all map to 10).
_AGE_GROUP_CODES = {
    "Under 1 year": 1,
    "0-17 years": 10,
    "1-4 years": 10,
    "5-14 years": 10,
    "15-24 years": 20,
    "18-29 years": 20,
    "25-34 years": 30,
    "30-39 years": 30,
    "35-44 years": 40,
    "40-49 years": 40,
    "45-54 years": 50,
    "50-64 years": 50,
    "55-64 years": 60,
    "65-74 years": 70,
    "75-84 years": 80,
    "85 years and over": 90,
}


def clean_experience(x):
    """Convert an age-group label into its numeric code.

    Known labels return the integer code from the lookup table. Any other
    value is passed to ``float()``, so numeric input comes back as a float
    and an unrecognised label raises ``ValueError``.
    """
    if x in _AGE_GROUP_CODES:
        return _AGE_GROUP_CODES[x]
    return float(x)

# Replace each Age label with its numeric code, element by element
df["Age"] = df["Age"].map(clean_experience)

In [15]:
# Show the distinct numeric Age codes produced by the mapping
df["Age"].unique()

array([ 1, 10, 20, 30, 40, 50, 60, 70, 80, 90], dtype=int64)

In [16]:
# Remove the nationwide "United States" aggregate rows by index label
df = df.drop(index=df.index[df["State"] == "United States"])

In [17]:
# List the remaining State values before they are grouped into regions
df["State"].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'New York City',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico'],
      dtype=object)

In [18]:
# Group the State values into broader regions (exploratory feature).
# The grouping is declared once as region -> member states and inverted into a
# state -> region lookup, so the column is rewritten in a single replace()
# call instead of one call per state. Values not listed here are left as-is.
REGION_STATES = {
    "New England": [
        "Maine", "Rhode Island", "Vermont", "Connecticut", "New Hampshire",
        "Massachusetts",
    ],
    "Mid Atlantic": [
        "New York", "New York City", "New Jersey", "Pennsylvania",
        "District of Columbia",
    ],
    "Southern": [
        "Virginia", "West Virginia", "Kentucky", "Delaware", "Maryland",
        "North Carolina", "South Carolina", "Tennessee", "Arkansas",
        "Louisiana", "Florida", "Georgia", "Alabama", "Mississippi",
    ],
    "Mid West": [
        "Michigan", "North Dakota", "South Dakota", "Iowa", "Minnesota",
        "Kansas", "Nebraska", "Ohio", "Indiana", "Illinois", "Wisconsin",
        "Missouri",
    ],
    "South West": ["Texas", "Arizona", "New Mexico", "Oklahoma"],
    "Rocky Mountains": [
        "Montana", "Idaho", "Colorado", "Utah", "Wyoming", "Nevada",
    ],
    "Pacific Coastal": ["California", "Oregon", "Washington"],
    "North West": ["Alaska"],
    "Oceanic": ["Hawaii"],
    "Caribbean": ["Puerto Rico"],
}

# Invert to state -> region; each state appears under exactly one region.
STATE_TO_REGION = {
    state: region
    for region, states in REGION_STATES.items()
    for state in states
}

df["State"] = df["State"].replace(STATE_TO_REGION)



In [19]:
# Verify the grouping: only region names should remain in the State column
df["State"].unique()

array(['Southern', 'North West', 'South West', 'Pacific Coastal',
       'Rocky Mountains', 'New England', 'Mid Atlantic', 'Oceanic',
       'Mid West', 'Caribbean'], dtype=object)

In [20]:
# The column now holds regions rather than states, so name it accordingly
df = df.rename(columns={"State": "Region"})

In [21]:
# Check dtypes: object columns still need encoding/conversion before modelling
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28005 entries, 9453 to 71597
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    28005 non-null  float64
 1   Month   28005 non-null  float64
 2   Region  28005 non-null  object 
 3   Sex     28005 non-null  object 
 4   Age     28005 non-null  int64  
 5   Deaths  28005 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 1.5+ MB


In [22]:
# Fit a label encoder on the region names, then swap the column for its
# integer codes. The fitted encoder is kept so new inputs can be encoded
# the same way later.
le_region = LabelEncoder().fit(df["Region"])
df["Region"] = le_region.transform(df["Region"])
df["Region"].unique()

array([9, 4, 8, 6, 7, 3, 1, 5, 2, 0])

In [23]:
# Fit a label encoder on the Sex labels, then swap the column for its
# integer codes. The fitted encoder is kept so new inputs can be encoded
# the same way later.
le_sex = LabelEncoder().fit(df["Sex"])
df["Sex"] = le_sex.transform(df["Sex"])
df["Sex"].unique()

array([1, 0])

In [24]:
# Deaths is an object column: render every entry as text, strip the thousands
# separators, and convert the result to float
df["Deaths"] = (
    df["Deaths"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype(float)
)

In [25]:
# Separate the feature matrix (everything except Deaths) from the target (Deaths)
X, y = df.drop(columns="Deaths"), df["Deaths"]

In [26]:
# Fit an ordinary linear regression on the full feature matrix.
# No train/test split is made here: the model sees every row.
linear_reg = LinearRegression()
linear_reg.fit(X, y.values)

LinearRegression()

In [27]:
# Predict Deaths for the same rows the linear model was trained on (in-sample)
y_pred = linear_reg.predict(X)

In [28]:
# Root-mean-squared error between actual and predicted Deaths.
# This is a training (in-sample) error, since y_pred comes from the training rows.
error = np.sqrt(mean_squared_error(y, y_pred))
error

104.10733840420126

In [29]:
# Fit a decision tree regressor on the full feature matrix.
# random_state is fixed so the fitted tree is reproducible.
dec_tree_reg = DecisionTreeRegressor(random_state=1)
dec_tree_reg.fit(X, y.values)

DecisionTreeRegressor(random_state=1)

In [30]:
# Predict Deaths for the same rows the decision tree was trained on (in-sample)
y_pred = dec_tree_reg.predict(X)

In [31]:
# Root-mean-squared error of the decision tree on its own training rows (in-sample)
error = np.sqrt(mean_squared_error(y, y_pred))
error

79.58938381766252

In [32]:
# Fit a random forest regressor on the full feature matrix,
# using the same random_state as the decision tree for reproducibility
random_forest_reg = RandomForestRegressor(random_state=1)
random_forest_reg.fit(X, y.values)

RandomForestRegressor(random_state=1)

In [33]:
# Predict Deaths for the same rows the random forest was trained on (in-sample)
y_pred = random_forest_reg.predict(X)

In [34]:
# Root-mean-squared error of the random forest on its own training rows (in-sample)
error = np.sqrt(mean_squared_error(y, y_pred))
error

79.74871168712677

In [35]:
# Tune the decision tree's max_depth with a cross-validated grid search.
# Candidates range from an unrestricted tree (None) to depths 2..12.
# scoring="neg_mean_squared_error" ranks candidates by negated MSE, so a
# higher score means a lower error.
# No cv argument is passed, so GridSearchCV's default splitting is used.
max_depth = [None, 2, 4, 6, 8, 10, 12]
params = {"max_depth": max_depth}

regressor = DecisionTreeRegressor(random_state=1)
gsearch = GridSearchCV(regressor, params, scoring="neg_mean_squared_error")
gsearch.fit(X, y.values)

GridSearchCV(estimator=DecisionTreeRegressor(random_state=1),
             param_grid={'max_depth': [None, 2, 4, 6, 8, 10, 12]},
             scoring='neg_mean_squared_error')

In [36]:
# Take the best tree found by the grid search and report its RMSE on the
# training rows (in-sample).
# The search was created without a refit argument, and GridSearchCV's default
# is to refit best_estimator_ on the full data, so no extra fit() call is
# needed here.
regressor = gsearch.best_estimator_

y_pred = regressor.predict(X)
error = np.sqrt(mean_squared_error(y, y_pred))
error

103.53361401192186

In [37]:
# Display the feature matrix: Year/Month as floats, Region/Sex as label-encoded
# integers, Age as an integer code
X

Unnamed: 0,Year,Month,Region,Sex,Age
9453,2020.0,1.0,9,1,1
9454,2020.0,1.0,9,1,10
9455,2020.0,1.0,9,1,10
9456,2020.0,1.0,9,1,10
9457,2020.0,1.0,9,1,20
...,...,...,...,...,...
71593,2021.0,11.0,0,0,20
71594,2021.0,11.0,0,0,30
71595,2021.0,11.0,0,0,30
71596,2021.0,11.0,0,0,40


In [38]:
# Build one sample row in feature order (Year, Month, Region, Sex, Age) to try
# a prediction. Mixing numbers and strings makes NumPy store every entry as a
# string; the labels are encoded and the array cast to float in the next cell.
# NOTE(review): this rebinds X, which until now held the training feature frame.
X = np.array([[2020, 11, "Mid West", "Male", 20]])
X

array([['2020', '11', 'Mid West', 'Male', '20']], dtype='<U11')

In [39]:
# Encode the Region (column 2) and Sex (column 3) labels with the fitted
# encoders, writing the codes back into the string array, then cast the whole
# row to float so it can be passed to a model.
X[:, 2] = le_region.transform(X[:, 2])
X[:, 3] = le_sex.transform(X[:, 3])

X = X.astype(float)
X

array([[2.02e+03, 1.10e+01, 2.00e+00, 1.00e+00, 2.00e+01]])

In [40]:
# Predict Deaths for the single encoded sample row with the grid-searched decision tree
y_pred = regressor.predict(X)
y_pred



array([1.15690737])

In [41]:
# Round-trip the fitted regressor and encoders through the pickle file to
# confirm they survive serialization.
# The file is written here first: nothing earlier in the notebook saves it, so
# reading it straight away would fail on a fresh kernel or silently load
# objects left behind by a previous session.
with open("saved_pick.pkl", "wb") as file:
    pickle.dump({"model": regressor, "le_region": le_region, "le_sex": le_sex}, file)

# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook.
with open("saved_pick.pkl", "rb") as file:
    data = pickle.load(file)

# Rebind the model and encoders to the objects restored from disk
regressor_loaded = data["model"]
le_region = data["le_region"]
le_sex = data["le_sex"]

In [42]:
# Predict the same sample row with the model restored from the pickle file;
# the result can be compared with the prediction from the in-memory regressor
y_pred = regressor_loaded.predict(X)
y_pred



array([1.15690737])

In [43]:
# Rebuild the feature matrix and take an independent copy of the target, so
# the target can be relabelled for classification without touching df
X, y = df.drop(columns="Deaths"), df["Deaths"].copy()

In [44]:
# Convert the target to a binary label: any positive death count becomes 1,
# zeros stay 0.
# A boolean Series is a label-aligned mask, so it is applied with .loc;
# .iloc is positional and is not meant to take an index-carrying mask.
y.loc[y > 0] = 1

In [45]:
# Seed NumPy's global RNG, hold out 3% of the rows as a test set, and scale
# the features to [0, 1].
np.random.seed(1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.03, random_state=1
)

# The scaler learns its min/max from the training rows only; the test rows are
# transformed with those same statistics.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [46]:
# Fit a logistic regression classifier on the scaled training features and
# evaluate it on the held-out test rows.
# penalty='l2' adds L2 regularisation on the coefficients, which shrinks them
# and reduces variance/overfitting.
# solver='sag' is limited here to max_iter=50 iterations.
# NOTE(review): no random_state is passed; presumably the solver's shuffling
# relies on the global NumPy seed set before the split — confirm.
# Prints the per-class precision/recall report, then the accuracy from score().
logmodel_scaled = LogisticRegression(penalty='l2', solver='sag', max_iter=50)
logmodel_scaled.fit(X_train,y_train)
predictions_scaled = logmodel_scaled.predict(X_test)
print(classification_report(y_test,predictions_scaled))
print(logmodel_scaled.score(X_test, y_test))

              precision    recall  f1-score   support

         0.0       0.91      0.91      0.91       515
         1.0       0.86      0.87      0.86       326

    accuracy                           0.89       841
   macro avg       0.89      0.89      0.89       841
weighted avg       0.89      0.89      0.89       841

0.8941736028537456


In [47]:
# Bundle the tree regressor, both label encoders, the logistic model and its
# scaler into one dict and write it to the pickle file.
# The key names are stored in the file and are kept exactly as they are.
data1 = dict(
    model=regressor,
    le_region=le_region,
    le_sex=le_sex,
    logestic_model=logmodel_scaled,
    scaler_model=scaler,
)

with open("saved_pick.pkl", "wb") as file:
    pickle.dump(data1, file)

In [48]:
# Build one sample row in feature order (Year, Month, Region, Sex, Age) and
# scale it with the fitted scaler.
# The Region and Sex labels are encoded first so the row is a float array
# before it reaches scaler.transform; passing an array of strings to the
# scaler relies on implicit string-to-number conversion.
X = np.array(
    [[
        2020,
        11,
        le_region.transform(["Mid West"])[0],
        le_sex.transform(["Male"])[0],
        60,
    ]],
    dtype=float,
)
X = scaler.transform(X)
X



array([[0.        , 0.90909091, 0.22222222, 1.        , 0.66292135]])

In [49]:
# Predicted probability for the scaled sample row.
# Index [0, 0] selects the first row and the first class column of
# predict_proba, i.e. the probability of logmodel_scaled.classes_[0].
# NOTE(review): with the 0.0/1.0 labels in the report above this is presumably
# the probability of the 0.0 class (no deaths); use [0, 1] for the 1.0 class — confirm intent.
logmodel_scaled.predict_proba(X)[0,0]

0.12458538454123835

In [50]:
# 2*Phi(z) - 1 is the probability that a standard normal variable lies within
# +/- z of zero; evaluated here for z = 0.87.
# NOTE(review): the origin of 0.87 is not shown in this notebook — document it.
2*norm.cdf(0.87) - 1

0.6156995957926077