In [183]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn import metrics


import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings emitted by the
# libraries used below; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [184]:
# Read the raw dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(
    "covid_data.csv",
)

## Listing All the Variables

In [185]:
# Keep the column names around as a plain list, then report the frame's
# (rows, columns) shape followed by the number of variables.
variables = df.columns.tolist()
print(df.shape)
print(f"{len(variables)} variables")

(238234, 67)
67 variables


## Dealing with Missing Values

In [186]:
# Replace every missing entry in the frame with 0, then display the number of
# missing entries that remain (expected to be 0).
# NOTE(review): the fill applies to all columns, including non-numeric ones,
# and a filled 0 is indistinguishable from a genuine 0 afterwards -- confirm
# this is intended before interpreting the correlations below.
df = df.fillna(0)
df.isna().sum().sum()

0

## Get the Correlation Matrix

In [187]:
# Pairwise Pearson correlation between the numeric variables.
# The frame still contains text columns (e.g. iso_code, location, date), so
# the numeric columns are selected explicitly: recent pandas versions no
# longer drop non-numeric columns silently inside `corr()` and raise instead.
corrMatrix = df.select_dtypes(include=["number", "bool"]).corr()

In [188]:
# Correlation of every variable with total_deaths; keep only the variables
# whose absolute correlation exceeds 0.5 (total_deaths itself is included,
# since its self-correlation is 1).
deathRateCol = corrMatrix['total_deaths']
columnsToExplore = deathRateCol.loc[lambda corr: corr.abs() > 0.5]
print("Column to explore", len(columnsToExplore))
columnsToExplore

Column to explore 14


total_cases                       0.936711
new_cases                         0.706417
new_cases_smoothed                0.726616
total_deaths                      1.000000
new_deaths                        0.607181
new_deaths_smoothed               0.626021
total_vaccinations                0.833605
people_vaccinated                 0.853855
people_fully_vaccinated           0.836693
total_boosters                    0.754619
new_vaccinations                  0.623179
new_vaccinations_smoothed         0.635393
new_people_vaccinated_smoothed    0.536540
population                        0.713426
Name: total_deaths, dtype: float64

In [189]:
# Names of the variables that might be useful (the index labels of the
# strongly correlated entries), as a plain Python list.
indexList = list(columnsToExplore.index)

In [190]:
# Get rid of rows with a 0 entry in any of the prospective predictors:
# a row survives only if every column listed in indexList is non-zero.
df = df[(df[indexList] != 0).all(axis=1)]
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
1631,OWID_AFR,0,Africa,2021-10-28,8491909.0,5895.0,5315.429,217820.0,199.0,202.429,...,0.0,0.000,0.0,0.00,0.000,1.426737e+09,0.0,0.0,0.0,0.0
1632,OWID_AFR,0,Africa,2021-10-29,8496820.0,4911.0,5353.000,217987.0,167.0,199.714,...,0.0,0.000,0.0,0.00,0.000,1.426737e+09,0.0,0.0,0.0,0.0
1633,OWID_AFR,0,Africa,2021-10-30,8500461.0,3641.0,5299.571,218109.0,122.0,189.286,...,0.0,0.000,0.0,0.00,0.000,1.426737e+09,0.0,0.0,0.0,0.0
1634,OWID_AFR,0,Africa,2021-10-31,8504545.0,4084.0,5412.000,218244.0,135.0,186.857,...,0.0,0.000,0.0,0.00,0.000,1.426737e+09,0.0,0.0,0.0,0.0
1635,OWID_AFR,0,Africa,2021-11-01,8509222.0,4677.0,5122.857,218395.0,151.0,181.857,...,0.0,0.000,0.0,0.00,0.000,1.426737e+09,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238132,ZWE,Africa,Zimbabwe,2022-08-18,256579.0,14.0,9.429,5589.0,1.0,0.286,...,30.7,36.791,1.7,61.49,0.571,1.632054e+07,0.0,0.0,0.0,0.0
238134,ZWE,Africa,Zimbabwe,2022-08-20,256616.0,20.0,13.429,5592.0,3.0,0.714,...,30.7,36.791,1.7,61.49,0.571,1.632054e+07,0.0,0.0,0.0,0.0
238139,ZWE,Africa,Zimbabwe,2022-08-25,256675.0,47.0,13.714,5593.0,1.0,0.571,...,30.7,36.791,1.7,61.49,0.571,1.632054e+07,0.0,0.0,0.0,0.0
238168,ZWE,Africa,Zimbabwe,2022-09-23,257289.0,30.0,50.000,5599.0,1.0,0.429,...,30.7,36.791,1.7,61.49,0.571,1.632054e+07,0.0,0.0,0.0,0.0


In [191]:
# Update the correlation matrix now that rows with zero entries were removed.
# Numeric columns are selected explicitly because the frame still holds text
# columns, and recent pandas versions raise on those inside `corr()` instead
# of dropping them silently.
corrMatrix2 = df.select_dtypes(include=["number", "bool"]).corr()

# Same selection rule as before: keep variables whose absolute correlation
# with total_deaths exceeds 0.5.
deathRateCol2 = corrMatrix2['total_deaths']
columnsToExplore2 = deathRateCol2[deathRateCol2.abs() > 0.5]
print("Column to explore", len(columnsToExplore2))
columnsToExplore2

Column to explore 14


total_cases                       0.928180
new_cases                         0.663495
new_cases_smoothed                0.686475
total_deaths                      1.000000
new_deaths                        0.606598
new_deaths_smoothed               0.626728
total_vaccinations                0.842060
people_vaccinated                 0.843710
people_fully_vaccinated           0.826610
total_boosters                    0.741795
new_vaccinations                  0.605218
new_vaccinations_smoothed         0.618297
new_people_vaccinated_smoothed    0.504893
population                        0.834020
Name: total_deaths, dtype: float64

## Doing a Multiple Linear Regression

In [192]:
# Correlation between two of the vaccination variables, to gauge how much
# they overlap as candidate predictors.
df.loc[:, ['new_vaccinations', 'people_vaccinated']].corr()

Unnamed: 0,new_vaccinations,people_vaccinated
new_vaccinations,1.0,0.60039
people_vaccinated,0.60039,1.0


In [193]:
# Predictor matrix (four columns) and the single-column target frame.
# The target is kept two-dimensional by selecting it with a list of names.
X = df.loc[
    :,
    ["population", "total_boosters", "people_vaccinated", "total_cases"],
]
Y = df.loc[:, ["total_deaths"]]

In [194]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
# NOTE(review): the rows appear to be daily records per location, so a random
# row-wise split places neighbouring days of one location in both sets --
# confirm this is the evaluation that is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

## Standardization (zero mean, unit variance)

In [195]:
# Fit the scalers on the training portion only, then overwrite the training
# arrays with their scaled versions. The fitted scalers are kept so the test
# portion can later be transformed with the same parameters.
x_scaler = preprocessing.StandardScaler()
x_scaler.fit(X_train)
X_train = x_scaler.transform(X_train)

y_scaler = preprocessing.StandardScaler()
y_scaler.fit(y_train)
y_train = y_scaler.transform(y_train)

## Model Training

In [196]:
# Fit an ordinary least-squares model on the scaled training data and keep
# its predictions for that same data (used for the training metrics).
model1 = LinearRegression().fit(X_train, y_train)
y_pred = model1.predict(X_train)

## Model Testing: Training Data

In [197]:
# Training error. y_train was transformed by y_scaler above, so both figures
# are expressed in the scaled target's units rather than raw death counts.
MSE = metrics.mean_squared_error(y_train, y_pred)
RMSE = np.sqrt(MSE)
pd.DataFrame({'Metrics': [MSE, RMSE]}, index=['MSE', 'RMSE'])

Unnamed: 0,Metrics
MSE,0.073474
RMSE,0.27106


In [198]:
# Coefficient of determination on the training data.
metrics.r2_score(y_true=y_train, y_pred=y_pred)

0.9265263755890519

## Model Testing: Testing Data

In [199]:
# Scale the held-out data with the scalers fitted on the training set (no
# re-fitting), then predict on it. y_pred is overwritten with the test-set
# predictions.
X_test, y_test = x_scaler.transform(X_test), y_scaler.transform(y_test)
y_pred = model1.predict(X_test)

In [200]:
# Test error. y_test was transformed by y_scaler above, so both figures are
# expressed in the scaled target's units rather than raw death counts.
MSE = metrics.mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
pd.DataFrame({'Metrics': [MSE, RMSE]}, index=['MSE', 'RMSE'])

Unnamed: 0,Metrics
MSE,0.075736
RMSE,0.275202


In [201]:
# Coefficient of determination on the held-out test data.
metrics.r2_score(y_true=y_test, y_pred=y_pred)

0.9258158003084846