In [258]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn import metrics

In [192]:
# Load the raw dataset from the CSV file next to the notebook
df = pd.read_csv(filepath_or_buffer="covid_data.csv")

## Listing All the Variables

In [16]:
# Show the frame's (rows, columns) shape, then collect and display every
# column name.
print(df.shape)
variables = df.columns.tolist()
print(f"{len(variables)} variables")
variables

(238234, 67)
67 variables


['iso_code',
 'continent',
 'location',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'total_cases_per_million',
 'new_cases_per_million',
 'new_cases_smoothed_per_million',
 'total_deaths_per_million',
 'new_deaths_per_million',
 'new_deaths_smoothed_per_million',
 'reproduction_rate',
 'icu_patients',
 'icu_patients_per_million',
 'hosp_patients',
 'hosp_patients_per_million',
 'weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'weekly_hosp_admissions',
 'weekly_hosp_admissions_per_million',
 'total_tests',
 'new_tests',
 'total_tests_per_thousand',
 'new_tests_per_thousand',
 'new_tests_smoothed',
 'new_tests_smoothed_per_thousand',
 'positive_rate',
 'tests_per_case',
 'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'total_boosters',
 'new_vaccinations',
 'new_vaccinations_smoothed',
 'total_vaccinations_per_hundred',
 'people_vaccinated_per_hundred',
 'peo

## Checking Missing Values

In [153]:
# Number of missing entries in each column
df.isna().sum()

iso_code                                        0
continent                                   13427
location                                        0
date                                            0
total_cases                                 13459
                                            ...  
population                                   1025
excess_mortality_cumulative_absolute       230242
excess_mortality_cumulative                230242
excess_mortality                           230191
excess_mortality_cumulative_per_million    230242
Length: 67, dtype: int64

In [18]:
# Total number of missing entries across the whole frame
df.isna().sum().sum()

7290942

## Predicting Total Deaths per Million

Here, we predict total deaths per million (taken from each location's most recent record) from `median_age`, `female_smokers`, and `weekly_hosp_admissions`.

In [278]:
# Keep only the rows in which the target and every candidate predictor is
# present.
#
# The two vaccination columns used to be AND-ed into the mask as raw float
# Series (without .notna()). That relied on pandas coercing the floats to
# booleans, so a value of 0.0 would be treated as False and its row dropped
# as if it were missing. dropna(subset=...) applies one uniform
# "value is not NaN" rule to all nine columns.
dfmodel1 = df.dropna(
    subset=[
        'total_deaths_per_million',
        'gdp_per_capita',
        'life_expectancy',
        'human_development_index',
        'total_vaccinations_per_hundred',
        'people_fully_vaccinated_per_hundred',
        'female_smokers',
        'median_age',
        'weekly_hosp_admissions',
    ]
)
dfmodel1

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
20487,BEL,Europe,Belgium,2021-01-18,679771.0,932.0,2078.286,20472.0,37.0,50.000,...,31.4,,5.64,81.63,0.931,11655923.0,,,,
20488,BEL,Europe,Belgium,2021-01-19,681250.0,1479.0,1989.714,20554.0,82.0,51.429,...,31.4,,5.64,81.63,0.931,11655923.0,,,,
20489,BEL,Europe,Belgium,2021-01-20,684256.0,3006.0,2001.000,20572.0,18.0,46.000,...,31.4,,5.64,81.63,0.931,11655923.0,,,,
20490,BEL,Europe,Belgium,2021-01-21,686827.0,2571.0,1991.571,20620.0,48.0,46.571,...,31.4,,5.64,81.63,0.931,11655923.0,,,,
20491,BEL,Europe,Belgium,2021-01-22,689271.0,2444.0,2026.000,20675.0,55.0,46.143,...,31.4,,5.64,81.63,0.931,11655923.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224896,USA,North America,United States,2022-11-18,98306861.0,55042.0,44209.286,1077079.0,530.0,346.143,...,24.6,,2.77,78.86,0.926,338289856.0,,,,
224897,USA,North America,United States,2022-11-19,98311464.0,4603.0,44149.286,1077090.0,11.0,347.714,...,24.6,,2.77,78.86,0.926,338289856.0,,,,
224898,USA,North America,United States,2022-11-20,98314732.0,3268.0,44280.286,1077090.0,0.0,347.571,...,24.6,,2.77,78.86,0.926,338289856.0,,,,
224899,USA,North America,United States,2022-11-21,98357289.0,42557.0,43233.714,1077284.0,194.0,340.857,...,24.6,,2.77,78.86,0.926,338289856.0,,,,


In [226]:
# Order the rows by date and keep the last row of each location, write that
# frame to disk, then display it.
dfmodel1 = (
    dfmodel1
    .sort_values(by='date')
    .groupby(by='location')
    .tail(n=1)
)
dfmodel1.to_csv(path_or_buf="dfmodel1.csv")
dfmodel1

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
125754,LUX,Europe,Luxembourg,2021-04-13,64062.0,182.0,205.286,777.0,5.0,2.286,...,26.0,,4.510,82.25,0.916,647601.0,,,,
27771,BIH,Europe,Bosnia and Herzegovina,2022-01-29,343986.0,0.0,2059.571,14310.0,0.0,46.571,...,47.7,97.164,3.500,77.40,0.780,3233530.0,,,,
13855,AUT,Europe,Austria,2022-01-31,1835602.0,28952.0,32300.571,17207.0,15.0,15.286,...,30.9,,7.370,81.54,0.922,8939617.0,,,,
221524,UKR,Europe,Ukraine,2022-02-27,5062889.0,4943.0,17065.857,112835.0,95.0,201.714,...,47.4,,8.800,72.06,0.779,39701744.0,,,,
110588,KAZ,Asia,Kazakhstan,2022-03-03,1391277.0,0.0,284.143,18963.0,5.0,10.000,...,43.1,98.999,6.700,73.60,0.825,19397998.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199602,KOR,Asia,South Korea,2022-11-27,26959843.0,22327.0,53998.143,30413.0,44.0,49.571,...,40.9,,12.270,83.03,0.916,51815808.0,,,,
163967,PAK,Asia,Pakistan,2022-11-27,1575124.0,159.0,30.143,30630.0,0.0,0.000,...,36.7,59.607,0.600,67.27,0.557,235824864.0,,,,
30052,BRA,South America,Brazil,2022-11-27,35149503.0,,,689442.0,,,...,17.9,,2.200,75.88,0.765,215313504.0,,,,
115395,KGZ,Asia,Kyrgyzstan,2022-11-27,206511.0,0.0,4.000,2991.0,0.0,0.000,...,50.5,89.220,4.500,71.45,0.697,6630621.0,,,,


In [247]:
# Count the missing entries in the rows of a single country
df.loc[df['location'].eq("United Kingdom")].isna().sum().sum()

16091

In [227]:
# Dtype names treated as numeric when selecting columns
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Keep only the columns of dfmodel1 whose dtype is one of the above
dfmodel1Numeric = dfmodel1.select_dtypes(include=numerics)

In [228]:
# Pairwise Pearson correlations between all numeric columns
corrMatrix = dfmodel1Numeric.corr(method="pearson")
corrMatrix

Unnamed: 0,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
total_cases,1.000000,0.932489,0.977060,0.994310,0.763866,0.981375,0.004372,0.008559,0.001430,0.011483,...,0.000234,0.038570,0.024269,0.038531,0.051730,0.966691,0.750881,-0.542035,-0.292562,-0.423498
new_cases,0.932489,1.000000,0.969666,0.912627,0.791608,0.937088,0.059121,0.225436,0.147751,-0.006808,...,-0.000919,0.035046,0.169996,0.115117,0.119447,0.880501,-0.468745,0.209761,0.611877,0.075704
new_cases_smoothed,0.977060,0.969666,1.000000,0.963999,0.738473,0.968919,0.013470,0.092591,0.097100,-0.017962,...,0.016502,0.028533,0.118451,0.059296,0.065273,0.941917,0.575030,-0.329810,-0.508688,-0.199206
total_deaths,0.994310,0.912627,0.963999,1.000000,0.761115,0.977497,-0.030200,-0.015623,-0.019801,0.034981,...,-0.002486,0.045359,-0.009947,0.016571,0.033069,0.966203,0.729975,-0.515634,-0.322165,-0.395115
new_deaths,0.763866,0.791608,0.738473,0.761115,1.000000,0.836020,0.077961,0.093847,0.059042,0.109748,...,0.000387,0.058433,0.124070,0.119739,0.160227,0.658573,0.736133,-0.523370,-0.313581,-0.403415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
population,0.966691,0.880501,0.941917,0.966203,0.658573,0.931072,-0.080387,-0.026512,-0.030070,-0.057410,...,0.014986,0.035263,-0.029659,-0.020440,-0.017993,1.000000,0.957895,-0.999892,0.656154,-0.992703
excess_mortality_cumulative_absolute,0.750881,-0.468745,0.575030,0.729975,0.736133,0.762993,-0.532471,-0.990853,-0.882545,-0.429351,...,-0.634977,,-0.941745,0.623650,0.192411,0.957895,1.000000,-0.962006,0.411861,-0.916284
excess_mortality_cumulative,-0.542035,0.209761,-0.329810,-0.515634,-0.523370,-0.557518,0.743344,0.990050,0.977399,0.659620,...,0.399930,,0.997791,-0.386528,0.082825,-0.999892,-0.962006,1.000000,-0.645008,0.990827
excess_mortality,-0.292562,0.611877,-0.508688,-0.322165,-0.313581,-0.274789,-0.990627,-0.531059,-0.791979,-0.999814,...,0.442444,,-0.694345,-0.455468,-0.814973,0.656154,0.411861,-0.645008,1.000000,-0.742362


In [240]:
# Correlation of every numeric column with the target, then only those whose
# absolute correlation exceeds 0.2
deathRateCol = corrMatrix.loc[:, 'total_deaths_per_million']
deathRateCol[deathRateCol.abs() > 0.2]

total_cases_per_million                       0.479740
total_deaths_per_million                      1.000000
new_deaths_per_million                        0.219160
new_deaths_smoothed_per_million               0.300051
reproduction_rate                             0.252954
icu_patients                                  0.257902
hosp_patients                                 0.250777
hosp_patients_per_million                    -0.238855
weekly_icu_admissions                         0.227114
weekly_hosp_admissions                        0.444377
total_tests                                  -0.243342
total_tests_per_thousand                     -0.342626
new_tests_per_thousand                       -0.254059
new_vaccinations                             -0.301598
total_boosters_per_hundred                    0.263116
new_people_vaccinated_smoothed_per_hundred   -0.227878
stringency_index                             -0.246012
median_age                                    0.626280
aged_65_ol

In [208]:
# Save the correlation matrix to disk
corrMatrix.to_csv(path_or_buf="corrMatrix.csv")

In [280]:
# Predictor columns and the single-column target, both kept as DataFrames
X = dfmodel1.loc[:, ["median_age", "female_smokers", "weekly_hosp_admissions"]]
Y = dfmodel1.loc[:, ["total_deaths_per_million"]]

In [281]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

In [282]:
# Standardize the training features with a scaler fitted on the training split
x_scaler = preprocessing.StandardScaler()
x_scaler.fit(X_train)
X_train = x_scaler.transform(X_train)

In [283]:
# Standardize the training target with its own scaler, fitted on y_train
y_scaler = preprocessing.StandardScaler()
y_scaler.fit(y_train)
y_train = y_scaler.transform(y_train)

In [284]:
# Fit a linear regression on the scaled training features and target
model1 = LinearRegression()
model1.fit(X=X_train, y=y_train)

In [285]:
# Predictions for the (scaled) training features
y_pred = model1.predict(X=X_train)

In [286]:
# Error of the predictions against y_train. Both are in the scaled units
# produced by y_scaler, and this is error on the training split.
MSE = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred)
RMSE = np.sqrt(MSE)
pd.DataFrame({'Metrics': [MSE, RMSE]}, index=['MSE', 'RMSE'])

Unnamed: 0,Metrics
MSE,0.708521
RMSE,0.841737
