## Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cx_Oracle
from sklearn import preprocessing
import scipy as sp
from sklearn.cluster import KMeans

## Data Prep

In [None]:
# ---- Load the raw data set ------------------------------------------------
# Later cells read the columns Country, Date, Test, Case, Death, GDP and
# Population from this frame.
# NOTE(review): absolute, machine-specific path - the notebook only runs where
# this file exists at exactly this location.
covid_csv_path = r"C:\Users\TCOSDEMIR\Desktop\Digital Masters\covid-tests-cases-deaths.csv"
covid = pd.read_csv(covid_csv_path)

In [None]:
# ---- Data type tuning & filling N/A ----------------------------------------
# Columns are reassigned explicitly (covid[col] = ...) instead of calling
# fillna(..., inplace=True) on an attribute accessor: that chained in-place
# form acts on a temporary Series under pandas Copy-on-Write and would leave
# the NaNs in place, making the integer cast below fail.
covid["Date"] = pd.to_datetime(covid["Date"])

# Missing daily death / case counts are treated as zero before the integer cast.
for col in ("Death", "Case"):
    covid[col] = covid[col].fillna(0).astype("int")

# GDP and Population are cast without filling, so astype raises if either
# column still contains missing values.
for col in ("GDP", "Population"):
    covid[col] = covid[col].astype("int64")

## Analyze the data

In [None]:
# ---- Log-scaled relationship between per-country maxima ---------------------
# Take the per-country maximum of every column, drop countries with any missing
# value, log-transform the numeric metrics and draw a regression plot.
# NOTE(review): the original heading read "Test numbers & Case Correlation",
# but the plot below is Test (x) against GDP (y) - confirm which was intended.
log_columns = ["Test", "Death", "Case", "GDP", "Population"]

covid_corr = covid.groupby("Country").max().reset_index().dropna()
covid_corr[log_columns] = np.log(covid_corr[log_columns])

sns.set(font_scale=2)
plt.figure(figsize=(10, 8))
ax = sns.regplot(x="Test", y="GDP", data=covid_corr)


In [None]:
# Correlation coefficients between the per-country maxima of the numeric metrics.
# Non-numeric columns (Country strings, Date timestamps) are excluded explicitly:
# recent pandas versions no longer drop them silently inside corr() and raise instead.
covid.groupby("Country").max().reset_index().dropna().select_dtypes(include="number").corr()

## Imputation

In [None]:
# Premise from the correlation analysis: Test and Case are strongly correlated,
# so missing Test values can be imputed from Case.

# Build one Case/Test ratio per country from the per-country maxima.
# Countries with any missing maximum are removed by dropna() and get no ratio.
df = covid.groupby("Country").max().reset_index().dropna()
ratio = pd.DataFrame()
ratio["Country"] = df.Country
ratio["ratio"] = df.Case / df.Test

# Attach the ratio to every daily row. The inner join keeps only countries
# that have a ratio.
covid_new = covid.merge(ratio, on="Country", how="inner")

# Impute missing daily Test values as Case / ratio.
# A single .loc assignment is used instead of covid_new.Test[mask] = ...:
# the chained form writes into a temporary Series and is not guaranteed to
# update covid_new (it does not under pandas Copy-on-Write).
missing_test = covid_new["Test"].isna()
covid_new.loc[missing_test, "Test"] = (
    covid_new.loc[missing_test, "Case"] / covid_new.loc[missing_test, "ratio"]
)

# GDP per capita, truncated to an integer.
covid_new["GDP_per_capita"] = (covid_new.GDP / covid_new.Population).astype("int")

# Number of tests relative to GDP per capita.
covid_new["Test_perf"] = covid_new.Test / covid_new.GDP_per_capita

## Modelling the data

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
)

# Per-country maxima, restricted to countries without missing values.
df = covid.groupby("Country").max().reset_index().dropna()

# Target: Death. Predictors: every column from position 2 onwards except Death,
# plus an intercept term added by statsmodels.
# NOTE(review): iloc[:, 2:] skips the first two columns - Country and,
# presumably, Date; confirm against the CSV column order.
y = df["Death"]
X = sm.add_constant(df.iloc[:, 2:].drop(columns="Death"))

# NOTE(review): this 80/20 split is computed but the regression below is fitted
# on the full X / y, not on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least squares fit; the summary is the cell's displayed output.
lm = sm.OLS(y, X)
model = lm.fit()
model.summary()

## Visualization

In [None]:
# Bar chart of Test per country for a single day, limited to large countries
# with many cases, ordered by Test; the figure is also written to a PNG file.
sns.set(font_scale=3)
plt.figure(figsize=(30, 16))

large_population = covid_new.Population > 10000000
on_snapshot_date = covid_new.Date == "2020-04-13"
many_cases = covid_new.Case > 10000
snapshot = covid_new[large_population & on_snapshot_date & many_cases].sort_values(by="Test")

ax = sns.barplot(x="Country", y="Test", palette="Blues", data=snapshot)
# Rotate the country labels so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.savefig("test_output.png")