In [1]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
from sklearn import neighbors
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing as pre

In [3]:
# Read the two input CSV files from the working directory.
covid_data = pd.read_csv('covid_data_clean.csv')

# The countries file uses ',' as its decimal separator.
countries_data = pd.read_csv(
    'countries_data_clean.csv',
    decimal=',',
)

In [4]:
# Preview the first 10 rows of the COVID frame.
covid_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered,Ellapsed Days
0,0,Afghanistan,2020-02-24,1,0,0,0
1,1,Afghanistan,2020-02-25,1,0,0,1
2,2,Afghanistan,2020-02-26,1,0,0,2
3,3,Afghanistan,2020-02-27,1,0,0,3
4,4,Afghanistan,2020-02-28,1,0,0,4
5,5,Afghanistan,2020-02-29,1,0,0,5
6,6,Afghanistan,2020-03-01,1,0,0,6
7,7,Afghanistan,2020-03-02,1,0,0,7
8,8,Afghanistan,2020-03-03,1,0,0,8
9,9,Afghanistan,2020-03-04,1,0,0,9


In [5]:
# Preview the first 10 rows of the countries frame.
countries_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate
0,0,Afghanistan,31056997,48.0,163.07,700.0,20.34
1,1,Albania,3581655,124.6,21.52,4500.0,5.22
2,2,Algeria,32930091,13.8,31.0,6000.0,4.61
3,3,Samoa,57794,290.4,9.27,8000.0,3.27
4,4,Andorra,71201,152.1,4.05,19000.0,6.25
5,5,Angola,12127071,9.7,191.19,1900.0,24.2
6,7,Antigua and Barbuda,69108,156.0,19.46,11000.0,5.37
7,8,Argentina,39921833,14.4,15.18,11200.0,7.55
8,9,Armenia,2976372,99.9,23.28,3500.0,8.23
9,11,Australia,20264082,2.6,4.69,29000.0,7.51


In [6]:
# Join the COVID rows with the per-country statistics on the country name.
merged_inner = covid_data.merge(
    countries_data,
    left_on='Country/Region',
    right_on='Country',
)

# Discard the columns that are not needed after the join.
merged_inner = merged_inner.drop(
    columns=['Country', 'Date', 'Unnamed: 0_x', 'Unnamed: 0_y']
)

# Remove every row that has at least one missing value.
merged_inner.dropna(inplace=True)

def createPrevValues(dataset, originalColumnName, commonColumnName, newColumnName):
    """Add a column holding the previous row's value of another column.

    For every row, ``newColumnName`` receives the value that
    ``originalColumnName`` has in the row directly above it, provided both
    rows share the same ``commonColumnName`` value. Rows without such a
    predecessor (the first row of the frame and the first row of each run of
    equal ``commonColumnName`` values) receive 0.

    The lookup is positional: rows belonging to one group are expected to be
    contiguous and already in the desired order.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to extend. It is modified in place.
    originalColumnName : str
        Column whose previous-row value is copied.
    commonColumnName : str
        Column that identifies the group a row belongs to.
    newColumnName : str
        Name of the column to create (or overwrite) in ``dataset``.

    Returns
    -------
    None
    """
    # Work only on the frame that was passed in; the grouping column must not
    # be read from a notebook-level variable.
    group = dataset[commonColumnName]

    # True where the row directly above belongs to the same group.
    has_predecessor = group.eq(group.shift())

    # Previous row's value, blanked out wherever the group changes.
    previous = dataset[originalColumnName].shift().where(has_predecessor)

    # Assign the filled result in a single step. Calling fillna(inplace=True)
    # on dataset[newColumnName] would act on a temporary object when pandas
    # Copy-on-Write is active and leave the NaNs in the frame.
    dataset[newColumnName] = previous.fillna(0)

# Add Prev_Confirmed, Prev_Deaths and Prev_Recovered: the previous row's value
# of each count within the same country.
for count_column in ('Confirmed', 'Deaths', 'Recovered'):
    createPrevValues(merged_inner, count_column, 'Country/Region', 'Prev_' + count_column)

# Preview the first 50 rows of the result.
merged_inner.head(50)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Ellapsed Days,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate,Prev_Confirmed,Prev_Deaths,Prev_Recovered
0,Afghanistan,1,0,0,0,31056997,48.0,163.07,700.0,20.34,0.0,0.0,0.0
1,Afghanistan,1,0,0,1,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
2,Afghanistan,1,0,0,2,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
3,Afghanistan,1,0,0,3,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
4,Afghanistan,1,0,0,4,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
5,Afghanistan,1,0,0,5,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
6,Afghanistan,1,0,0,6,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
7,Afghanistan,1,0,0,7,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
8,Afghanistan,1,0,0,8,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
9,Afghanistan,1,0,0,9,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0


In [7]:
# Feature frames, keyed by target: everything in merged_inner except the
# country name, the target itself and 'Recovered'.
X = {
    target: merged_inner.drop(columns=['Country/Region', target, 'Recovered'])
    for target in ('Confirmed', 'Deaths')
}

# Values the model should predict, keyed the same way.
y = {target: merged_inner[target] for target in ('Confirmed', 'Deaths')}

In [8]:
from sklearn import metrics


# Targets to model and the neighbour counts to try for each of them.
variables = ['Confirmed', 'Deaths']
num_nei = [1, 2, 3, 5, 10, 15, 20, 25]

# NOTE(review): the features are passed to KNN without scaling although their
# magnitudes differ widely; `pre` is imported but not used here. Confirm
# whether scaling was intended.
for variable in variables:
    print("On " , variable)

    # Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X[variable],
        y[variable],
        test_size=0.2,
        random_state=0,
    )

    for num in num_nei:
        # Distance-weighted KNN regressor using `num` neighbours.
        knn = neighbors.KNeighborsRegressor(n_neighbors=num, weights='distance')
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        # Side-by-side view of the held-out targets and the predictions.
        df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
        print(df)

        # Error metrics on the held-out rows; the MSE is reused for the RMSE.
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)

        print("---------ERRORS-----------")
        print('Mean Absolute Error:', mae)
        print('Mean Squared Error:', mse)
        print('Root Mean Squared Error:', np.sqrt(mse))
        print('--------')

print('Done')

On  Confirmed
       Actual  Predicted
6116      101      101.0
9997        4        1.0
2917      189      253.0
9731    10801    10804.0
1386        5        3.0
...       ...        ...
9057        2        4.0
6871      717      585.0
10935       1        9.0
7863      167      179.0
9650       28       27.0

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 330.12041432887355
Mean Squared Error: 3722990.7807509713
Root Mean Squared Error: 1929.505320218364
--------
       Actual     Predicted
6116      101    101.000000
9997        4      1.000000
2917      189    175.145589
9731    10801  10804.516288
1386        5      3.000000
...       ...           ...
9057        2      4.000000
6871      717    543.272977
10935       1      5.686292
7863      167    167.742880
9650       28     27.466517

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 131.0259890469051
Mean Squared Error: 1027078.7062428932
Root Mean Squared Error: 1013.4489164

Neural Network Regression
=======================