In [1]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing as pre

In [3]:
covid_data = pd.read_csv('covid_data_clean.csv')
countries_data = pd.read_csv('countries_data_clean.csv',decimal=',')

In [4]:
covid_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered,Ellapsed Days
0,0,Afghanistan,2020-02-24,1,0,0,0
1,1,Afghanistan,2020-02-25,1,0,0,1
2,2,Afghanistan,2020-02-26,1,0,0,2
3,3,Afghanistan,2020-02-27,1,0,0,3
4,4,Afghanistan,2020-02-28,1,0,0,4
5,5,Afghanistan,2020-02-29,1,0,0,5
6,6,Afghanistan,2020-03-01,1,0,0,6
7,7,Afghanistan,2020-03-02,1,0,0,7
8,8,Afghanistan,2020-03-03,1,0,0,8
9,9,Afghanistan,2020-03-04,1,0,0,9


In [5]:
countries_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate
0,0,Afghanistan,31056997,48.0,163.07,700.0,20.34
1,1,Albania,3581655,124.6,21.52,4500.0,5.22
2,2,Algeria,32930091,13.8,31.0,6000.0,4.61
3,3,Samoa,57794,290.4,9.27,8000.0,3.27
4,4,Andorra,71201,152.1,4.05,19000.0,6.25
5,5,Angola,12127071,9.7,191.19,1900.0,24.2
6,7,Antigua and Barbuda,69108,156.0,19.46,11000.0,5.37
7,8,Argentina,39921833,14.4,15.18,11200.0,7.55
8,9,Armenia,2976372,99.9,23.28,3500.0,8.23
9,11,Australia,20264082,2.6,4.69,29000.0,7.51


In [6]:
#Merge the two columns together and drop uncessary columns
merged_inner = pd.merge(left=covid_data, right=countries_data, left_on='Country/Region', right_on='Country').drop(columns=['Country', 'Date', 'Unnamed: 0_x', 'Unnamed: 0_y'])

#Drop rows with at least one missing value
merged_inner.dropna(inplace=True)

def createPrevValues(dataset, originalColumnName, commonColumnName, newColumnName):
    dataset[newColumnName] = dataset.loc[merged_inner[commonColumnName].shift(-1)==dataset[commonColumnName], originalColumnName]
    dataset[newColumnName] = dataset[newColumnName].shift()
    dataset[newColumnName].fillna(0, inplace=True)

#Get cases of previous day
createPrevValues(merged_inner, 'Confirmed', 'Country/Region', 'Prev_Confirmed')

#Get deaths of previous day
createPrevValues(merged_inner, 'Deaths', 'Country/Region', 'Prev_Deaths')

#Get Recovered of previous day
createPrevValues(merged_inner, 'Recovered', 'Country/Region', 'Prev_Recovered')

merged_inner.head(50)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Ellapsed Days,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate,Prev_Confirmed,Prev_Deaths,Prev_Recovered
0,Afghanistan,1,0,0,0,31056997,48.0,163.07,700.0,20.34,0.0,0.0,0.0
1,Afghanistan,1,0,0,1,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
2,Afghanistan,1,0,0,2,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
3,Afghanistan,1,0,0,3,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
4,Afghanistan,1,0,0,4,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
5,Afghanistan,1,0,0,5,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
6,Afghanistan,1,0,0,6,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
7,Afghanistan,1,0,0,7,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
8,Afghanistan,1,0,0,8,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
9,Afghanistan,1,0,0,9,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0


In [10]:
#Get set with the values that the model receives
X = merged_inner.drop(columns=['Country/Region', 'Deaths'])

#Values the model should predict
y = merged_inner['Confirmed']

In [11]:
from sklearn import metrics
#Test case
variables = ['Cases' , 'Deaths']

variablesScores = {}

iteration_scores = []
    
#different nr of iterations to compare results
max_iter_array = [50, 100, 150, 200, 250]

#Setting up training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
for it in max_iter_array:
    mlpr = MLPRegressor(max_iter = it)
    print('On iteration ', it)
        
    #need to fix date
    mlpr.fit(X_train , y_train)
        
    #predict model
    #predict_train = mlpr.predict(X_train_scaled)
        
    #predict test
    y_pred = mlpr.predict(X_test)
    print('Predict: ' , y_pred)
        
    df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
    print(df)
        
    #Show Errors
    print("---------ERRORS-----------")
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 
    print('--------')
        
print("Done")

On iteration  50




Predict:  [ 177.93489806  289.18823242 1255.25746584 ...   88.06971347 1090.25531653
 1217.62324842]
       Actual     Predicted
6116      101    177.934898
9997        4    289.188232
2917      189   1255.257466
9731    10801  12740.828272
1386        5    425.923921
...       ...           ...
9057        2    110.849080
6871      717   1264.583778
10935       1     88.069713
7863      167   1090.255317
9650       28   1217.623248

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 875.2469234878233
Mean Squared Error: 4165269.74236045
Root Mean Squared Error: 2040.899248458985
--------
On iteration  100
Predict:  [ 2376.91997835   243.02135737  7535.64026694 ... 21335.82711818
  2204.89668066 36584.65160052]
       Actual     Predicted
6116      101   2376.919978
9997        4    243.021357
2917      189   7535.640267
9731    10801  48017.393213
1386        5   3261.288037
...       ...           ...
9057        2   9046.672457
6871      717  81902.625937
10935 



Predict:  [ 83.21159582  -2.22173382 191.42857466 ...  86.75233018 167.14745185
 161.08889697]
       Actual     Predicted
6116      101     83.211596
9997        4     -2.221734
2917      189    191.428575
9731    10801  10563.133757
1386        5      3.802178
...       ...           ...
9057        2     30.318509
6871      717   1112.318290
10935       1     86.752330
7863      167    167.147452
9650       28    161.088897

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 313.6459324534118
Mean Squared Error: 1047211.2198283933
Root Mean Squared Error: 1023.3333864525252
--------
On iteration  200
Predict:  [ 2785.64426167   361.43634644  8869.09127274 ... 24877.65280567
  2747.55075891 42771.63905969]
       Actual     Predicted
6116      101   2785.644262
9997        4    361.436346
2917      189   8869.091273
9731    10801  53003.425197
1386        5   3846.380116
...       ...           ...
9057        2  10558.270264
6871      717  95383.111502
10935    