In [1]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing as pre

In [3]:
covid_data = pd.read_csv('covid_data_clean.csv')
countries_data = pd.read_csv('countries_data_clean.csv',decimal=',')

In [4]:
covid_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered,Ellapsed Days
0,0,Afghanistan,2020-02-24,1,0,0,0
1,1,Afghanistan,2020-02-25,1,0,0,1
2,2,Afghanistan,2020-02-26,1,0,0,2
3,3,Afghanistan,2020-02-27,1,0,0,3
4,4,Afghanistan,2020-02-28,1,0,0,4
5,5,Afghanistan,2020-02-29,1,0,0,5
6,6,Afghanistan,2020-03-01,1,0,0,6
7,7,Afghanistan,2020-03-02,1,0,0,7
8,8,Afghanistan,2020-03-03,1,0,0,8
9,9,Afghanistan,2020-03-04,1,0,0,9


In [5]:
countries_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate
0,0,Afghanistan,31056997,48.0,163.07,700.0,20.34
1,1,Albania,3581655,124.6,21.52,4500.0,5.22
2,2,Algeria,32930091,13.8,31.0,6000.0,4.61
3,3,Samoa,57794,290.4,9.27,8000.0,3.27
4,4,Andorra,71201,152.1,4.05,19000.0,6.25
5,5,Angola,12127071,9.7,191.19,1900.0,24.2
6,7,Antigua and Barbuda,69108,156.0,19.46,11000.0,5.37
7,8,Argentina,39921833,14.4,15.18,11200.0,7.55
8,9,Armenia,2976372,99.9,23.28,3500.0,8.23
9,11,Australia,20264082,2.6,4.69,29000.0,7.51


In [6]:
#Merge the two columns together and drop uncessary columns
merged_inner = pd.merge(left=covid_data, right=countries_data, left_on='Country/Region', right_on='Country').drop(columns=['Country', 'Date', 'Unnamed: 0_x', 'Unnamed: 0_y'])

#Drop rows with at least one missing value
merged_inner.dropna(inplace=True)

def createPrevValues(dataset, originalColumnName, commonColumnName, newColumnName):
    dataset[newColumnName] = dataset.loc[merged_inner[commonColumnName].shift(-1)==dataset[commonColumnName], originalColumnName]
    dataset[newColumnName] = dataset[newColumnName].shift()
    dataset[newColumnName].fillna(0, inplace=True)

#Get cases of previous day
createPrevValues(merged_inner, 'Confirmed', 'Country/Region', 'Prev_Confirmed')

#Get deaths of previous day
createPrevValues(merged_inner, 'Deaths', 'Country/Region', 'Prev_Deaths')

#Get Recovered of previous day
createPrevValues(merged_inner, 'Recovered', 'Country/Region', 'Prev_Recovered')


def enconde_string(name):
    mybytes = name.encode('utf-8')
    return int.from_bytes(mybytes, 'little')

merged_inner['Country/Region'] = merged_inner['Country/Region'].apply(enconde_string)


merged_inner.head(50)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Ellapsed Days,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate,Prev_Confirmed,Prev_Deaths,Prev_Recovered
0,133442057845059666670216769,1,0,0,0,31056997,48.0,163.07,700.0,20.34,0.0,0.0,0.0
1,133442057845059666670216769,1,0,0,1,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
2,133442057845059666670216769,1,0,0,2,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
3,133442057845059666670216769,1,0,0,3,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
4,133442057845059666670216769,1,0,0,4,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
5,133442057845059666670216769,1,0,0,5,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
6,133442057845059666670216769,1,0,0,6,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
7,133442057845059666670216769,1,0,0,7,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
8,133442057845059666670216769,1,0,0,8,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
9,133442057845059666670216769,1,0,0,9,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0


In [7]:
#Get set with the values that the model receives
tempx = merged_inner.drop(columns=[ 'Country/Region', 'Confirmed'])
X = {'Confimed' : tempx , 'Deaths' : merged_inner.drop(columns=[ 'Country/Region', 'Deaths'])}

#Values the model should predict
y = {'Confimed' : merged_inner['Confirmed']  , 'Deaths' : merged_inner['Deaths']}

In [10]:
from sklearn import metrics
#Test case
variables = ['Confimed' , 'Deaths']
    
#different nr of iterations to compare results
max_iter_array = [100, 150, 200, 250 , 300 , 350]


for variable in variables:
    #Setting up training sets
    
    X_train, X_test, y_train, y_test = train_test_split(X[variable], y[variable], test_size=0.2, random_state=0)
    print('On variable : ' , variable)
    
    for it in max_iter_array:
        mlpr = MLPRegressor(solver = "lbfgs" , max_iter=it)
        print('On iteration ', it)

        #need to fix date
        mlpr.fit(X_train , y_train)

        #predict model
        #predict_train = mlpr.predict(X_train_scaled)

        #predict test
        y_pred = mlpr.predict(X_test)
        print('Predict: ' , y_pred)

        df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
        print(df)

        #Show Errors
        print("---------ERRORS-----------")
        print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
        print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
        print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 
        print('--------')
        #Values to compare with error -> less than 10% of the mean result is a good success
    
      
print("Done")

On variable :  Confimed
On iteration  100
Predict:  [161.37522822  62.06638044 375.04858275 ...  74.54515165 340.32557484
 344.38386766]
       Actual     Predicted
6116      101    161.375228
9997        4     62.066380
2917      189    375.048583
9731    10801  10903.438912
1386        5    100.246235
...       ...           ...
9057        2     54.469904
6871      717    842.516021
10935       1     74.545152
7863      167    340.325575
9650       28    344.383868

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 283.0817741875947
Mean Squared Error: 639663.126400255
Root Mean Squared Error: 799.789426286854
--------
On iteration  150


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [143.22867252  31.77752558 219.2825979  ...  20.75135399 194.37956204
 146.73194706]
       Actual     Predicted
6116      101    143.228673
9997        4     31.777526
2917      189    219.282598
9731    10801  10820.959623
1386        5     28.936971
...       ...           ...
9057        2     23.829019
6871      717    817.180219
10935       1     20.751354
7863      167    194.379562
9650       28    146.731947

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 282.5632694531001
Mean Squared Error: 784449.6594502858
Root Mean Squared Error: 885.691627740878
--------
On iteration  200
Predict:  [103.44222352 -20.61269235 126.33716219 ...  -4.16636121 107.33223583
 157.46490429]
       Actual     Predicted
6116      101    103.442224
9997        4    -20.612692
2917      189    126.337162
9731    10801  11459.256638
1386        5     -0.685077
...       ...           ...
9057        2      1.978420
6871      717    621.498921
10935       1     -4.166

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [137.79484264  52.66869104 352.5894744  ...  61.5698842  317.73885527
 321.18137221]
       Actual     Predicted
6116      101    137.794843
9997        4     52.668691
2917      189    352.589474
9731    10801  10947.614424
1386        5     89.517959
...       ...           ...
9057        2     43.849064
6871      717    813.535107
10935       1     61.569884
7863      167    317.738855
9650       28    321.181372

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 269.7998177286744
Mean Squared Error: 651781.1546387132
Root Mean Squared Error: 807.3296443453029
--------
On iteration  300
Predict:  [108.46033614  43.71967192 313.75488001 ...  29.02437734 295.50397329
 214.17189358]
       Actual     Predicted
6116      101    108.460336
9997        4     43.719672
2917      189    313.754880
9731    10801  10966.350265
1386        5     64.954063
...       ...           ...
9057        2     23.681128
6871      717    782.566069
10935       1     29.02

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [   5.21282933  -22.65250797  -89.30771931 ...  -33.66107997  -70.64458963
 -135.29411144]
       Actual   Predicted
6116        8    5.212829
9997        0  -22.652508
2917        0  -89.307719
9731      252  766.987945
1386        0  -35.037092
...       ...         ...
9057        0  -18.558683
6871       12 -149.714429
10935       0  -33.661080
7863        0  -70.644590
9650        0 -135.294111

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 310.61968597476533
Mean Squared Error: 1470330.828503631
Root Mean Squared Error: 1212.5719889984393
--------
On iteration  300


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [ 43.31199979  21.07568071  78.82937666 ...  43.18761085  54.48788332
 159.1820925 ]
       Actual   Predicted
6116        8   43.312000
9997        0   21.075681
2917        0   78.829377
9731      252  375.830716
1386        0   44.106409
...       ...         ...
9057        0   29.515014
6871       12   45.867067
10935       0   43.187611
7863        0   54.487883
9650        0  159.182092

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 119.17115406513345
Mean Squared Error: 81585.80827665002
Root Mean Squared Error: 285.6322955771108
--------
On iteration  350
Predict:  [ -14.30577878  -11.0498092  -212.17007696 ...  -51.42229859  -62.42150094
 -343.81720804]
       Actual   Predicted
6116        8  -14.305779
9997        0  -11.049809
2917        0 -212.170077
9731      252  496.971369
1386        0  -99.206663
...       ...         ...
9057        0  -35.722457
6871       12 -201.117200
10935       0  -51.422299
7863        0  -62.421501
9650  

In [11]:
merged_inner.describe()  

Unnamed: 0,Confirmed,Deaths,Recovered,Ellapsed Days,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),Deathrate,Prev_Confirmed,Prev_Deaths,Prev_Recovered
count,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0
mean,8500.861,561.370866,2332.912184,34.777653,48414040.0,293.000466,35.00825,9.472936,8169.845,538.116484,2224.839133
std,51117.0,3537.549173,12026.504142,23.305229,163556300.0,1390.625236,35.220779,4.69934,49550.15,3425.9305,11632.311556
min,1.0,0.0,0.0,0.0,29251.0,1.8,2.29,2.41,0.0,0.0,0.0
25%,15.0,0.0,0.0,16.0,3102229.0,29.3,7.03,6.21,13.0,0.0,0.0
50%,138.0,2.0,12.0,32.0,9690222.0,77.4,20.97,8.3,124.0,2.0,11.0
75%,1445.0,31.0,201.0,50.0,32930090.0,182.2,55.51,11.11,1380.0,28.0,177.0
max,1257023.0,75662.0,195036.0,106.0,1313974000.0,16271.5,191.19,29.74,1228603.0,73431.0,189910.0
