In [1]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing as pre

In [3]:
covid_data = pd.read_csv('covid_data_clean.csv')
countries_data = pd.read_csv('countries_data_clean.csv',decimal=',')

In [4]:
covid_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered,Ellapsed Days
0,0,Afghanistan,2020-02-24,1,0,0,0
1,1,Afghanistan,2020-02-25,1,0,0,1
2,2,Afghanistan,2020-02-26,1,0,0,2
3,3,Afghanistan,2020-02-27,1,0,0,3
4,4,Afghanistan,2020-02-28,1,0,0,4
5,5,Afghanistan,2020-02-29,1,0,0,5
6,6,Afghanistan,2020-03-01,1,0,0,6
7,7,Afghanistan,2020-03-02,1,0,0,7
8,8,Afghanistan,2020-03-03,1,0,0,8
9,9,Afghanistan,2020-03-04,1,0,0,9


In [5]:
countries_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate
0,0,Afghanistan,31056997,48.0,163.07,700.0,20.34
1,1,Albania,3581655,124.6,21.52,4500.0,5.22
2,2,Algeria,32930091,13.8,31.0,6000.0,4.61
3,3,Samoa,57794,290.4,9.27,8000.0,3.27
4,4,Andorra,71201,152.1,4.05,19000.0,6.25
5,5,Angola,12127071,9.7,191.19,1900.0,24.2
6,7,Antigua and Barbuda,69108,156.0,19.46,11000.0,5.37
7,8,Argentina,39921833,14.4,15.18,11200.0,7.55
8,9,Armenia,2976372,99.9,23.28,3500.0,8.23
9,11,Australia,20264082,2.6,4.69,29000.0,7.51


In [6]:
#Merge the two columns together and drop uncessary columns
merged_inner = pd.merge(left=covid_data, right=countries_data, left_on='Country/Region', right_on='Country').drop(columns=['Country', 'Date', 'Unnamed: 0_x', 'Unnamed: 0_y'])

#Drop rows with at least one missing value
merged_inner.dropna(inplace=True)

def createPrevValues(dataset, originalColumnName, commonColumnName, newColumnName):
    dataset[newColumnName] = dataset.loc[merged_inner[commonColumnName].shift(-1)==dataset[commonColumnName], originalColumnName]
    dataset[newColumnName] = dataset[newColumnName].shift()
    dataset[newColumnName].fillna(0, inplace=True)

#Get cases of previous day
createPrevValues(merged_inner, 'Confirmed', 'Country/Region', 'Prev_Confirmed')

#Get deaths of previous day
createPrevValues(merged_inner, 'Deaths', 'Country/Region', 'Prev_Deaths')

#Get Recovered of previous day
createPrevValues(merged_inner, 'Recovered', 'Country/Region', 'Prev_Recovered')


merged_inner.head(50)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Ellapsed Days,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate,Prev_Confirmed,Prev_Deaths,Prev_Recovered
0,Afghanistan,1,0,0,0,31056997,48.0,163.07,700.0,20.34,0.0,0.0,0.0
1,Afghanistan,1,0,0,1,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
2,Afghanistan,1,0,0,2,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
3,Afghanistan,1,0,0,3,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
4,Afghanistan,1,0,0,4,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
5,Afghanistan,1,0,0,5,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
6,Afghanistan,1,0,0,6,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
7,Afghanistan,1,0,0,7,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
8,Afghanistan,1,0,0,8,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
9,Afghanistan,1,0,0,9,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0


In [7]:
#Get set with the values that the model receives
X = {'Confirmed' : merged_inner.drop(columns=[ 'Country/Region', 'Confirmed', 'Recovered']) , 'Deaths' : merged_inner.drop(columns=[ 'Country/Region', 'Deaths', 'Recovered'])}

#Values the model should predict
y = {'Confirmed' : merged_inner['Confirmed']  , 'Deaths' : merged_inner['Deaths']}

In [8]:
from sklearn import metrics
#Test case
variables = ['Confirmed' , 'Deaths']
    
#different nr of iterations to compare results
max_iter_array = [150, 200, 250 , 300 , 350]


for variable in variables:
    #Setting up training sets
    
    X_train, X_test, y_train, y_test = train_test_split(X[variable], y[variable], test_size=0.2, random_state=0)
    print('On variable : ' , variable)
    
    for it in max_iter_array:
        print('On iteration ', it)
        mlpr = MLPRegressor(solver = "lbfgs" , max_iter=it)

        #need to fix date
        mlpr.fit(X_train , y_train)

        #predict model
        #predict_train = mlpr.predict(X_train_scaled)

        #predict test
        y_pred = mlpr.predict(X_test)
        print('Predict: ' , y_pred)

        df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
        print(df)

        #Show Errors
        print("---------ERRORS-----------")
        print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
        print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
        print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 
        print('--------')
    
      
print("Done")

On variable :  Confirmed
On iteration  100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [ 175.07799674  206.14695685  997.71425692 ...   31.94956869  828.07530993
 1043.8982713 ]
       Actual     Predicted
6116      101    175.077997
9997        4    206.146957
2917      189    997.714257
9731    10801  12973.468961
1386        5    343.033344
...       ...           ...
9057        2     81.633517
6871      717    907.656779
10935       1     31.949569
7863      167    828.075310
9650       28   1043.898271

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 867.1112959819386
Mean Squared Error: 3914194.6866746834
Root Mean Squared Error: 1978.4323811226614
--------
On iteration  150
Predict:  [129.00825224   0.28475494 184.19810099 ...  77.22808811 154.73568461
 220.99426955]
       Actual     Predicted
6116      101    129.008252
9997        4      0.284755
2917      189    184.198101
9731    10801  10737.208302
1386        5     30.709006
...       ...           ...
9057        2     38.158395
6871      717    757.490507
10935       1  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [119.86878522  99.28251501 494.7024381  ...  23.58291813 471.86985386
 332.71841261]
       Actual     Predicted
6116      101    119.868785
9997        4     99.282515
2917      189    494.702438
9731    10801  10929.516809
1386        5    126.012018
...       ...           ...
9057        2     32.563538
6871      717    917.412379
10935       1     23.582918
7863      167    471.869854
9650       28    332.718413

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 388.6257459615661
Mean Squared Error: 985234.0546028562
Root Mean Squared Error: 992.5895700655212
--------
On iteration  250


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [135.07869904  60.4553598  339.72967411 ...  15.00470898 343.70074528
 180.01571007]
       Actual     Predicted
6116      101    135.078699
9997        4     60.455360
2917      189    339.729674
9731    10801  11018.435709
1386        5     71.395232
...       ...           ...
9057        2     21.387257
6871      717    789.803274
10935       1     15.004709
7863      167    343.700745
9650       28    180.015710

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 329.5492435900642
Mean Squared Error: 1033925.7180845265
Root Mean Squared Error: 1016.8213796358368
--------
On iteration  300
Predict:  [141.97439292  52.30534066 309.86912017 ...  51.16136502 295.44297756
 244.18903331]
       Actual     Predicted
6116      101    141.974393
9997        4     52.305341
2917      189    309.869120
9731    10801  10835.820021
1386        5     68.601268
...       ...           ...
9057        2     35.573327
6871      717    817.927391
10935       1     51.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [ 33.8769863  119.69352136 460.97872425 ... -11.79744002 396.11514769
 442.69474132]
       Actual   Predicted
6116        8   33.876986
9997        0  119.693521
2917        0  460.978724
9731      252  850.948904
1386        0  175.521840
...       ...         ...
9057        0   25.970340
6871       12   93.580362
10935       0  -11.797440
7863        0  396.115148
9650        0  442.694741

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 530.3771828479784
Mean Squared Error: 1778867.827484642
Root Mean Squared Error: 1333.7420393331845
--------
On iteration  150
Predict:  [ 30.64568945  89.28221529 378.21474014 ...   6.33837401 310.14343402
 401.13374324]
       Actual    Predicted
6116        8    30.645689
9997        0    89.282215
2917        0   378.214740
9731      252  1500.373904
1386        0   142.611558
...       ...          ...
9057        0    26.258817
6871       12   143.235202
10935       0     6.338374
7863        0   310.143434
9

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [-25.78287786  32.80134554 151.8997081  ... -53.09896868 129.40780877
  85.65483541]
       Actual    Predicted
6116        8   -25.782878
9997        0    32.801346
2917        0   151.899708
9731      252  1024.720384
1386        0    46.333505
...       ...          ...
9057        0   -19.446464
6871       12   -43.276137
10935       0   -53.098969
7863        0   129.407809
9650        0    85.654835

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 421.6171051076469
Mean Squared Error: 2106500.6290899743
Root Mean Squared Error: 1451.3788716561828
--------
On iteration  250


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [ 20.40614901   0.8720957   -0.55743872 ...  -9.03965881  -2.03408395
 -13.35203424]
       Actual   Predicted
6116        8   20.406149
9997        0    0.872096
2917        0   -0.557439
9731      252  690.018006
1386        0    2.995984
...       ...         ...
9057        0   -0.034129
6871       12  -77.366675
10935       0   -9.039659
7863        0   -2.034084
9650        0  -13.352034

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 216.10101137232158
Mean Squared Error: 545488.1316325499
Root Mean Squared Error: 738.5716834759845
--------
On iteration  300


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [ 43.14310238  14.64594652  98.866262   ...  58.14196249 124.77539968
 146.86935335]
       Actual   Predicted
6116        8   43.143102
9997        0   14.645947
2917        0   98.866262
9731      252  209.965707
1386        0   44.114501
...       ...         ...
9057        0   38.777873
6871       12  174.524124
10935       0   58.141962
7863        0  124.775400
9650        0  146.869353

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 190.28186823473732
Mean Squared Error: 317298.57060037536
Root Mean Squared Error: 563.2926154321352
--------
On iteration  350
Predict:  [  8.68581412  28.6217731  116.55715618 ... -19.11228101 101.82907943
 105.99385653]
       Actual   Predicted
6116        8    8.685814
9997        0   28.621773
2917        0  116.557156
9731      252  989.893429
1386        0   45.344876
...       ...         ...
9057        0   -0.715336
6871       12  -44.926620
10935       0  -19.112281
7863        0  101.829079
9650       

In [9]:
#Values to compare with error -> less than 10% of the mean result is a good success
merged_inner.describe()  

Unnamed: 0,Confirmed,Deaths,Recovered,Ellapsed Days,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),Deathrate,Prev_Confirmed,Prev_Deaths,Prev_Recovered
count,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0
mean,8500.861,561.370866,2332.912184,34.777653,48414040.0,293.000466,35.00825,9.472936,8169.845,538.116484,2224.839133
std,51117.0,3537.549173,12026.504142,23.305229,163556300.0,1390.625236,35.220779,4.69934,49550.15,3425.9305,11632.311556
min,1.0,0.0,0.0,0.0,29251.0,1.8,2.29,2.41,0.0,0.0,0.0
25%,15.0,0.0,0.0,16.0,3102229.0,29.3,7.03,6.21,13.0,0.0,0.0
50%,138.0,2.0,12.0,32.0,9690222.0,77.4,20.97,8.3,124.0,2.0,11.0
75%,1445.0,31.0,201.0,50.0,32930090.0,182.2,55.51,11.11,1380.0,28.0,177.0
max,1257023.0,75662.0,195036.0,106.0,1313974000.0,16271.5,191.19,29.74,1228603.0,73431.0,189910.0
