In [1]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing as pre

In [3]:
covid_data = pd.read_csv('covid_data_clean.csv')
countries_data = pd.read_csv('countries_data_clean.csv',decimal=',')

In [4]:
covid_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered,Ellapsed Days
0,0,Afghanistan,2020-02-24,1,0,0,0
1,1,Afghanistan,2020-02-25,1,0,0,1
2,2,Afghanistan,2020-02-26,1,0,0,2
3,3,Afghanistan,2020-02-27,1,0,0,3
4,4,Afghanistan,2020-02-28,1,0,0,4
5,5,Afghanistan,2020-02-29,1,0,0,5
6,6,Afghanistan,2020-03-01,1,0,0,6
7,7,Afghanistan,2020-03-02,1,0,0,7
8,8,Afghanistan,2020-03-03,1,0,0,8
9,9,Afghanistan,2020-03-04,1,0,0,9


In [5]:
countries_data.head(10)

Unnamed: 0.1,Unnamed: 0,Country,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate
0,0,Afghanistan,31056997,48.0,163.07,700.0,20.34
1,1,Albania,3581655,124.6,21.52,4500.0,5.22
2,2,Algeria,32930091,13.8,31.0,6000.0,4.61
3,3,Samoa,57794,290.4,9.27,8000.0,3.27
4,4,Andorra,71201,152.1,4.05,19000.0,6.25
5,5,Angola,12127071,9.7,191.19,1900.0,24.2
6,7,Antigua and Barbuda,69108,156.0,19.46,11000.0,5.37
7,8,Argentina,39921833,14.4,15.18,11200.0,7.55
8,9,Armenia,2976372,99.9,23.28,3500.0,8.23
9,11,Australia,20264082,2.6,4.69,29000.0,7.51


In [6]:
#Merge the two columns together and drop uncessary columns
merged_inner = pd.merge(left=covid_data, right=countries_data, left_on='Country/Region', right_on='Country').drop(columns=['Country', 'Date', 'Unnamed: 0_x', 'Unnamed: 0_y'])

#Drop rows with at least one missing value
merged_inner.dropna(inplace=True)

def createPrevValues(dataset, originalColumnName, commonColumnName, newColumnName):
    dataset[newColumnName] = dataset.loc[merged_inner[commonColumnName].shift(-1)==dataset[commonColumnName], originalColumnName]
    dataset[newColumnName] = dataset[newColumnName].shift()
    dataset[newColumnName].fillna(0, inplace=True)

#Get cases of previous day
createPrevValues(merged_inner, 'Confirmed', 'Country/Region', 'Prev_Confirmed')

#Get deaths of previous day
createPrevValues(merged_inner, 'Deaths', 'Country/Region', 'Prev_Deaths')

#Get Recovered of previous day
createPrevValues(merged_inner, 'Recovered', 'Country/Region', 'Prev_Recovered')


merged_inner.head(50)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Ellapsed Days,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),GDP ($ per capita),Deathrate,Prev_Confirmed,Prev_Deaths,Prev_Recovered
0,Afghanistan,1,0,0,0,31056997,48.0,163.07,700.0,20.34,0.0,0.0,0.0
1,Afghanistan,1,0,0,1,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
2,Afghanistan,1,0,0,2,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
3,Afghanistan,1,0,0,3,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
4,Afghanistan,1,0,0,4,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
5,Afghanistan,1,0,0,5,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
6,Afghanistan,1,0,0,6,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
7,Afghanistan,1,0,0,7,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
8,Afghanistan,1,0,0,8,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0
9,Afghanistan,1,0,0,9,31056997,48.0,163.07,700.0,20.34,1.0,0.0,0.0


In [7]:
#Get set with the values that the model receives
X = {'Confirmed' : merged_inner.drop(columns=[ 'Country/Region', 'Confirmed']) , 'Deaths' : merged_inner.drop(columns=[ 'Country/Region', 'Deaths'])}

#Values the model should predict
y = {'Confirmed' : merged_inner['Confirmed']  , 'Deaths' : merged_inner['Deaths']}

In [8]:
from sklearn import metrics
#Test case
variables = ['Confirmed' , 'Deaths']
    
#different nr of iterations to compare results
max_iter_array = [100, 150, 200, 250 , 300 , 350]


for variable in variables:
    #Setting up training sets
    
    X_train, X_test, y_train, y_test = train_test_split(X[variable], y[variable], test_size=0.2, random_state=0)
    print('On variable : ' , variable)
    
    for it in max_iter_array:
        print('On iteration ', it)
        mlpr = MLPRegressor(solver = "lbfgs" , max_iter=it)

        #need to fix date
        mlpr.fit(X_train , y_train)

        #predict model
        #predict_train = mlpr.predict(X_train_scaled)

        #predict test
        y_pred = mlpr.predict(X_test)
        print('Predict: ' , y_pred)

        df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
        print(df)

        #Show Errors
        print("---------ERRORS-----------")
        print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
        print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
        print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 
        print('--------')
    
      
print("Done")

On variable :  Confirmed
On iteration  100
Predict:  [ 80.87837805  43.48376759 358.29044256 ...  20.30222717 320.47106054
 325.91855441]
       Actual     Predicted
6116      101     80.878378
9997        4     43.483768
2917      189    358.290443
9731    10801  11173.867250
1386        5     82.866166
...       ...           ...
9057        2     13.147837
6871      717    773.373841
10935       1     20.302227
7863      167    320.471061
9650       28    325.918554

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 333.91400570028503
Mean Squared Error: 905823.4437701805
Root Mean Squared Error: 951.7475735562348
--------
On iteration  150
Predict:  [ 85.80258944 -26.82283414  88.30539039 ...  33.96640504  84.16187795
  43.91961797]
       Actual     Predicted
6116      101     85.802589
9997        4    -26.822834
2917      189     88.305390
9731    10801  10450.521069
1386        5    -18.747082
...       ...           ...
9057        2      6.063434
6871   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [133.90598053  85.1945013  462.87839532 ...  48.48191155 420.63901774
 376.15181737]
       Actual     Predicted
6116      101    133.905981
9997        4     85.194501
2917      189    462.878395
9731    10801  10975.483035
1386        5    124.315589
...       ...           ...
9057        2     43.357403
6871      717    879.676226
10935       1     48.481912
7863      167    420.639018
9650       28    376.151817

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 346.27817178457207
Mean Squared Error: 794261.7512129842
Root Mean Squared Error: 891.2136394899845
--------
On iteration  250
Predict:  [120.82579837   0.37295012 357.40629666 ...  48.11664508 223.39204238
 314.32353891]
       Actual     Predicted
6116      101    120.825798
9997        4      0.372950
2917      189    357.406297
9731    10801  10901.095372
1386        5     86.624163
...       ...           ...
9057        2     34.555431
6871      717    818.504697
10935       1     48.1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [ 21.11355647   9.30204524  17.21457747 ... -31.95175582  33.11463653
 -52.62960771]
       Actual   Predicted
6116        8   21.113556
9997        0    9.302045
2917        0   17.214577
9731      252  915.895710
1386        0    3.502571
...       ...         ...
9057        0   -8.981626
6871       12  -85.094606
10935       0  -31.951756
7863        0   33.114637
9650        0  -52.629608

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 313.2457857055314
Mean Squared Error: 1408642.3978265284
Root Mean Squared Error: 1186.862417395769
--------
On iteration  150


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [ 73.37527414 134.46145723 553.80436128 ... 103.06095288 453.4234574
 688.57099127]
       Actual    Predicted
6116        8    73.375274
9997        0   134.461457
2917        0   553.804361
9731      252  1010.042335
1386        0   210.935190
...       ...          ...
9057        0    86.387031
6871       12   611.188331
10935       0   103.060953
7863        0   453.423457
9650        0   688.570991

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 762.4943204675361
Mean Squared Error: 3217290.552709303
Root Mean Squared Error: 1793.6807276405973
--------
On iteration  200
Predict:  [  -3.91935459  -13.43538845  -51.95886654 ...  -44.43129811  -38.96194952
 -137.49503142]
       Actual   Predicted
6116        8   -3.919355
9997        0  -13.435388
2917        0  -51.958867
9731      252  608.630810
1386        0  -28.951976
...       ...         ...
9057        0  -22.706118
6871       12 -103.941403
10935       0  -44.431298
7863        0  -38.96

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Predict:  [ 17.61219859  72.66309962 278.67056439 ...   3.97202016 244.61372597
 250.59237603]
       Actual   Predicted
6116        8   17.612199
9997        0   72.663100
2917        0  278.670564
9731      252  512.704368
1386        0   98.743933
...       ...         ...
9057        0   17.991309
6871       12  185.763322
10935       0    3.972020
7863        0  244.613726
9650        0  250.592376

[2317 rows x 2 columns]
---------ERRORS-----------
Mean Absolute Error: 372.9919604321282
Mean Squared Error: 1320491.8966553877
Root Mean Squared Error: 1149.1265799098844
--------
On iteration  350
Predict:  [ -2.20920601  32.65724903 136.09013366 ...   4.89148513 113.61939499
 160.31606285]
       Actual   Predicted
6116        8   -2.209206
9997        0   32.657249
2917        0  136.090134
9731      252  455.305652
1386        0   52.695024
...       ...         ...
9057        0    8.627256
6871       12   65.902914
10935       0    4.891485
7863        0  113.619395
9650       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [9]:
#Values to compare with error -> less than 10% of the mean result is a good success
merged_inner.describe()  

Unnamed: 0,Confirmed,Deaths,Recovered,Ellapsed Days,Population,Pop. Density (per sq. mi.),Infant mortality (per 1000 births),Deathrate,Prev_Confirmed,Prev_Deaths,Prev_Recovered
count,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0,11581.0
mean,8500.861,561.370866,2332.912184,34.777653,48414040.0,293.000466,35.00825,9.472936,8169.845,538.116484,2224.839133
std,51117.0,3537.549173,12026.504142,23.305229,163556300.0,1390.625236,35.220779,4.69934,49550.15,3425.9305,11632.311556
min,1.0,0.0,0.0,0.0,29251.0,1.8,2.29,2.41,0.0,0.0,0.0
25%,15.0,0.0,0.0,16.0,3102229.0,29.3,7.03,6.21,13.0,0.0,0.0
50%,138.0,2.0,12.0,32.0,9690222.0,77.4,20.97,8.3,124.0,2.0,11.0
75%,1445.0,31.0,201.0,50.0,32930090.0,182.2,55.51,11.11,1380.0,28.0,177.0
max,1257023.0,75662.0,195036.0,106.0,1313974000.0,16271.5,191.19,29.74,1228603.0,73431.0,189910.0
