Using the covid-19 dataset, perform the following activities:
<ol>
    <li>Introduce the "reduce" of rmse with the different regressors (LinearRegressor, DecisionTreeRegressor, RandomForestRegressor) and using GridSearchCV to define the best parameter set for RandomForest </li>
     <li>Investigate using Support Vector Regressor (sklearn.svm.SVR) by automatically varying the hyperparameters (kernel and C) and display the RMSE</li>
</ol>

### Imports

In [47]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#### pickles

In [48]:
def read_pickle(name):
    with (open(name, 'rb')) as openfile:
        while True:
            try:
                one_instance = pickle.load(openfile)
            except EOFError:
                break
    one_instance = np.asanyarray(one_instance)
    return one_instance

In [49]:
X_train = read_pickle('X_train.pickle')
X_test = read_pickle('X_test.pickle')
y_train = read_pickle('y_train.pickle')
y_test = read_pickle('y_test.pickle')

In [50]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(113319, 8) (28330, 8) (113319,) (28330,)


<ol>
    <li> Introduce the "reduce" of rmse with the different regressors (LinearRegressor, DecisionTreeRegressor, RandomForestRegressor) and using GridSearchCV to define the best parameter set for RandomForest </li>
</ol>

### Linear Regression model training

In [51]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train) # y = ax + b

LinearRegression()

In [52]:
predictions = lin_reg.predict(X_test)

In [53]:
print(predictions.shape)
print(X_test)

(28330,)
[[-0.15693134 -0.89557775 -0.16444942 ... -0.44246783 -0.04054729
  -1.71091024]
 [ 0.06248738  1.59236828  0.08743955 ... -0.24793303 -0.04054729
   0.65435965]
 [-0.13032957 -1.14600558 -0.16811707 ... -0.44246783 -0.04054729
  -1.67999168]
 ...
 [-0.1404199   0.26401197 -0.14699534 ... -0.24486949 -0.04054729
  -1.32442823]
 [-0.09730669  0.39466997 -0.11252925 ... -0.24640126 -0.04054729
   0.11328484]
 [-0.16353592 -1.09156475 -0.1769751  ...  0.08599283 -0.04054729
   0.40701116]]


#### Linear Regression model evaluation

In [54]:
lin_mse = mean_squared_error(y_test, predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

11.67034187195746


### selecting the Regressor Decision Tree

In [55]:
from sklearn.tree import DecisionTreeRegressor

In [56]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [57]:
dt_reg.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [58]:
predictions = dt_reg.predict(X_test)
#predictions.shape

In [59]:
dt_mse = mean_squared_error(y_test, predictions)
dt_rmse = np.sqrt(dt_mse)
print(dt_rmse)

1.064343643499513


### Random Forest Regressor

In [60]:
from sklearn.ensemble import RandomForestRegressor

In [61]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)

RandomForestRegressor()

In [62]:
predictions = rf_reg.predict(X_test)
#predictions.shape

In [63]:
rf_mse = mean_squared_error(y_test, predictions)
rf_rmse = np.sqrt(rf_mse)
print(rf_mse)

0.4549705753618077


#### feature selection

In [64]:
rf_reg.feature_importances_

array([9.35996753e-01, 3.23386397e-02, 2.78298232e-04, 1.96039841e-04,
       2.61139790e-02, 4.87218428e-03, 1.56549783e-08, 2.04090181e-04])

In [66]:
pd.read_csv('data_piaui.csv').columns

Index(['confirmed', 'order_for_place', 'estimated_population_2019',
       'estimated_population', 'confirmed_per_100k_inhabitants', 'death_rate',
       'is_last', 'city'],
      dtype='object')

In [67]:
# remove
# 3 estimated_population
# 6 is_last
X_train_feat_selected = np.delete(X_train, 6, 1)
X_train_feat_selected.shape

(113319, 7)

In [68]:
X_train_feat_selected = np.delete(X_train_feat_selected, 3, 1)
X_train_feat_selected.shape

(113319, 6)

In [69]:
X_test_feat_selected = np.delete(X_test, 6, 1)
X_test_feat_selected = np.delete(X_test_feat_selected, 3, 1)
X_test_feat_selected.shape

(28330, 6)

#### Random Forest Regressor with selected features

In [70]:
rf_reg_s = RandomForestRegressor()
rf_reg_s.fit(X_train_feat_selected, y_train)

RandomForestRegressor()

In [71]:
rf_predictions = rf_reg_s.predict(X_test_feat_selected)
predictions.shape

(28330,)

In [72]:
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)
print(rf_mse)

0.35972757853865167


### model hyperparameter adjustment

In [73]:
rf_reg_s.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [74]:
from sklearn.model_selection import GridSearchCV

In [75]:
# 'bootstrap': [False] -> uses the ENTIRE training set to train each of the binary trees
# 'n_estimators': [80, 90, 100, 120] -> determines the number of trees I will use / default is 100
# 'n_jobs': [-1] -> will use all processors on my machine / 0 -> use only 1 processor
param_grid = [
             {'n_estimators': [80, 90, 100, 120], 'max_features': [2, 4, 6], 'bootstrap': [False], 'n_jobs': [-1]}
]

In [76]:
rf_reg_best = RandomForestRegressor()

In [77]:
# cv=5 -> cross validation - guarantees that all data will be used for training and testing
grid_search = GridSearchCV(rf_reg_best, param_grid, cv = 5, scoring='neg_root_mean_squared_error')

In [78]:
grid_search.fit(X_train_feat_selected, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'bootstrap': [False], 'max_features': [2, 4, 6],
                          'n_estimators': [80, 90, 100, 120], 'n_jobs': [-1]}],
             scoring='neg_root_mean_squared_error')

In [79]:
# best parameters
grid_search.best_params_

{'bootstrap': False, 'max_features': 4, 'n_estimators': 90, 'n_jobs': -1}

In [80]:
# using the best parameters in the model
rf_reg_best = RandomForestRegressor(max_features=4, n_estimators=90, bootstrap=False, n_jobs=-1)
rf_reg_best.fit(X_train_feat_selected, y_train)

RandomForestRegressor(bootstrap=False, max_features=4, n_estimators=90,
                      n_jobs=-1)

In [81]:
rf_best_predictions = rf_reg_best.predict(X_test_feat_selected)
#rf_best_predictions.shape

In [82]:
rf_mse = mean_squared_error(y_test, rf_best_predictions)
rf_rmse = np.sqrt(rf_mse)
print(rf_mse)

0.24428669168050268


#### selection characteristics of the best regressor - Random Forest Regressor

In [83]:
rf_reg_best.feature_importances_

array([0.68302135, 0.04164834, 0.19562653, 0.05539346, 0.00409034,
       0.02021999])

In [84]:
min(rf_reg_best.feature_importances_)

0.004090337045561984

In [87]:
X_train_feat_selected = np.delete(X_train_feat_selected, 4, 1)
X_test_feat_selected = np.delete(X_test_feat_selected, 4, 1)

In [88]:
rf_reg_best_s = RandomForestRegressor(max_features=4, n_estimators=90, bootstrap=False, n_jobs=-1)
rf_reg_best_s.fit(X_train_feat_selected, y_train)

RandomForestRegressor(bootstrap=False, max_features=4, n_estimators=90,
                      n_jobs=-1)

In [89]:
rf_best_predictions_s = rf_reg_best_s.predict(X_test_feat_selected)

In [90]:
rf_mse = mean_squared_error(y_test, rf_best_predictions_s)
rf_rmse = np.sqrt(rf_mse)
print(rf_mse)

0.1798341460651143


#### one more time

In [91]:
rf_reg_best_s.feature_importances_

array([0.78599309, 0.04020972, 0.14278413, 0.02978447, 0.0012286 ])

In [92]:
min(rf_reg_best_s.feature_importances_)

0.0012285971851190441

In [94]:
X_train_feat_selected_last = np.delete(X_train_feat_selected, 4, 1)
X_test_feat_selected_last = np.delete(X_test_feat_selected, 4, 1)

In [95]:
rf_reg_best_last = RandomForestRegressor(max_features=4, n_estimators=90, bootstrap=False, n_jobs=-1)
rf_reg_best_last.fit(X_train_feat_selected_last, y_train)

RandomForestRegressor(bootstrap=False, max_features=4, n_estimators=90,
                      n_jobs=-1)

In [96]:
rf_best_predictions_last = rf_reg_best_last.predict(X_test_feat_selected_last)

In [97]:
rf_mse = mean_squared_error(y_test, rf_best_predictions_last)
rf_rmse = np.sqrt(rf_mse)
print(rf_mse)

0.37803856662875357


2. Investigate using Support Vector Regressor (sklearn.svm.SVR) automatically varying the hyperparameters (kernel and C) and presenting the RMSE

In [98]:
from sklearn.svm import SVR

In [99]:
# RBF padrão do kernel
# 1.0 padrão do C
svr = SVR()
svr.fit(X_train, y_train)

SVR()

In [100]:
svr_predictions = svr.predict(X_test)
#svr_predictions.shape

In [101]:
rf_mse = mean_squared_error(y_test, svr_predictions)
rf_rmse = np.sqrt(rf_mse)
print(rf_mse)

11362.909742251406


#### kernel - linear / c = 0.1

In [102]:
# kernel - é um conjunto de funções matemáticas que recebe dados como entrada e os transforma na forma necessária
svr_linear = SVR(kernel='linear', C=0.1)
svr_linear.fit(X_train, y_train)

SVR(C=0.1, kernel='linear')

In [103]:
svr_predictions_linear = svr_linear.predict(X_test)
#svr_predictions.shape

In [104]:
rf_mse = mean_squared_error(y_test, svr_predictions_linear)
rf_rmse = np.sqrt(rf_mse)
print(rf_mse)

177.190771506698


#### kernel - poly / c = 0.5

In [106]:
svr_poly = SVR(kernel='poly', C=0.5)
svr_poly.fit(X_train, y_train)

SVR(C=0.5, kernel='poly')

In [107]:
svr_predictions_poly = svr_poly.predict(X_test)
#svr_predictions.shape

In [108]:
rf_mse = mean_squared_error(y_test, svr_predictions_poly)
rf_rmse = np.sqrt(rf_mse)
print(rf_mse)

183.33249351363776
