# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [1]:
import pandas as pd
import numpy as np
import folium

In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
X=pd.read_csv('../../DATA/train_cleaned.csv')
drop_columns=['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','key','pickup_datetime','pickup_date','pickup_latitude_round3','pickup_longitude_round3','dropoff_latitude_round3','dropoff_longitude_round3']
X=X.drop(drop_columns,axis=1)
X=pd.get_dummies(X)# one hot coding
#generate labels
y=X['fare_amount']
X=X.drop(['fare_amount'],axis=1)

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is a AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.   



In [3]:
# install - 03.05.21
#!pip install scikit-optimize

### E 2.1 Bayesian Optimization of a Random Forest Regression Model
use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. larger number of trees in RF or high C-values in SMV will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use CoLab to rum the job for up to 12h 


In [4]:
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split

In [5]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [6]:
#Traininsdatensatz verkleinern - 1000 Einträge
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy()[:10000,:], y.to_numpy()[:10000], test_size=0.2, random_state=0) 

### Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

In [10]:
linReg = LinearRegression().fit(X_train, y_train)

In [62]:
linReg.score(X_train, y_train)

0.6976535734241529

In [63]:
linReg.score(X_test, y_test)

0.6216514934419313

### Random Forest

In [45]:
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')
import numpy as np

In [50]:
optRF = BayesSearchCV(
    RandomForestRegressor(),
    {
        'n_estimators': np.arange(50, 400, 25),
        'max_depth': np.arange(1, 21, 1)
    },
    cv=5, 
    n_iter=40,
    random_state=0,
    n_jobs=-1
)

In [51]:
%%time
optRF.fit(X_train, y_train)

Wall time: 15min 35s


BayesSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=40, n_jobs=-1,
              random_state=0,
              search_spaces={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20]),
                             'n_estimators': array([ 50,  75, 100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350,
       375])})

In [52]:
optRF.best_params_

OrderedDict([('max_depth', 10), ('n_estimators', 225)])

In [53]:
optRF.best_score_

0.7859541794567086

In [65]:
optRF.score(X_test, y_test)

0.8272197845767917

### SVM Regression

In [67]:
from skopt import BayesSearchCV
from sklearn.svm import SVR
import numpy as np

In [73]:
optSVM = BayesSearchCV(
    SVR(kernel='rbf'),
    {
        'C': np.arange(0.1, 10.1, 0.1),
        'gamma': [0.0000000001, 0.000000001, 0.00000001, 0.0000001, 0.000001, 0.00001,0.0001, 0.001, 0.01, 0,1],
    },
    cv=5, 
    n_iter=50,
    random_state=0,
    n_jobs=-1
)

In [74]:
%%time
optSVM.fit(X_train, y_train)

Wall time: 6min 34s


BayesSearchCV(cv=5, estimator=SVR(), n_jobs=-1, random_state=0,
              search_spaces={'C': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
        1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
        2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
        3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
        4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
        5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
        6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
        7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
        8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
       10. ]),
                             'gamma': [1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05,
                                       0.0001, 0.001, 0.01, 0, 1]})

In [75]:
optSVM.best_params_

OrderedDict([('C', 9.700000000000001), ('gamma', 0.001)])

In [76]:
optSVM.best_score_

0.7649684820892445

In [77]:
optSVM.score(X_test, y_test)

0.7856869303972172