In [None]:
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn import tree
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Synthetic data: a sinusoid sampled on a uniform grid over one period,
# plus Gaussian jitter for both the time axis and the signal itself.
n_samples = 500

# Noise-free grid and target signal.
t = np.linspace(0, 2*np.pi, n_samples)
u = 1 + 2*np.sin(t)

# Standard deviations of the noise added to the inputs (t) and to the targets (u).
σt, σu = 0.05, 0.1

# Seed the global NumPy generator right before drawing so the noise is reproducible;
# ηt is drawn first, then ηu.
np.random.seed(100)
ηt = np.random.standard_normal(n_samples) * σt
ηu = np.random.standard_normal(n_samples) * σu

In [None]:
# Width of the single hidden layer of the MLP regressor.
MLP_n_hidden = 1000

# One bare estimator per model family, keyed by the label used in tables and plots.
models = dict(
    SVR=NuSVR(verbose=False),
    decision_tree=DecisionTreeRegressor(max_depth=10),
    random_forest=RandomForestRegressor(n_estimators=10, max_depth=10, bootstrap=True),
    kernel_ridge=KernelRidge(alpha=0.1, kernel='rbf'),
    nearest_neighbors=KNeighborsRegressor(weights='distance', algorithm='brute'),
    MLP=MLPRegressor(hidden_layer_sizes=(MLP_n_hidden,), max_iter=10000),
)

# Toggle: standardize the inputs before they reach each estimator.
with_scaler = True

if not with_scaler:
    # Fit the bare estimators directly.
    regressors = models
else:
    def pipe(regr):
        """Return a pipeline that standardizes the inputs and then applies ``regr``."""
        return make_pipeline(StandardScaler(), regr)

    # Each pipeline is built around the estimator object stored in `models`.
    regressors = {label: pipe(estimator) for label, estimator in models.items()}

In [None]:
# Train on the noisy samples (jitter on both time and signal); predict on the clean grid.
X_train = (t + ηt).reshape(-1, 1)
y_train = u + ηu
X_test = t.reshape(-1, 1)

# Per-model results, keyed by model name:
#   y_test -> predictions on X_test
#   MAE    -> mean absolute error against the noise-free signal u
#   MAPE   -> mean absolute percentage error against u (as returned by scikit-learn)
y_test = {}
MAE = {}
MAPE = {}

for name, regr in regressors.items():
    prediction = regr.fit(X_train, y_train).predict(X_test)
    y_test[name] = prediction
    # Both metrics compare against the clean signal, not the noisy targets.
    flat_prediction = prediction.squeeze()
    MAE[name] = mean_absolute_error(u, flat_prediction)
    MAPE[name] = mean_absolute_percentage_error(u, flat_prediction)

### Number of parameters of each model

The number of parameters in an `MLP` with one hidden layer is given by:<br/>
`n_inputs * n_hidden + n_hidden + n_outputs * n_hidden + n_outputs`

The number of parameters in a `NuSVR` is taken to be the size of the support-vector array, i.e. `n_support_vectors * n_features`.

The number of parameters in a `KernelRidge` regressor is given by: `n_samples * n_outputs`

The number of parameters in a `KNeighborsRegressor` is counted as 0, since it fits no parameters and only stores the training samples.

The number of parameters in a `DecisionTreeRegressor` is the number of nodes (each leaf has a `value` parameter, while each internal node has a `threshold` parameter).

The number of parameters in a `RandomForestRegressor` is the sum of the number of nodes in each of its trees.

In [None]:
# Count the fitted parameters of every model. These read fitted attributes from the
# estimators in `models`, so the training cell must have been run first.
# Insertion order matters: the summary table iterates over this dict.
mlp = models['MLP']
forest = models['random_forest']

n_pars = {
    # All weight matrices plus all bias vectors.
    'MLP': (np.sum([weights.size for weights in mlp.coefs_])
            + np.sum([biases.size for biases in mlp.intercepts_])),
    # Total number of entries in the support-vector array.
    'SVR': models['SVR'].support_vectors_.size,
    # Number of dual coefficients.
    'kernel_ridge': models['kernel_ridge'].dual_coef_.size,
    # Counted as parameter-free.
    'nearest_neighbors': 0,
    # One parameter per node of the fitted tree.
    'decision_tree': models['decision_tree'].tree_.node_count,
    # Node counts summed over every tree in the forest.
    'random_forest': np.sum([member.tree_.node_count for member in forest.estimators_]),
}

In [None]:
# Summary table: one row per model with its MAE, MAPE and parameter count.
# scikit-learn's mean_absolute_percentage_error returns a fraction (1.0 == 100%),
# so it is scaled by 100 before being printed with a '%' suffix.
header = '{:20s} {:^6s}  {:^8s}  {:^6s}'.format('Model', 'MAE', 'MAPE', '# pars')
print(header)
print('=' * len(header))
for name, num in n_pars.items():
    print('{:20s} {:6.4f}  {:7.2f}%  {:6d}'.format(
        name.replace('_', ' '), MAE[name], 100 * MAPE[name], num))

In [None]:
# Overlay the true signal, the noisy training samples and every model's predictions.
ms = 5    # marker size for the prediction markers
ds = 10   # plot only every ds-th test prediction to keep the figure readable
cmap = plt.get_cmap('tab10', len(models))

fig, ax = plt.subplots(1, 1, figsize=(7, 5))

# Noise-free reference curve.
ax.plot(t, u, 'k', lw=3, label='True')

# Noisy training samples, drawn slightly smaller than the prediction markers.
ax.plot(X_train, y_train, 'gs', label='Training set', alpha=0.75,
        markersize=ms-2, markeredgewidth=0.5, markerfacecolor='w')

# One color per model, in the order the models were fitted.
for i, (name, prediction) in enumerate(y_test.items()):
    ax.plot(X_test[::ds], prediction[::ds], 'o', label=name,
            color=cmap(i), markersize=ms, markerfacecolor='w')

ax.legend(loc='lower left', frameon=False)
ax.set(xlabel='Time (s)', ylabel='Y')
sns.despine()

In [None]:
# Draw the fitted decision tree only when it is shallow enough to be legible.
# The check uses the depth of the fitted tree (`tree_.max_depth`) rather than the
# `max_depth` hyperparameter: the hyperparameter may be None (unbounded), which
# cannot be compared with an integer, and the fitted depth is what the
# random-forest cell checks as well.
if models['decision_tree'].tree_.max_depth <= 5:
    fig = plt.figure(figsize=(12,5))
    _ = tree.plot_tree(models['decision_tree'])

In [None]:
# Draw the first tree of the fitted forest, but only when it is shallow enough
# (fitted depth of at most 5) to be legible.
first_tree = models['random_forest'].estimators_[0]
if first_tree.tree_.max_depth <= 5:
    fig = plt.figure(figsize=(12, 5))
    _ = tree.plot_tree(first_tree)

In [None]:
# Sanity check: a random forest's prediction equals the mean of its trees' predictions.
# NOTE(review): both calls use the bare forest from `models`, not the scaler pipeline,
# so when `with_scaler` is True this X is presumably read in standardized units — confirm.
X = np.array([[2]])
y1 = np.mean([estim.predict(X)[0] for estim in models['random_forest'].estimators_])
y2 = models['random_forest'].predict(X)[0]
# Compare the absolute difference: without abs(), any negative difference, however
# large, would pass the check.
abs(y1 - y2) < 1e-12