# Decision trees vs linear models

In [1]:
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.datasets import make_friedman1
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

`make_regression` — генерирует случайную линейную зависимость (с добавленным гауссовским шумом).


`make_friedman1` — нелинейная зависимость: $y(X) = 10 \sin(\pi x_0 x_1) + 20 (x_2 - 0.5)^2 + 10 x_3 + 5 x_4 + \text{noise} \cdot \mathcal{N}(0, 1)$, где $x_j$ — столбец `X[:, j]`.

## Линейная зависимость

In [2]:
# Synthetic linear-regression dataset. The seed is fixed so that the same
# data is generated on every run (Restart & Run All reproducibility).
X_data, y_data = make_regression(n_samples=1000, noise=100, n_features=10, random_state=42)

Меняем максимальную глубину дерева

In [3]:
# Mean 5-fold CV score of a tree limited to depth 1.
# The scorer is negated MSE, so values closer to zero are better.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=1),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-27506.035569106723

In [4]:
# Mean 5-fold CV score (negated MSE) of a tree limited to depth 5.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=5),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-23876.630060593816

In [5]:
# Mean 5-fold CV score (negated MSE) of a tree limited to depth 10.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=10),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-30443.871859172068

Меняем минимальное количество примеров в листе

In [6]:
# Mean 5-fold CV score (negated MSE) of a tree that requires
# at least 2 training samples in every leaf.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=2),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-31210.109586482235

In [7]:
# Mean 5-fold CV score (negated MSE) of a tree that requires
# at least 10 training samples in every leaf.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=10),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-24286.844591280606

In [8]:
# Mean 5-fold CV score (negated MSE) of a tree that requires
# at least 20 training samples in every leaf.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=20),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-23320.949698896344

Подберем оптимальные параметры

In [9]:
%%time
gs = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid ={
        'criterion': ['mse', 'mae'],
        'max_depth': range(1, 21, 3),
        'min_samples_leaf': range(1, 21, 3),
    },
    scoring='neg_mean_squared_error'
)
gs.fit(X_data, y_data)

print(gs.best_params_)
print(gs.best_score_)



{'criterion': 'mae', 'max_depth': 7, 'min_samples_leaf': 13}
-22842.81877908562
CPU times: user 6.38 s, sys: 39 ms, total: 6.42 s
Wall time: 6.6 s




Сравним с линейной регрессией

In [10]:
# Baseline: mean 5-fold CV score (negated MSE) of ordinary least squares.
cross_val_score(
    estimator=LinearRegression(),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-10857.849164936237

При линейной зависимости между признаками и таргетом LinearRegression показал себя заметно лучше, чем DT (внезапно).

## Нелинейная зависимость

In [11]:
# Synthetic non-linear (Friedman #1) dataset. The seed is fixed so that the
# same data is generated on every run (Restart & Run All reproducibility).
X_data, y_data = make_friedman1(n_samples=1000, noise=10, n_features=10, random_state=42)

Меняем максимальную глубину дерева

In [12]:
# Mean 5-fold CV score of a tree limited to depth 1.
# The scorer is negated MSE, so values closer to zero are better.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=1),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-121.87487818643179

In [13]:
# Mean 5-fold CV score (negated MSE) of a tree limited to depth 5.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=5),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-140.46583737335703

In [14]:
# Mean 5-fold CV score (negated MSE) of a tree limited to depth 10.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=10),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-185.39720877656157

Меняем минимальное количество примеров в листе

In [15]:
# Mean 5-fold CV score (negated MSE) of a tree that requires
# at least 2 training samples in every leaf.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=2),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-210.5560581142926

In [16]:
# Mean 5-fold CV score (negated MSE) of a tree that requires
# at least 10 training samples in every leaf.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=10),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-150.97808736958015

In [17]:
# Mean 5-fold CV score (negated MSE) of a tree that requires
# at least 20 training samples in every leaf.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=20),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-132.8776906982689

Подберем оптимальные параметры

In [18]:
%%time
gs = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid ={
        'criterion': ['mse', 'mae'],
        'max_depth': range(1, 21, 3),
        'min_samples_leaf': range(1, 21, 3),
    },
    scoring='neg_mean_squared_error'
)
gs.fit(X_data, y_data)

print(gs.best_params_)
print(gs.best_score_)



{'criterion': 'mse', 'max_depth': 1, 'min_samples_leaf': 1}
-120.71762476414649
CPU times: user 7.28 s, sys: 54.9 ms, total: 7.33 s
Wall time: 7.47 s




Сравним с линейной регрессией

In [19]:
# Baseline: mean 5-fold CV score (negated MSE) of ordinary least squares.
cross_val_score(
    estimator=LinearRegression(),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-108.37254117247532

При нелинейной зависимости между признаками и таргетом LinearRegression по качеству сравним с DT (в этом запуске даже немного лучше: MSE ≈ 108 против ≈ 121 у лучшего дерева).

## Оценка времени работы

In [20]:
# Larger seeded synthetic linear dataset for the fit-time measurements.
X_data, y_data = make_regression(
    n_samples=100000,
    n_features=30,
    noise=1000,
    random_state=42,
)

In [21]:
%%time
# Measure how long it takes to fit a depth-1 tree on the current X_data, y_data.
DecisionTreeRegressor(max_depth=1).fit(X_data, y_data)

CPU times: user 554 ms, sys: 16.5 ms, total: 570 ms
Wall time: 577 ms


DecisionTreeRegressor(criterion='mse', max_depth=1, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [22]:
%%time
# Measure how long it takes to fit a depth-2 tree on the current X_data, y_data.
DecisionTreeRegressor(max_depth=2).fit(X_data, y_data)

CPU times: user 982 ms, sys: 11.5 ms, total: 993 ms
Wall time: 1.03 s


DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [23]:
%%time
# Measure how long it takes to fit a depth-4 tree on the current X_data, y_data.
DecisionTreeRegressor(max_depth=4).fit(X_data, y_data)

CPU times: user 1.83 s, sys: 16.4 ms, total: 1.85 s
Wall time: 1.88 s


DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [24]:
%%time
# Measure how long it takes to fit a depth-10 tree on the current X_data, y_data.
DecisionTreeRegressor(max_depth=10).fit(X_data, y_data)

CPU times: user 3.76 s, sys: 25.7 ms, total: 3.78 s
Wall time: 3.84 s


DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [25]:
%%time
# Measure how long it takes to fit linear regression on the same X_data, y_data,
# for comparison with the tree timings.
LinearRegression().fit(X_data, y_data)

CPU times: user 129 ms, sys: 19.6 ms, total: 149 ms
Wall time: 115 ms


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

# Преимущества и недостатки решающих деревьев

**Преимущества**
 * хорошо интерпретируются
 * легко обобщаются для регрессии и классификации
 * допускаются разнотипные данные
 
**Недостатки**
 * На данных с линейной зависимостью (линейно разделимой выборке) заметно проигрывают линейным алгоритмам
 * Переобучение
 * Неустойчивость к шуму, составу выборки, критерию
 
**Способы устранения недостатков**
 * прунинг (усечение)
 * композиции (леса) деревьев

#### Pruning

<img src="img/pruning.png" width="800" alt="Pruning">