# Chapter 14

## Trees and forests

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets

In [None]:
# Load the iris data set and pull out the feature matrix and the labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Decision tree classifier with a fixed seed; `model` is whatever fit() returns.
tree = DecisionTreeClassifier(random_state=0)
model = tree.fit(features, target)

In [None]:
# One hand-written observation: four values, matching the four columns of
# the feature matrix the classifier was fit on.
observations = [[5, 4, 3, 2]]

# Show the predicted class and the per-class probabilities on separate lines.
print('\n'.join((
    f'Class: {model.predict(observations)}',
    f'Probability: {model.predict_proba(observations)}',
)))

In [None]:
# Same kind of classifier, but split quality is measured with entropy.
entropy_tree = DecisionTreeClassifier(random_state=0, criterion='entropy')
model_entropy = entropy_tree.fit(features, target)

### 14.2 Training a decision tree regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# NOTE: `datasets.load_boston` was removed in scikit-learn 1.2, whereas the
# "absolute_error" criterion used further down in this section only exists
# from 1.0 onwards, so the Boston loader cannot run on current releases.
# The bundled diabetes regression data set is used in its place.
diabetes = datasets.load_diabetes()
target = diabetes.target
# Keep only the first two columns, as before, so two-value observations
# remain valid inputs for the fitted regressor.
features = diabetes.data[:, 0:2]

# Decision tree regressor with a fixed seed, fit on the two-column features.
tree = DecisionTreeRegressor(random_state=0)
model = tree.fit(features, target)

In [None]:
# Two values, one for each of the two feature columns the regressor was fit on.
observation = [[0.02, 16]]

# Predict the target for this observation. As the final expression of the
# cell, its value is what a notebook front end would display.
model.predict(observation)

In [None]:
# Regressor whose splits are scored with absolute error.
mae_tree = DecisionTreeRegressor(random_state=0, criterion="absolute_error")
mae_model = mae_tree.fit(features, target)

### 14.3 Visualizing a decision tree model

In [None]:
import pydotplus
from sklearn.tree import DecisionTreeClassifier 
from sklearn import datasets
from IPython.display import Image
from sklearn import tree
import matplotlib.pyplot as plt

In [None]:
# Reload iris so this section does not depend on data left over from earlier cells.
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [None]:
# Decision tree classifier with a fixed seed.
clf = DecisionTreeClassifier(random_state=0)
# `model` holds what fit() returns for the iris features and labels.
model = clf.fit(features, target)

In [None]:
# NOTE: `tree` must be the `sklearn.tree` module imported in this section.
# Earlier sections bind the same name to estimator objects, so run this
# section's import cell first.
plt.figure(figsize=(40, 20))  # large canvas for the full tree
tree.plot_tree(
    clf,
    feature_names=iris.feature_names,
    # `iris.target_names` is a NumPy array; a plain list is passed instead
    # because some scikit-learn releases (e.g. 1.3.0) only accept a list
    # for `class_names`.
    class_names=list(iris.target_names),
)
plt.show()

### 14.4 Training a random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn import datasets

In [None]:
# Iris features and labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Random forest with a fixed seed; n_jobs=-1 requests all available cores.
randomforest = RandomForestClassifier(n_jobs=-1, random_state=0)
model = randomforest.fit(features, target)

In [None]:
# One hand-written observation with four feature values.
observation = [[5, 4, 3, 2]]

# Final expression of the cell: the predicted class for that observation.
model.predict(observation)

In [None]:
# Random forest whose trees score splits with entropy; seed fixed.
randomforest_entropy = RandomForestClassifier(random_state=0, criterion="entropy")
model_entropy = randomforest_entropy.fit(features, target)

### 14.6 Identifying important features

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier 
from sklearn import datasets

In [None]:
# Iris features and labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Fit a seeded random forest.
randomforest = RandomForestClassifier(n_jobs=-1, random_state=0)
model = randomforest.fit(features, target)

# Importance score of every feature, the feature order from most to least
# important, and the feature names arranged in that same order.
importances = model.feature_importances_
indices = np.flip(np.argsort(importances))
names = [iris.feature_names[position] for position in indices]

In [None]:
# One bar per feature column, drawn in descending order of importance.
bar_positions = range(features.shape[1])

plt.figure(figsize=(20, 7))
plt.title("Feature Importance")
plt.bar(bar_positions, importances[indices])
# Label each bar with its feature name, rotated to avoid overlap.
plt.xticks(bar_positions, names, rotation=90)
plt.show()

### 14.7 Selecting important features of random forests

In [None]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn import datasets
from sklearn.feature_selection import SelectFromModel

In [None]:
# Iris features and labels for the feature-selection example.
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [None]:
# Seeded random forest that serves as the base estimator for selection.
randomforest = RandomForestClassifier(n_jobs=-1, random_state=0)

# Selector built around the forest with an importance threshold of 0.3;
# fit_transform returns the reduced feature matrix.
selector = SelectFromModel(randomforest, threshold=0.3)
important_features = selector.fit_transform(features, target)

# Train the forest itself on the reduced feature matrix.
model = randomforest.fit(important_features, target)

### 14.8 Handling imbalanced classes

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn import datasets

In [None]:
# Iris features and labels; the next cell trims them to unbalance the classes.
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [None]:
# Discard the first 40 observations (presumably to leave class 0
# under-represented). NOTE: this reassigns `features`/`target` from
# themselves, so re-running the cell trims another 40 rows.
features = features[40:]
target = target[40:]

# Binary labels: class 0 stays 0, every other class becomes 1.
target = np.where(target == 0, 0, 1)

# class_weight="balanced" asks the forest to reweight classes automatically.
randomforest = RandomForestClassifier(
    class_weight="balanced",
    n_jobs=-1,
    random_state=0,
)
model = randomforest.fit(features, target)

### 14.9 Controlling size

In [None]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn import datasets

In [None]:
# Iris features and labels for the tree-size example.
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [None]:
# Decision tree with every size-related setting spelled out explicitly.
decisiontree = DecisionTreeClassifier(
    random_state=0,
    max_depth=None,              # no cap on tree depth
    min_samples_split=2,         # samples a node needs before it may split
    min_samples_leaf=1,          # samples every leaf must keep
    min_weight_fraction_leaf=0,  # weighted share of samples every leaf must keep
    max_leaf_nodes=None,         # no cap on the number of leaves
    min_impurity_decrease=0,     # impurity reduction a split must achieve
)
model = decisiontree.fit(features, target)

### 14.10 Improving performance through boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier 
from sklearn import datasets

In [None]:
# Iris features and labels for the boosting example.
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [None]:
# AdaBoost ensemble left at its library defaults apart from the fixed seed.
adaboost = AdaBoostClassifier(random_state=0)
# Fit the boosted ensemble on the iris features and labels.
model = adaboost.fit(features, target)

### 14.11 Evaluating random forests with out-of-bag errors

In [None]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn import datasets

In [None]:
# Iris features and labels for the out-of-bag example.
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [None]:
# Forest of 1000 trees with out-of-bag scoring switched on.
randomforest = RandomForestClassifier(
    n_estimators=1000,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
)
model = randomforest.fit(features, target)

# Final expression of the cell: the out-of-bag score recorded during fitting.
randomforest.oob_score_