In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

In [2]:
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

In [3]:
#Regression with decision tree
#Decision Trees overfit when dealing with regression tasks without any regularization

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X, y)

In [5]:
#exercises
#7
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=1000, noise=0.4)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [7]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_leaf_nodes': list(range(2, 100)), 
    'min_samples_split': [2, 3, 4]
}
gridSearch = GridSearchCV(DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3)
gridSearch.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


In [8]:
gridSearch.best_estimator_

In [9]:
from sklearn.metrics import accuracy_score

y_pred = gridSearch.predict(X_test)
accuracy_score(y_pred, y_test)

0.835

In [10]:
from sklearn.model_selection import ShuffleSplit

n_trees = 1000
n_instances = 100

mini_sets = []

rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train = X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [12]:
from sklearn.base import clone
import numpy as np

forest = [clone(gridSearch.best_estimator_) for _ in range (n_trees)]
accuracy_scores = []
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

np.mean(accuracy_scores)

np.float64(0.795375)

In [13]:
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

In [14]:
from scipy.stats import mode

y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

In [15]:
accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))

0.835

In [18]:
#EX
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version = 1)

X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

et = ExtraTreesClassifier(n_estimators=100, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svc = SVC(max_iter=100, tol=20, random_state=42)

In [22]:
estimators = [et, rf, svc]

for estimator in estimators:
    estimator.fit(X_train, y_train)

In [23]:
[estimator.score(X_val, y_val) for estimator in estimators]

[0.9715, 0.9692, 0.0956]

In [29]:
from sklearn.ensemble import VotingClassifier

#sintaxe para adicionar os modelos no Voting Classifier
named_estimators = [
    ('extra_trees', et),
    ('random_forest', rf),
    ('svc', svc)
]

vc = VotingClassifier(named_estimators)

In [32]:
vc.fit(X_train, y_train)

In [33]:
vc.score(X_val, y_val)

0.9706

In [34]:
vc.score(X_test, y_test)

0.9655