In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.model_selection import train_test_split
# Fetch the iris dataset bundled with scikit-learn
data = load_iris()

# Build a feature table and attach the class labels as a 'Species' column
df = pd.DataFrame(data.data, columns=data.feature_names)
df['Species'] = data.target

# Separate the label column from the feature columns
y = df['Species']
X = df.drop(columns=['Species'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Voting Classifier: three different base learners to be combined by vote
log_clf = LogisticRegression()
# Seed the forest so its bootstrap samples and feature draws are repeatable across re-runs
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

In [5]:
# Majority-vote ('hard' voting) ensemble over the three base classifiers
voting_estimators = [('lr', log_clf), ('rf', rnd_clf), ('svm', svm_clf)]
voting_clf = VotingClassifier(estimators=voting_estimators, voting='hard')
voting_clf.fit(X_train, y_train)

In [9]:
from sklearn.metrics import accuracy_score
# Refit every model on the training split and print its validation accuracy
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print(type(clf).__name__, accuracy_score(y_true=y_val, y_pred=y_pred))

LogisticRegression 1.0
RandomForestClassifier 1.0
SVC 1.0
VotingClassifier 1.0


In [11]:
# Bagging and Pasting
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Bagging: 500 decision trees, each trained on 100 rows drawn from the training set
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,   # integer -> absolute number of rows drawn per tree
    bootstrap=True,    # draw with replacement
    n_jobs=1,
    random_state=42,   # make the resampling repeatable across re-runs
)

# Fit the model
bag_clf.fit(X_train, y_train)

# Make predictions
y_pred = bag_clf.predict(X_val)

In [15]:
# Out-of-Bag Evaluation
# oob_score=True asks the ensemble to score itself on the rows each tree did not
# draw; the seed keeps that score repeatable across re-runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)
bag_clf.fit(X_train, y_train)

In [16]:
# Display the out-of-bag score stored on the fitted ensemble (available because oob_score=True)
bag_clf.oob_score_

0.95

In [17]:
from sklearn.metrics import accuracy_score
# Validation-set accuracy of the bagging ensemble, to compare with the OOB estimate
y_pred = bag_clf.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

1.0

In [18]:
# Display the out-of-bag decision function: one row per training sample, one column per class
bag_clf.oob_decision_function_

array([[1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.04046243, 0.95953757],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.02717391, 0.97282609],
       [0.        , 0.94086022, 0.05913978],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.64835165, 0.35164835],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.

In [20]:
# Random forest 
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Random forest: 500 trees, each limited to 16 leaves, trained on all CPU cores.
# The seed makes the bootstrap samples and feature draws repeatable across re-runs.
rnd_clf = RandomForestClassifier(
    n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42
)
rnd_clf.fit(X_train, y_train)

In [23]:
# Predict validation-set classes with the fitted random forest
y_pred_rf = rnd_clf.predict(X_val)

In [24]:
# Bagging ensemble of randomized trees, configured like the random forest above.
# max_samples must be the float 1.0 (draw as many rows as the training set has);
# the integer 1 would train every tree on a single row.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter='random', max_leaf_nodes=16),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,  # repeatable resampling across re-runs
)

In [25]:
# Feature Importance
from sklearn.datasets import load_iris
# Train a forest on the full iris data and print each feature's importance score.
# The seed keeps the printed importances repeatable across re-runs.
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(iris['data'], iris['target'])
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)
    

sepal length (cm) 0.10660734162689624
sepal width (cm) 0.02423844524191984
petal length (cm) 0.4392886032420822
petal width (cm) 0.4298656098891018


In [26]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over 200 decision stumps (depth-1 trees) with a 0.5 learning rate.
# The `algorithm` argument is intentionally left at the library default: the
# explicit 'SAMME.R' value is deprecated in scikit-learn 1.4 and rejected from 1.6,
# while older releases already default to it.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
    random_state=42,  # repeatable fits across re-runs
)
ada_clf.fit(X_train, y_train)



In [27]:
# Gradient Boosting 
from sklearn.tree import DecisionTreeRegressor

# First stage: a depth-2 regression tree fitted directly to the numeric class labels.
# NOTE(review): this fits on the full X / y, which includes the rows held out as X_val.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

In [28]:
# Second stage: fit a new depth-2 tree to the residual errors left by the first tree
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

In [29]:
# Third stage: fit another depth-2 tree to the residual errors left by the second tree
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

In [31]:
# The ensemble's prediction is the sum of the three trees' individual predictions
stage_predictions = [tree.predict(X_val) for tree in (tree_reg1, tree_reg2, tree_reg3)]
y_pred = sum(stage_predictions)

In [33]:
# Display the most recently computed predictions
y_pred

array([ 0.99103926, -0.10155333,  2.        ,  0.99103926,  0.99103926,
       -0.00505885,  0.99103926,  2.00345395,  0.99103926,  0.99103926,
        2.00345395,  0.03703704, -0.10155333,  0.03703704, -0.00505885,
        0.99103926,  2.00345395,  0.99103926,  0.99103926,  2.00345395,
        0.03703704,  1.87670754, -0.00505885,  2.00345395,  2.        ,
        2.00345395,  2.00345395,  2.00345395,  0.03703704,  0.03703704])

In [34]:
# GBRT 
from sklearn.ensemble import GradientBoostingRegressor

# Library gradient boosting using the same tree count (3) and depth (2)
# as the manual residual-fitting cells above, with a learning rate of 1
gbrt = GradientBoostingRegressor(max_depth = 2, n_estimators=3, learning_rate=1)
gbrt.fit(X, y)

In [38]:
# Early stopping 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out a validation set; the seed keeps the split (and the chosen tree count) repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

# staged_predict yields the prediction after 1, 2, ..., n_estimators trees,
# so errors[i] is the validation MSE of the ensemble made of the first i + 1 trees
errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]

# argmin is a 0-based index; add 1 to turn it into a number of trees
# (this also avoids requesting an invalid n_estimators=0 when the first stage is best)
best_n_estimators = int(np.argmin(errors)) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gbrt_best.fit(X_train, y_train)

In [39]:
# Incremental early stopping: warm_start=True keeps the trees already trained,
# so each fit() call only adds the trees needed to reach the new n_estimators.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)

    if val_error < min_val_error:
        # New best validation error: record it and reset the counter
        min_val_error, error_going_up = val_error, 0
        continue

    # No improvement this round; stop after five such rounds in a row
    error_going_up += 1
    if error_going_up == 5:
        break  # early stopping