In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

import seaborn as sns


In [36]:
pokemon = pd.read_csv('pokemon.csv', index_col='#')
pokemon.head()
X = pokemon[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation']]
y = pokemon['Legendary']
# Split into train (80%) and test (20%) sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# Build unrestricted decision tree
clf = DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9, random_state=500)
clf.fit(X_train, y_train)

# Predict the labels
pred = clf.predict(X_test)

# Print the confusion matrix
cm = confusion_matrix(y_test, pred)
print('Confusion matrix:\n', cm)

# Print the F1 score
score = f1_score(y_test, pred)
print('F1-Score: {:.3f}'.format(score))

# Build restricted decision tree
clf2 = DecisionTreeClassifier(max_depth=4, max_features=2)
clf2.fit(X_train, y_train)

# Predict the labels
pred2 = clf2.predict(X_test)

# Print the confusion matrix
cm2 = confusion_matrix(y_test, pred2)
print('Confusion matrix:\n', cm2)

# Print the F1 score
score2 = f1_score(y_test, pred2)
print('F1-Score: {:.3f}'.format(score2))

Confusion matrix:
 [[143   7]
 [  3   7]]
F1-Score: 0.583
Confusion matrix:
 [[148   2]
 [  5   5]]
F1-Score: 0.588


In [38]:

# Build unrestricted decision tree
clf = DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9, 
max_depth=4, max_features=2,
random_state=500)
clf.fit(X_train, y_train)

# Predict the labels
pred = clf.predict(X_test)

# Print the confusion matrix
cm = confusion_matrix(y_test, pred)
print('Confusion matrix:\n', cm)

# Print the F1 score
score = f1_score(y_test, pred)
print('F1-Score: {:.3f}'.format(score))

# Build restricted decision tree
clf2 = DecisionTreeClassifier(max_depth=4, max_features=2)
clf2.fit(X_train, y_train)

# Predict the labels
pred2 = clf2.predict(X_test)

# Print the confusion matrix
cm2 = confusion_matrix(y_test, pred2)
print('Confusion matrix:\n', cm2)

# Print the F1 score
score2 = f1_score(y_test, pred2)
print('F1-Score: {:.3f}'.format(score2))

Confusion matrix:
 [[146   4]
 [  5   5]]
F1-Score: 0.526
Confusion matrix:
 [[145   5]
 [  6   4]]
F1-Score: 0.421


"Weak" decision tree
In the previous exercise you built two decision trees. Which one is fine-tuned and which one is "weak"?

Decision tree "A":

min_samples_leaf = 3 and min_samples_split = 9
F1-Score: ~58%
Decision tree "B":

max_depth = 4 and max_features = 2
F1-Score: ~53%
Both classifiers are available for you as clf_A and clf_B.

Correct choice! Model A is a fine-tuned decision tree, with a decent performance on its own. Model B is 'weak', restricted in height and with performance just above 50%.

In [39]:
pokemon = pd.read_csv('pokemon.csv', index_col='#')
pokemon.head()
X = pokemon[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation']]
y = pokemon['Legendary']
# Split into train (80%) and test (20%) sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Take a sample with replacement
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_train_sample = X_train.sample(frac=1.0, replace=True, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index].reset_index(drop=True)
(X_train.shape,y_train.shape),(X_train_sample.shape,y_train_sample.shape)

(((640, 7), (640,)), ((640, 7), (640,)))

In [40]:
X_train_sample

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
102,45,22,60,27,30,29,6
435,35,35,40,35,55,50,2
270,95,117,184,44,46,28,6
106,70,90,70,60,60,70,3
71,40,55,30,30,30,85,3
...,...,...,...,...,...,...,...
404,28,25,25,45,35,40,3
517,50,95,180,85,45,70,1
475,70,77,60,97,60,108,5
436,53,51,53,61,56,40,4


In [41]:
print(X_train_sample.shape)
print(y_train_sample.shape)
print(y_train_sample.value_counts())

(640, 7)
(640,)
Legendary
False    580
True      60
Name: count, dtype: int64


In [42]:
clf = DecisionTreeClassifier(max_depth=4, random_state=500)

# Take a sample with replacement
X_train_sample = X_train.sample(frac=1.0, replace=True, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

# Fit the model to the training sample
clf.fit(X_train_sample, y_train_sample)

In [None]:
# def predict_voting(classifiers, X):
# 	# Make the individual predictions
# 	pred_list = [clf.predict(X) for clf in classifiers]
	
# 	# Combine the predictions using "Voting"
# 	pred_vote = []
# 	for i in range(X.shape[0]):
# 		individual_preds = np.array([pred[i] for pred in pred_list])
# 		combined_pred = stats.mode(individual_preds)[0][0]
# 		pred_vote.insert(i, combined_pred)
	
# 	return pred_vote

In [45]:
from sklearn.metrics import accuracy_score
from scipy import stats
def predict_voting(classifiers, X):
    pred_list = []
    for clf in classifiers:
        pred_list.append(clf.predict(X))
        
    # Combine the predictions using "Voting"
    pred_vote = []
    for i in range(X.shape[0]):
        individual_preds = np.array([pred[i] for pred in pred_list])
        unique, counts = np.unique(individual_preds, return_counts=True)
        combined_pred = unique[np.argmax(counts)]
        pred_vote.append(combined_pred)

    return pred_vote
def build_decision_tree(X_train, y_train, random_state=None):
	# Take a sample with replacement
	X_train_sample = X_train.sample(frac=1.0, replace=True, random_state=random_state)
	y_train_sample = y_train.loc[X_train_sample.index]

	# Build a "weak" Decision Tree classifier
	clf = DecisionTreeClassifier(max_depth=4, random_state=500)

	# Fit the model on the training sample
	clf.fit(X_train_sample, y_train_sample)
	
	return clf

In [46]:
# Build the list of individual models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

clf_list = []
for i in range(21):
    weak_dt = build_decision_tree(X_train, y_train, random_state=i)
    clf_list.append(weak_dt)

# Predict on the test set
pred = predict_voting(clf_list, X_test)

# Print the F1 score
print('F1 score: {:.3f}'.format(f1_score(y_test, pred)))

F1 score: 0.632


In [47]:
# Instantiate the base model
clf_dt = DecisionTreeClassifier(max_depth=4)

# Build and train the Bagging classifier
clf_bag = BaggingClassifier(
  clf_dt,
  n_estimators=21,
  random_state=500)
clf_bag.fit(X_train, y_train)

# Predict the labels of the test set
pred = clf_bag.predict(X_test)

# Show the F1-score
print('F1-Score: {:.3f}'.format(f1_score(y_test, pred)))

F1-Score: 0.667


In [51]:
# Build and train the bagging classifier
clf_bag = BaggingClassifier(
  estimator=clf_dt,
  n_estimators=21,
  oob_score=True,
  random_state=500)
clf_bag.fit(X_train, y_train)

# Print the classifier's out-of-bag score
print(clf_bag.oob_score_)

0.9328125


In [53]:
# Print the out-of-bag score
print('OOB-Score: {:.3f}'.format(clf_bag.oob_score_))

# Evaluate the performance on the test set to compare
pred = clf_bag.predict(X_test)
print('Accuracy: {:.3f}'.format(accuracy_score(y_test, pred)))
# Show the F1-score
print('F1-Score: {:.3f}'.format(f1_score(y_test, pred)))

OOB-Score: 0.933
Accuracy: 0.963
F1-Score: 0.667


In [72]:
uci = pd.read_csv('./uci-secom.csv', index_col='Time', parse_dates=True)

X = uci.drop('Pass/Fail', axis=1)
X = X.fillna(X.median())
y = uci['Pass/Fail']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=500)

In [73]:

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Build a balanced logistic regression
clf_lr = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)

# Build and fit a bagging classifier
clf_bag = BaggingClassifier(clf_lr, oob_score=True, max_features=10, random_state=500)
clf_bag.fit(X_train, y_train)

# Evaluate the accuracy on the test set and show the out-of-bag score
pred = clf_bag.predict(X_test)
print('Accuracy:  {:.2f}'.format(accuracy_score(y_test, pred)))
print('OOB-Score: {:.2f}'.format(clf_bag.oob_score_))

# Print the confusion matrix
print(confusion_matrix(y_test, pred))

Accuracy:  0.77
OOB-Score: 0.68
[[236  60]
 [ 13   5]]


  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


In [75]:
# Build a balanced logistic regression
clf_base = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)

# Build and fit a bagging classifier with custom parameters
clf_bag = BaggingClassifier(estimator=clf_base, n_estimators=20, max_features=10, max_samples=0.65, bootstrap=False, random_state=500)
clf_bag.fit(X_train, y_train)

# Calculate predictions and evaluate the accuracy on the test set
y_pred = clf_bag.predict(X_test)
print('Accuracy:  {:.2f}'.format(accuracy_score(y_test, y_pred)))

# Print the classification report
print(classification_report(y_test, y_pred))


Accuracy:  0.75
              precision    recall  f1-score   support

          -1       0.95      0.78      0.86       296
           1       0.10      0.39      0.15        18

    accuracy                           0.75       314
   macro avg       0.53      0.58      0.51       314
weighted avg       0.91      0.75      0.82       314

