In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import OrdinalEncoder
from skimage.feature import hog

<font  style="font-size: 4rem; color: #1abc9c"> Model Combination : </font> 
<font  style="font-size: 4rem; color: #1abc9c"> Random Forests, Adaboost </font>

In Scikit-learn, the Random Forests methods are implemented via the <code>RandomForestClassifier</code> and <code>RandomForestRegressor</code> classes. The main parameters are:

    n_estimators: the number of trees in the forest.

    max_features : the number of attributes randomly drawn.

    oob_score : boolean. Estimate or not the generalization error OOB (Out of Bag).

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# <font color="#1E90FF">Exercise 1. Random Forest exploration</font>


In this exercise, we will get hands-on experience with the Random Forest (RF) implementation. The dataset is based on the "Bank Marketing" UCI dataset. The binary classification goal is to predict whether the client will subscribe to a bank term deposit (variable y).

Input variables:

    1 - age (numeric)
    2 - job : type of job (categorical: "admin.","bluecollar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")
    3 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")
    4 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
    5 - cons.price.idx: consumer price index - monthly indicator (numeric) 
    6 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)     
    7 - euribor3m: euribor 3 month rate - daily indicator (numeric)
    8 - nr.employed: number of employees - quarterly indicator (numeric)


## <font color="#9400D3">1. Data processing </font>

In [None]:
# Load data: semicolon-separated file; the target 'y' is recoded from no/yes to 0/1.
bank_data = pd.read_csv('./data/bank-additional-modified.csv', sep=';')
bank_data['y'] = bank_data['y'].map({'no':0,'yes':1})
# Class distribution. Series.value_counts() is used because the top-level
# pd.value_counts() helper is deprecated in recent pandas releases.
print(bank_data['y'].value_counts())
bank_data.head()


<font color="red">**Question 1:**</font> Describe the dataset : number of samples, features, classes, class distribution


Attributes should be numeric. We employ the <code>OrdinalEncoder</code> preprocessing method to transform the categorical attributes.

<font color="red">**Question 2:**</font> Explain the effect of this encoding.

In [None]:
# Data preprocessing: replace every category of the string-valued columns
# by the numeric code assigned by the fitted OrdinalEncoder.
categorical_columns = ["job", "education"]
oenc = OrdinalEncoder()
encoded_values = oenc.fit_transform(bank_data[categorical_columns])
bank_data[categorical_columns] = encoded_values
bank_data.head()

## <font color="#9400D3">2. RF classifier </font>

In [None]:
# Dataset split into train/test set (30% held out, fixed seed for reproducibility)
bank_train, bank_test = train_test_split(bank_data, test_size = 0.3, random_state = 50)

# Select the predictors by name rather than by position, so the cell does not
# silently depend on the target 'y' being the last column of the frame.
bank_X_train = bank_train.drop(columns='y')
bank_y_train = bank_train['y']

# Learn a RF classifier; oob_score=True makes the out-of-bag estimate
# available afterwards through the `oob_score_` attribute.
r_f = RandomForestClassifier(n_estimators=15, random_state=0, oob_score=True)
r_f.fit(bank_X_train, bank_y_train)


# Feature importance
# Create a series containing feature importances from the model and feature names from the training data
feature_importances = pd.Series(r_f.feature_importances_, index=bank_X_train.columns).sort_values(ascending=False)

# Plot a simple, labelled bar chart (trailing ';' hides the repr of the last call)
ax = feature_importances.plot.bar()
ax.set(title='Random forest feature importances', xlabel='Feature', ylabel='Importance');

<font color="red">**Question 3:**</font> How many trees are used in the ensemble?

<font color="red">**Question 4:**</font> How many features are considered when looking for the best split?

<font color="red">**Question 5:**</font> Recall how the importance of variables is determined.

<font color="blue">**Todo:**</font> Compute the prediction score of this random forest on the train set (function <code>score()</code>) and on the OOB samples (**attribute** <code>oob_score_</code>).

In [None]:
# TODO - compute prediction scores


<font color="red">**Question 6:**</font> 
- What does the OOB score represent?
- Compare with the real (generalization) error estimated on the test set.

## <font color="#9400D3">3. Comparison with a decision tree </font>

<font color="blue">**Todo:**</font> Fit a decision tree classifier, and compute the real accuracy.

In [None]:
# TODO - Fit a decision tree classifier, and compute the real accuracy.


# <font color="#1E90FF">Exercise 2. RF on Handwritten digits</font>
As in the Decision Trees notebook, we will first apply a random forest to the raw images, and then try to improve the performance using HOG representations of the images.

In [None]:
# Read the semicolon-separated digit sample and preview its first rows.
mnist = pd.read_csv('./data/cp_sample.csv', delimiter=';')
mnist.head()

In [None]:
# Hold out 30% of the digit samples as a test set; the seed fixes the shuffle.
data_train, data_test = train_test_split(mnist, random_state=4, test_size=0.3)


## <font color="#9400D3">1. Basic RF</font>

<font color="blue">**Todo:**</font> learn a RandomForest with 10 trees, enabling the use of out-of-bag samples.

In [None]:
# TODO - learn a RandomForest with 10 trees


In [None]:
# TODO - compute RF empirical, OOB and real errors


## <font color="#9400D3">2. Tuning the number of trees</font>

Now try different random forests (by considering different numbers of trees) and select the most appropriate one.
Use the OOB sample estimates, which allow the RandomForestClassifier to be fit and validated whilst being trained.
Plot the OOB **error** as a function of the number of trees.


In [None]:
# TODO
# Ntrees=[...]
# train_scores, oob_scores = np.zeros(len(Ntrees)), np.zeros(len(Ntrees))

#for count,n_tree in enumerate(Ntrees):
 

In [None]:
# TODO - Plot the OOB error as a function of the number of trees.




<font color="red">**Question 1:**</font> 
- What is the best number of trees for this dataset?
- What is the estimation of the generalization error of the selected random forest ?
- Compare with the previous (basic) RF

In [None]:
# TODO - evaluate best RF classifier


## <font color="#9400D3">3. Using HOG features</font>

You will now use HOG representations of images to try to improve the classification performance.

We compute the new training and test sets with HOG representations using 8 orientations and cells of $14\times14$ pixels (you can change these values afterwards).

In [None]:
def my_hog(row, ori, cell):
    """Compute the HOG descriptor of one image stored as a flat row.

    Parameters
    ----------
    row : pandas.Series
        One sample. The first entry is skipped (assumed to be the label
        column - confirm against the CSV layout) and the remaining 784
        values are reshaped into a 28x28 image.
    ori : int
        Number of orientation bins passed to ``hog``.
    cell : int
        Side length, in pixels, of the square HOG cells.

    Returns
    -------
    pandas.Series
        The HOG feature vector returned by ``hog``.
    """
    # A 2-D array is treated as a single-channel image, so no channel keyword
    # is required. The former `multichannel=True` argument (with a trailing
    # channel axis of size 1) is not accepted by recent scikit-image releases.
    image = row.iloc[1:].to_numpy().reshape(28, 28)
    features = hog(image, orientations=ori, pixels_per_cell=(cell, cell), cells_per_block=(1, 1))
    return pd.Series(features)

In [None]:
# HOG settings forwarded to my_hog: 8 orientation bins, 14-pixel cells.
hog_settings = (8, 14)

# Training set: one HOG vector per row, then re-attach the labels.
hog_train = data_train.apply(my_hog, axis=1, args=hog_settings)
hog_train['label'] = data_train['label']

# Test set: same transformation.
hog_test = data_test.apply(my_hog, axis=1, args=hog_settings)
hog_test['label'] = data_test['label']

<font color="blue">**Todo:**</font> Try different random forests (by considering different numbers of trees) on this new dataset and select the most appropriate one.


<font color="red">**Question 2:**</font> 
What is the estimation of the generalization error of random forest for this dataset ? Conclusion ?

In [None]:
# TODO - select the appropriate RandomForestClassifier on the HOG features


In [None]:
# TODO - Compute the real error



# <font color="#1E90FF">Exercise 3. A small tour to Adaboost</font>

<code>AdaBoostClassifier</code> implements the popular boosting algorithm AdaBoost. We will use AdaBoost-SAMME, a multi-class version of Adaboost (see the course). 


In [None]:
# Load data (space-separated text file)
df=pd.read_csv('./data/data_exam.txt', sep=' ')
# A bare `df.head()` in the middle of a cell is evaluated but never shown;
# `display` (provided by the Jupyter/IPython kernel) renders the preview explicitly.
display(df.head())
# Give the first three columns the names used in the rest of the exercise.
dataset = df.rename(columns={df.columns[0]: 'X1',df.columns[1]: 'X2',df.columns[2]: 'Y'})
# Scatter plot of the two features coloured by class (trailing ';' hides the Axes repr)
sns.scatterplot(data=dataset, x='X1', y='X2', hue='Y', marker='+', palette=['blue','red']);

In [None]:
# Learn an AdaBoost classifier (SAMME variant, 5 boosting rounds) on a 70/30
# split and show its score on the held-out part.
feature_columns = ['X1', 'X2']
data_train, data_test = train_test_split(dataset, test_size=0.3, random_state=6)
aboost = AdaBoostClassifier(n_estimators=5, algorithm="SAMME", random_state=0)
aboost.fit(data_train[feature_columns], data_train['Y'])
aboost.score(data_test[feature_columns], data_test['Y'])

In [None]:
# The fitted weak-learner template is exposed as `estimator_` in recent
# scikit-learn releases; older releases only provide `base_estimator_`,
# which has since been removed. Support both.
weak_learner = aboost.estimator_ if hasattr(aboost, "estimator_") else aboost.base_estimator_
print("Estimator used to grow the ensemble : ", weak_learner)
print("Weights for each estimator in the boosted ensemble : ", aboost.estimator_weights_)
print("Classification error for each estimator in the boosted ensemble : ", aboost.estimator_errors_)

<font color="red">**Question 1:**</font> 
- What is the weak classifier used here?
- How many iterations are done?

<font color="red">**Question 2:**</font> 
- Recall how the estimator weights are computed. (Give the formula. Be careful, this is the SAMME version.)

<font color="blue">**Todo:**</font> Apply the formula to retrieve the estimator weights (at least one).

In [None]:
# TODO - retrieve the estimator weights



In [None]:
def draw_decision_boundaries(model, data):
    """Plot a classifier's predictions on a regular grid together with the samples.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict``; it is called on a DataFrame
        whose two columns carry the names of the first two columns of ``data``.
    data : pandas.DataFrame
        Columns 0 and 1 are the two features spanning the plot, column 2 holds
        the class labels (numeric, since they are passed to ``c=``).

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure as a side effect.
    """
    feature_names = list(data.columns[:2])  # reuse the data's names to avoid feature-name warnings
    first_feature = data.iloc[:, 0]
    second_feature = data.iloc[:, 1]
    labels = data.iloc[:, 2]

    # Regular grid covering the bounding box of the samples
    # (np.linspace default number of points per axis).
    xx, yy = np.meshgrid(
            np.linspace(first_feature.min(), first_feature.max()),
            np.linspace(second_feature.min(), second_feature.max()))
    grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=feature_names)
    grid_pred = np.asarray(model.predict(grid))

    # Share one colour scale between both scatter calls. Without it each call
    # normalises its own values, so a grid predicted as a single class would
    # always be drawn with the first colour, whatever that class is.
    all_labels = np.concatenate([grid_pred, labels.to_numpy()])
    vmin, vmax = all_labels.min(), all_labels.max()

    color_map = matplotlib.colors.ListedColormap(['blue', 'red'])
    plt.figure(figsize=(8, 8))
    # Background: predicted class at every grid point.
    plt.scatter(grid.iloc[:, 0], grid.iloc[:, 1], c=grid_pred, cmap=color_map,
                vmin=vmin, vmax=vmax, marker='+', s=70)
    # Foreground: the samples coloured by their true label.
    plt.scatter(first_feature, second_feature, s=50, c=labels, cmap=color_map,
                vmin=vmin, vmax=vmax)

In [None]:
draw_decision_boundaries(aboost,data_test)

<font color="blue">**Todo:**</font> Compare with a decision tree: performances, decision boundaries

In [None]:
# TODO - compare with decision tree and draw the decision boundaries

