# Random Forest

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rd
import time

from collections import defaultdict 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.ensemble import RandomForestClassifier


# Du precedent projet
import randomforest as rf 

## 1. Importation des données

In [11]:
data = pd.read_csv("data.csv")

In [12]:
print(data.shape)
data.head()

(1000, 32)


Unnamed: 0,filename,chroma_stft,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,spectral_contrast,spectral_flatness,tonnetz,estimated_tempo,genre/label
0,country.00094,0.366838,0.206715,1474.849928,1745.839794,3108.264538,0.062993,-104.34503,136.39078,-20.945368,...,2.197093,-7.10939,1.849674,1.675598,-1.492039,23.567712,0.001825,0.026578,112.347147,country
1,country.00025,0.347253,0.07992,1565.431223,2016.069774,3188.930717,0.057303,-200.55273,119.6749,-3.610737,...,-11.293138,-8.870809,-8.073557,-3.161044,0.330751,19.845071,0.008667,0.018447,143.554688,country
2,country.00076,0.297332,0.128385,1321.679067,1409.586676,2590.39267,0.066525,-183.93301,159.80644,-23.158834,...,-3.583324,-9.062474,-3.159365,-4.068544,-7.052489,23.526738,0.00122,-0.0139,143.554688,country
3,country.00030,0.22139,0.079631,1240.515214,1996.754074,2412.635411,0.042844,-277.08127,128.25803,42.43324,...,1.391878,3.08001,2.51326,-1.051766,-2.753359,22.093318,0.002381,-0.009946,161.499023,country
4,country.00089,0.322114,0.104638,1321.678546,1667.211777,2583.926042,0.052503,-205.12328,140.99438,-8.959963,...,3.140506,-4.527332,2.654261,-3.122376,-10.710899,24.18941,0.00185,0.0029,143.554688,country


In [13]:
# On supprime la premiere colonne
data = data.drop(['filename'],axis=1)

# On change les noms des genres par des entiers (de 0 a 9) car notre random forest ne prend en compte que des entiers comme labels
genre_list = data.iloc[:, -1]
encoder = LabelEncoder()
labels = encoder.fit_transform(genre_list)
data.iloc[:, -1] = labels

In [14]:
# On normalise le dataset
scaler = StandardScaler()
data_normalized = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))

## 2. Apprentissage du modèle de base
On reprend le modèle que nous avions implementé lors du précédent projet.

In [15]:
# On separe le dataset en train set et test set (80%/20%)
data_train, data_test, label_train, label_test = train_test_split(data_normalized, labels, test_size=0.2)
print("data_train {0} | label_train {1}".format(data_train.shape, label_train.shape))
print("data_test  {0} | label_test  {1}".format(data_test.shape, label_test.shape))

data_train (800, 30) | label_train (800,)
data_test  (200, 30) | label_test  (200,)


In [16]:
# Initialisation de la random forest
# par defaut n_trees = 200, n_samples = 100, n_cuts = 20, max_depth = 20
rf_classifier = rf.OurRandomForestClassifier() 

# Entrainement du modèle de base
start = time.time()
rf_classifier.fit(data_train, label_train)
end = time.time()
print("Execution time for building the forest: %f sec"%(float(end) - float(start)))

# Test de validation
our_predictions = [rf_classifier.predict(data_test[i,:]) for i in range(data_test.shape[0])] 

Execution time for building the forest: 22.868670 sec


In [17]:
# Comparaison avec sklearn
sklearn_rf = RandomForestClassifier(n_estimators=100, max_depth=20, max_features='sqrt')
sklearn_rf.fit(data_train, label_train)
sklearn_predictions = sklearn_rf.predict(data_test)

In [18]:
print("Our random forest score : {} %".format(rf_classifier.score(our_predictions, label_test) * 100))  
print("Sklearn score : {} %".format(rf_classifier.score(sklearn_predictions, label_test)*100))          

Our random forest score : 66.0 %
Sklearn score : 74.0 %


Il y a une très grande marge d'amélioration.

## 3. Feature selection
Un moyen d'améliorer notre modèle est de sélectionner les caractéristiques les plus discriminantes. 

* https://towardsdatascience.com/de-coding-random-forests-82d4dcbb91a1
* https://hub.packtpub.com/4-ways-implement-feature-selection-python-machine-learning/

In [19]:
print("Shape of the dataset ", data_train.shape)
print("Size of the dataset before feature selection: %.2f MB"%(data_train.nbytes/1e6))
features_name = data.drop(['genre/label'], axis=1).columns

Shape of the dataset  (800, 30)
Size of the dataset before feature selection: 0.19 MB


In [20]:
start = time.time()
features, importances = rf_classifier.findFeatureImportance(data_train, label_train)
end = time.time()
print("Execution time to find the most important feature: %f sec"%(float(end) - float(start)))

Execution time to find the most important feature: 49.854136 sec


In [21]:
feature_importances = pd.DataFrame(zip(features_name, importances), columns = ['feature','importance']).sort_values('importance', ascending=False)

feature_importances[:10]

Unnamed: 0,feature,importance
26,spectral_contrast,0.045
0,chroma_stft,0.04
14,mfcc9,0.02125
9,mfcc4,0.02125
22,mfcc17,0.01375
18,mfcc13,0.0125
17,mfcc12,0.01125
24,mfcc19,0.01125
16,mfcc11,0.01125
2,spectral_centroid,0.00875


In [22]:
# On selectionne les 25 meilleures
indexes = feature_importances.index[:25]

# On transforme le dataset d'entrainement (fs = feature selection)
fs_data_train = rf.transform(data_train, indexes)
fs_data_test = rf.transform(data_test, indexes)
print("Shape of the dataset ", fs_data_train.shape)
print("Size of the dataset after feature selection: %.2f MB"%(fs_data_train.nbytes/1e6))

Shape of the dataset  (800, 25)
Size of the dataset after feature selection: 0.16 MB


In [26]:
# Entrainement du modèle de base avec feature selection
start = time.time()
rf_classifier.fit(fs_data_train, label_train)
end = time.time()
print("Execution time for building the forest: %f sec"%(float(end) - float(start)))

# Test de validation
our_predictions = [rf_classifier.predict(fs_data_test[i,:]) for i in range(data_test.shape[0])] 

Execution time for building the forest: 16.504025 sec


In [24]:
# Comparaison avec sklearn
sklearn_rf.fit(fs_data_train, label_train)
sklearn_predictions = sklearn_rf.predict(fs_data_test)

In [27]:
# Score
print("Our random forest score after feature selection: {} %".format(rf_classifier.score(our_predictions, label_test) * 100))
print("Sklearn score : {} %".format(rf_classifier.score(sklearn_predictions, label_test)*100))   

Our random forest score after feature selection: 59.5 %
Sklearn score : 75.0 %


## 4. Réglage des hyperparamètres
Nous nous sommes fortement inspiré de la méthode décrite dans cet article [W. Koehrsen. Hyperparameter Tuning the Random Forest in Python, Janv. 2018](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74)

Notre modèle a cinq paramètres, dont quatre que nous souhaitons optimiser  :
   - `n_trees` -- le nombre d'arbres de la forêt
   - `n_samples` -- le nombre de données à placer dans le noeud de chaque arbre avant qu'il ne soit partitionné
   - `n_cuts` -- le nombre de coupes à tester pour trouver la meilleure
   - `max_depth` -- la profondeur maximale de chaque arbre

Pour avoir une première idée de la meilleure combinaison d'hyperparamètres, nous allons effectuer une ... (Random Search Cross Validation). Cela consiste à tester un large choix de combinaisons qui ont été formées en tirant aléatoirement des valeurs dans une grille d'hyperparamètres.

### 2.1 Random Search Cross Validation

#### Random Hyperparameter Grid

On définit la grille pour la recherche aléatoire (Random Hyperparameter Grid) :

In [None]:
# n_trees
n_trees = [int(x) for x in np.linspace(start = 200, stop = 800, num = 10)] 

# n_samples
n_samples = [int(x) for x in np.linspace(start = 200, stop = 700, num = 10)]

# n_cuts 
# dans quel intervalle pourrait on tester ??
n_cuts = [int(x) for x in np.linspace(start = 10, stop = 100, num = 5)] 

# max_depth
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
# Est-ce qu'on ajoute un None comme dans l'article ? A voir selon notre random forest, est-ce qu'elle prend en compte
# un arg None pour max_depth ?

# Creation de la grille
random_grid = {'n_trees': n_trees,
                'n_samples': n_samples,
                'n_cuts': n_cuts,
                'max_depth': max_depth}

from pprint import pprint
print("Grille d'hyperparametres :\n")
pprint(random_grid)

Au lieu de tester 10 * 10 * 5 * 10 = 5000 combinaisons d'hyperparamètres, nous allons seulement en sélectionner quelques unes aléatoirement.

#### Random Search Training

On construit tout d'abord le modèle de base.

In [None]:
# On separe le dataset en train set et test set (80%/20%)
data_train, data_test, label_train, label_test = train_test_split(data, labels, test_size=0.2)

In [None]:
our_rf = rf.OurRandomForestClassifier() # choix aléatoire
our_rf.get_params().keys()

On procède à la recherche randomisée sur 50 combinaisons, en utilisant une 3-fold CV

In [None]:
# NE PAS RUN (CA PREND PLUS D'1/4  D'HEURE)
# Definition de la recherche randomisee
rf_random = RandomizedSearchCV(estimator = our_rf, 
                               param_distributions = random_grid, 
                               n_iter = 5, 
                               cv = 3, 
                               verbose = 2, 
                               random_state = 8)
# je dois encore comprendre les differents parametres

# Entrainement du modele
rf_random.fit(data_train, label_train)

Les resultats

In [None]:
print("La meilleure combinaison d'hyperparametres avec la recherche randomisee est :")
print(rf_random.best_params_)
print("")
print("Le score moyen du modele avec ces hyperparametres est :")
print(rf_random.best_score_)

### 2.2 Grid Search Cross Validation
Une fois qu'on connait a peu pres les meilleurs hyper-parametres
Plus d'aleatoire, on teste toutes les combinaisons