# Random Forest

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rd
from collections import defaultdict 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Du precedent projet
import randomforest as rf 

## 1. Importation des données

In [2]:
data = pd.read_csv("data.csv")

In [3]:
print(data.shape)
data.head()

(1000, 29)


Unnamed: 0,filename,chroma_stft,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,estimated_tempo,genre/label
0,country.00094,0.366838,0.206715,1474.849928,1745.839794,3108.264538,0.062993,-104.34503,136.39078,-20.945368,...,-6.495045,0.03738,-2.03518,2.197093,-7.10939,1.849674,1.675598,-1.492039,112.347147,country
1,country.00025,0.347253,0.07992,1565.431223,2016.069774,3188.930717,0.057303,-200.55273,119.6749,-3.610737,...,-6.048296,-0.800009,-4.699653,-11.293138,-8.870809,-8.073557,-3.161044,0.330751,143.554688,country
2,country.00076,0.297332,0.128385,1321.679067,1409.586676,2590.39267,0.066525,-183.93301,159.80644,-23.158834,...,-7.383808,0.604945,-3.383083,-3.583324,-9.062474,-3.159365,-4.068544,-7.052489,143.554688,country
3,country.00030,0.22139,0.079631,1240.515214,1996.754074,2412.635411,0.042844,-277.08127,128.25803,42.43324,...,-7.227779,-7.456974,-1.575342,1.391878,3.08001,2.51326,-1.051766,-2.753359,161.499023,country
4,country.00089,0.322114,0.104638,1321.678546,1667.211777,2583.926042,0.052503,-205.12328,140.99438,-8.959963,...,-2.352035,4.481439,-5.014471,3.140506,-4.527332,2.654261,-3.122376,-10.710899,143.554688,country


In [4]:
# On supprime la premiere colonne
data = data.drop(['filename'],axis=1)

# On change les noms des genres par des entiers (de 0 a 9) car notre random forest ne prend en compte que des entiers comme labels
genre_list = data.iloc[:, -1]
encoder = LabelEncoder()
labels = encoder.fit_transform(genre_list)
data.iloc[:, -1] = labels

In [5]:
# On normalise le dataset
scaler = StandardScaler()
data = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))

## 2. Réglage des hyperparamètres

## 3. Apprentissage
On reprend le modèle que nous avions implementé lors du précédent projet.

In [6]:
# On separe le dataset en train set et test set (80%/20%)
data_train, data_test, label_train, label_test = train_test_split(data, labels, test_size=0.2)

In [7]:
# On convertit data_train en un dictionnaire pour pouvoir le traiter
# les clés étant les labels representant respectivement un genre de musique.

data_train_dict = defaultdict(list)
for k in range(data_train.shape[0]):
    data_train_dict[label_train[k]].append(data_train[k])

In [8]:
root = rf.Node(data_train_dict)
list_trees = rf.randomForest(root.data, 500, 100, 10)
guessed_label = rf.forestClassify(list_trees, data_test[1,:], len(labels))
guessed_genre = encoder.inverse_transform(np.array([guessed_label]))
print("La musique caracterisee par {0} a ete categorisee en {1}".format(data_test[1,:], guessed_genre))

La musique caracterisee par [-1.85953021 -1.69003906 -2.3074728  -2.21758478 -2.61248048 -1.63474084
 -3.21873231  2.6348123   3.38495378 -1.84804673  0.88189596 -1.53958536
  0.46526103 -1.90564152  0.16891031 -2.59132852  0.11904781 -1.49603276
  0.86924647 -1.45508557  0.3402476  -1.0774451   0.70750102 -1.08027028
  0.11195597 -0.39078592 -1.03872078] a ete categorisee en ['classical']


Essayons maintenant sur l'ensemble des données de `data_train`

In [9]:
guesses = [rf.forestClassify(list_trees, data_test[i,:], len(labels)) for i in range(data_test.shape[0])]

print("Our function made good predictions at ", rf.score(guesses, label_test)*100, "% rate")

sklearn_rf = RandomForestClassifier(n_estimators=15, max_depth=5, max_features='sqrt')
sklearn_rf.fit(data_train, label_train)
guessess = sklearn_rf.predict(data_test)

print("Sklearn function made good predictions at ", rf.score(guessess, label_test)*100,"% rate")

Our function made good predictions at  61.5 % rate
Sklearn function made good predictions at  59.5 % rate
