# Random Forest

Como vimos antes, este algoritmo es una combinación de varios árboles de decisión. Una predicción basada en el voto de muchos árboles suele ser mejor (más estable y con menos sobreajuste) que la de un solo árbol.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

In [2]:
# Load the Spotify dataset and discard the first CSV column in a single chained
# expression (presumably a saved row index — confirm against the file).
spotify = pd.read_csv('data_spotify.csv').iloc[:, 1:]
spotify.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286,1,Mask Off,Future
1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588,1,Redbone,Childish Gambino
2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173,1,Xanny Family,Future
3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23,1,Master Of None,Beach House
4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904,1,Parallel Lines,Junior Boys


In [3]:
# Feature matrix: every column except the label and the two free-text columns.
# Dropping by name (instead of slicing the first 13 positions) keeps `target`
# out of the features even if the column order of the CSV ever changes.
NON_FEATURE_COLUMNS = ['target', 'song_title', 'artist']
X = spotify.drop(columns=NON_FEATURE_COLUMNS)
X.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286
1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588
2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173
3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23
4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904


In [4]:
# Label vector: the `target` column of the dataset.
y = spotify['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [19]:
# Import the random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Train a forest of 100 trees (n_estimators is the number of trees to fit).
# random_state seeds the bootstrap sampling and the per-split feature sampling,
# so the scores are reproducible on Restart & Run All.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = clf.predict(X_test)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report accuracy, the confusion matrix and the per-class report,
# each starting on its own line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.7623762376237624
[[160  40]
 [ 56 148]]
              precision    recall  f1-score   support

           0       0.74      0.80      0.77       200
           1       0.79      0.73      0.76       204

   micro avg       0.76      0.76      0.76       404
   macro avg       0.76      0.76      0.76       404
weighted avg       0.76      0.76      0.76       404



¿Cuáles son las características que más importan para clasificar? Veamos la importancia de cada una (recuerden que las importancias suman 1).

In [22]:
# Rank the features by the fitted forest's importance scores, highest first.
feature_imp = (
    pd.Series(clf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
feature_imp

instrumentalness    0.137825
loudness            0.122481
danceability        0.105091
speechiness         0.102608
energy              0.098962
duration_ms         0.087517
acousticness        0.083640
valence             0.077094
tempo               0.073939
liveness            0.059319
key                 0.034393
mode                0.012401
time_signature      0.004731
dtype: float64

### Sí vemos que lo hizo mejor que el decision tree sencillo, e incluso mejor que el SVM que ya había mejorado mucho

In [23]:
# Raise the forest to 1000 trees to see whether accuracy improves; much larger
# forests are impractical because of memory and compute cost.
# random_state makes the fitted forest (and the scores) reproducible.
clf = RandomForestClassifier(n_estimators=1000, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [24]:
# Accuracy, confusion matrix and per-class report for the current predictions,
# each starting on its own line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.7772277227722773
[[158  42]
 [ 48 156]]
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       200
           1       0.79      0.76      0.78       204

   micro avg       0.78      0.78      0.78       404
   macro avg       0.78      0.78      0.78       404
weighted avg       0.78      0.78      0.78       404



In [25]:
# Same 1000-tree forest, but with every tree capped at depth 3 to see how
# shallow trees affect the test scores.
# random_state makes the fitted forest (and the scores) reproducible.
clf = RandomForestClassifier(n_estimators=1000, max_depth=3, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6955445544554455
[[144  56]
 [ 67 137]]
              precision    recall  f1-score   support

           0       0.68      0.72      0.70       200
           1       0.71      0.67      0.69       204

   micro avg       0.70      0.70      0.70       404
   macro avg       0.70      0.70      0.70       404
weighted avg       0.70      0.70      0.70       404



In [26]:
# Depth-3 forest again, now letting each split consider 4 candidate features.
# random_state makes the fitted forest (and the scores) reproducible.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=3,
    max_features=4,
    random_state=0,
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7004950495049505
[[143  57]
 [ 64 140]]
              precision    recall  f1-score   support

           0       0.69      0.71      0.70       200
           1       0.71      0.69      0.70       204

   micro avg       0.70      0.70      0.70       404
   macro avg       0.70      0.70      0.70       404
weighted avg       0.70      0.70      0.70       404

