In [8]:
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier

In [9]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

In [10]:
# Load the fan feature table and the matching target table from CSV.
# Both files are read from the relative "Features/" directory, features first.
df_fan_features, df_fan_target = map(
    pd.read_csv,
    ("Features/df_fan_feature.csv", "Features/df_fan_target.csv"),
)

In [11]:
# Feature matrix: the feature DataFrame is used as-is (same object, no copy).
X = df_fan_features

# Labels: flatten the target table's underlying array into a 1-D vector.
# NOTE(review): assumes the target CSV holds a single label column — confirm.
y = np.ravel(df_fan_target.values)

In [12]:
# Build a 5-fold stratified splitter.
# Stratification preserves the percentage of samples of each class in every
# fold; shuffle=True with a fixed random_state makes the fold assignment
# reproducible. This `kf` object is what the cross_val_score() calls in the
# following cells receive as their `cv` argument.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# split() yields one (train indices, test indices) pair per fold.
# Print the size of each side as a sanity check; enumerate() numbers the
# folds from 1 so no hand-maintained counter is needed.
for fold_no, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{fold_no}, Train set: {len(train_index)}, Test set:{len(test_index)}')

# Note:
# Passing a plain integer (e.g. cv=5) to cross_val_score() also selects
# StratifiedKFold when the estimator is a classifier, but that default
# splitter is created with shuffle=False. It is therefore NOT equivalent to
# `kf` above: the folds differ, so the scores can differ too. Keep passing
# cv=kf when shuffled, seeded folds are wanted.

Fold:1, Train set: 13320, Test set:3330
Fold:2, Train set: 13320, Test set:3330
Fold:3, Train set: 13320, Test set:3330
Fold:4, Train set: 13320, Test set:3330
Fold:5, Train set: 13320, Test set:3330


In [13]:
# Baseline: a random forest with default hyper-parameters (seeded so the run
# is repeatable), evaluated by accuracy on the folds produced by `kf`.
baseline_forest = RandomForestClassifier(random_state=42)
score = cross_val_score(baseline_forest, X, y, cv=kf, scoring="accuracy")

# Report the per-fold accuracies and their mean (two decimals).
print(f'Scores for each fold are: {score}')
print(f'Average score: {score.mean():.2f}')

Scores for each fold are: [0.93633634 0.94054054 0.92942943 0.93783784 0.93393393]
Average score: 0.94


In [14]:
# Candidate forest sizes to compare: 50, 100, ..., 350 trees.
n_estimators = list(range(50, 351, 50))

# For each size, cross-validate a seeded random forest on the `kf` folds and
# print the mean accuracy (three decimals).
for n_trees in n_estimators:
    forest = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    score = cross_val_score(forest, X, y, cv=kf, scoring="accuracy")
    print(f'Average score({n_trees}): {score.mean():.3f}')

Average score(50): 0.935
Average score(100): 0.936
Average score(150): 0.936
Average score(200): 0.936
Average score(250): 0.936
Average score(300): 0.937
Average score(350): 0.936
