In [14]:
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier

In [15]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

In [16]:
# Load the feature table and the target table from their CSV files
# (features first, then targets).
df_features, df_target = (
    pd.read_csv(csv_name) for csv_name in ("features_df.csv", "target_df.csv")
)

In [17]:
# Encode the label held in the second column of df_target (selected by
# position) as an integer flag: 'normal' -> 0, 'abnormal' -> 1.
# An explicit map + astype is used instead of Series.replace so the result does
# not rely on pandas' implicit object -> int downcasting (deprecated since
# pandas 2.2). astype("int64") also fails loudly if a value is neither 'normal'
# nor 'abnormal', instead of silently leaving it un-encoded.
df_target["normality"] = (
    df_target.iloc[:, 1].map({"normal": 0, "abnormal": 1}).astype("int64")
)

In [18]:
# Discard the first column of df_target (selected by position, not by name).
# The returned frame is rebound instead of using inplace=True, so the original
# object is not mutated behind any other reference to it.
# NOTE: not idempotent - every execution of this cell removes one more column.
df_target = df_target.drop(columns=df_target.columns[0])

In [19]:
# Discard the first column of df_features (selected by position, not by name).
# The returned frame is rebound instead of using inplace=True, so the original
# object is not mutated behind any other reference to it.
# NOTE: not idempotent - every execution of this cell removes one more column.
df_features = df_features.drop(columns=df_features.columns[0])

In [20]:
# Display the (rows, columns) shape of the feature table.
df_features.shape

(4170, 160)

In [21]:
# Display the (rows, columns) shape of the target table.
df_target.shape

(4170, 2)

In [22]:
# X: the feature table; y: the integer-encoded "normality" column of df_target.
X = df_features
y = df_target["normality"]

# Features and labels are loaded from two separate CSV files, so check that
# they have the same number of rows before they are passed to scikit-learn.
assert len(X) == len(y), f"feature/label row mismatch: {len(X)} != {len(y)}"

In [23]:
# Split the data into 5 stratified folds: each fold preserves the percentage of
# samples of every class. This `kf` splitter is passed as the `cv` argument of
# cross_val_score() in the cells below.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# split() yields (train_indices, test_indices) pairs; report the size of each fold.
for fold_number, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{fold_number}, Train set: {len(train_index)}, Test set:{len(test_index)}')

# Note:
# Passing a plain integer (e.g. cv=5) to cross_val_score() with a classifier also
# uses the StratifiedKFold splitting strategy, but WITHOUT shuffling. The folds,
# and therefore the scores, can differ from the shuffled, seeded `kf` defined above.

Fold:1, Train set: 3336, Test set:834
Fold:2, Train set: 3336, Test set:834
Fold:3, Train set: 3336, Test set:834
Fold:4, Train set: 3336, Test set:834
Fold:5, Train set: 3336, Test set:834


In [24]:
# Baseline: cross-validated accuracy of a random forest with default settings
# (fixed seed), evaluated on the folds produced by `kf`.
score = cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
# A format spec inside the f-string replaces the nested str.format() call.
print(f'Average score: {score.mean():.2f}')

Scores for each fold are: [0.99280576 0.99160671 0.9940048  0.99040767 0.99160671]
Average score: 0.99


In [25]:
# Compare the mean cross-validated accuracy of random forests of several sizes.
# NOTE(review): the same folds are used both to choose n_estimators and to
# report accuracy, so the best figure printed here is an optimistic estimate -
# confirm the chosen value on data not used for this comparison.
n_estimators = [50, 100, 150, 200, 250, 300, 350]

for val in n_estimators:
    forest = RandomForestClassifier(n_estimators=val, random_state=42)
    score = cross_val_score(forest, X, y, cv=kf, scoring="accuracy")
    print(f'Average score({val}): {score.mean():.3f}')

Average score(50): 0.992
Average score(100): 0.992
Average score(150): 0.993
Average score(200): 0.994
Average score(250): 0.994
Average score(300): 0.994
Average score(350): 0.994
