In [2]:
# Import pertinent ML functions
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score

# Import the Models
from sklearn.ensemble import RandomForestClassifier

# Import other important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import loguniform
import pickle

gtzan_feature_file = '../Data/gtzan_features.csv'

def load_gtzan_set(filename):
  df = pd.read_csv(filename)
  X, y = np.split(df.to_numpy(),[-1],axis=1)
  X = X.astype(np.float64)
  y = y.T[0]
  return X, y, df.columns.values

rand_forest_hp = {
  'n_estimators': [50, 100, 150],
  'criterion': ['gini', 'entropy'],
  'max_depth': [5, 10, 25, None],
  'max_features': ['sqrt', 'log2', None]
}

scaling_to_test = [None, 'MinMax', 'MaxAbs']

def full_data_experiment(scaling, model, hp_space):
  # Configure the outer cross-validation procedure
  outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)
  outer_results = {'scaling': scaling, 'acc': [], 'best_params': []}

  iter = 1

  for train_ix, test_ix in outer_cv.split(gtzan_X):
    print(iter)
    # Split data
    X_train, X_test = gtzan_X[train_ix, :], gtzan_X[test_ix, :]
    y_train, y_test = gtzan_y[train_ix], gtzan_y[test_ix]

    # Scale the input data if it applies
    if scaling is not None:
      scaler = MinMaxScaler() if scaling == 'MinMax' else MaxAbsScaler()
      X_train = scaler.fit_transform(X_train)
      X_test = scaler.transform(X_test)

    # Configure the inner cross-validation procedure
    cv_inner = KFold(n_splits=4, shuffle=True, random_state=1)

    # Define the Random Search & refit best model on whole training set
    search = GridSearchCV(model, hp_space, scoring='accuracy', cv=cv_inner, verbose=True, refit=True, n_jobs=-1)
 
    # Execute the Random Search with the PCA-transformed train set
    result = search.fit(X_train, y_train)

    # Get the best performing model fit on the whole training set
    best_model = result.best_estimator_

    # Evaluate model on the PCA-transformed test set
    preds = best_model.predict(X_test)
 
    # Evaluate the model
    acc = accuracy_score(y_test, preds)
 
    # Store the results
    outer_results['acc'].append(acc)
    outer_results['best_params'].append(result.best_params_)

    iter += 1

  return outer_results

In [5]:
gtzan_X, gtzan_y, gtzan_feature_list = load_gtzan_set(gtzan_feature_file)

In [6]:
rand_forest_results = []
base_rand_forest_model = RandomForestClassifier()

In [7]:
for scaling in scaling_to_test:
  print(str(scaling))
  results = full_data_experiment(scaling=scaling, model=base_rand_forest_model, hp_space=rand_forest_hp)
  rand_forest_results.append(results)

None
1
Fitting 4 folds for each of 72 candidates, totalling 288 fits
2
Fitting 4 folds for each of 72 candidates, totalling 288 fits
3
Fitting 4 folds for each of 72 candidates, totalling 288 fits
4
Fitting 4 folds for each of 72 candidates, totalling 288 fits
5
Fitting 4 folds for each of 72 candidates, totalling 288 fits
6
Fitting 4 folds for each of 72 candidates, totalling 288 fits
7
Fitting 4 folds for each of 72 candidates, totalling 288 fits
8
Fitting 4 folds for each of 72 candidates, totalling 288 fits
9
Fitting 4 folds for each of 72 candidates, totalling 288 fits
10
Fitting 4 folds for each of 72 candidates, totalling 288 fits
MinMax
1
Fitting 4 folds for each of 72 candidates, totalling 288 fits
2
Fitting 4 folds for each of 72 candidates, totalling 288 fits
3
Fitting 4 folds for each of 72 candidates, totalling 288 fits
4
Fitting 4 folds for each of 72 candidates, totalling 288 fits
5
Fitting 4 folds for each of 72 candidates, totalling 288 fits
6
Fitting 4 folds for each 

In [8]:
print('Done')

Done


In [11]:
rand_forest_results[2]

{'scaling': 'MaxAbs',
 'acc': [0.7833333333333333,
  0.81,
  0.8433333333333334,
  0.8366666666666667,
  0.8466666666666667,
  0.85,
  0.8433333333333334,
  0.8366666666666667,
  0.84,
  0.8333333333333334],
 'best_params': [{'criterion': 'gini',
   'max_depth': 25,
   'max_features': 'sqrt',
   'n_estimators': 150},
  {'criterion': 'entropy',
   'max_depth': 25,
   'max_features': 'sqrt',
   'n_estimators': 100},
  {'criterion': 'gini',
   'max_depth': None,
   'max_features': 'log2',
   'n_estimators': 150},
  {'criterion': 'gini',
   'max_depth': None,
   'max_features': 'sqrt',
   'n_estimators': 150},
  {'criterion': 'gini',
   'max_depth': 25,
   'max_features': 'sqrt',
   'n_estimators': 150},
  {'criterion': 'entropy',
   'max_depth': None,
   'max_features': 'sqrt',
   'n_estimators': 150},
  {'criterion': 'gini',
   'max_depth': 25,
   'max_features': 'log2',
   'n_estimators': 150},
  {'criterion': 'gini',
   'max_depth': 25,
   'max_features': 'log2',
   'n_estimators': 150