# Supervised Machine Learning Models for Music Genre Classification

In [58]:
import pandas as pd

df = pd.read_csv('Data/features_3_sec.csv')

### Preprocess Data

In [59]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Selecting target column as music genre (already named label) 
X = df.drop(columns=['label', 'filename'])
y = df['label']

# Standardize 
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [60]:
from sklearn.model_selection import train_test_split
random_state = 42

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=random_state)

### Initialize Models

In [86]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

models = {
    'Random Forest': RandomForestClassifier(n_estimators=1000, random_state=random_state),
    'SVM': SVC(random_state=random_state),
    'KNN': KNeighborsClassifier(n_neighbors=10),
    'Decision Tree': DecisionTreeClassifier(random_state=random_state),
    'XGB': XGBClassifier(random_state=random_state)
}

In [93]:
from sklearn.metrics import classification_report, accuracy_score

# Method to briefly assess models based primarily on accuracy
def model_assess(model):
    # Train and make predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(model)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

### Assess Models

In [94]:
for model_name, model in models.items():
    print(f'Evaluating model: {model_name}')
    model_assess(model)

Evaluating model: Random Forest
RandomForestClassifier(n_estimators=1000, random_state=42)
Accuracy: 0.8909
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89       208
           1       0.92      0.99      0.95       203
           2       0.78      0.84      0.81       186
           3       0.88      0.86      0.87       199
           4       0.95      0.89      0.91       218
           5       0.88      0.91      0.90       192
           6       0.90      0.96      0.93       204
           7       0.92      0.94      0.93       180
           8       0.89      0.89      0.89       211
           9       0.90      0.75      0.82       197

    accuracy                           0.89      1998
   macro avg       0.89      0.89      0.89      1998
weighted avg       0.89      0.89      0.89      1998

Evaluating model: SVM
SVC(random_state=42)
Accuracy: 0.7588
Classification Report:
              precision    

### Tune Hyperparameters of Best Model

In [89]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Best model was XGB
param_grid = {
    'n_estimators': np.arange(50, 301, 50),
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': np.arange(3, 21, 2),
    'min_child_weight': np.arange(1, 21, 2),
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0, 0.1, 0.5, 1.0]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=XGBClassifier(random_state=random_state), param_distributions=param_grid, 
                                   n_iter=100, cv=3, verbose=2, random_state=random_state, n_jobs=-1)
random_search.fit(X_train, y_train)
best_params_random = random_search.best_params_
print(f"Best parameters found by RandomizedSearchCV: {best_params_random}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found by RandomizedSearchCV: {'subsample': 1.0, 'reg_lambda': 0, 'reg_alpha': 0, 'n_estimators': 250, 'min_child_weight': 5, 'max_depth': 13, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}


In [91]:
from sklearn.model_selection import GridSearchCV

# Define refined parameter grid for GridSearchCV based on RandomizedSearchCV results
param_grid_refined = {
    'n_estimators': [best_params_random['n_estimators']],
    'learning_rate': [best_params_random['learning_rate']],
    'max_depth': [best_params_random['max_depth'] - 1, best_params_random['max_depth'], best_params_random['max_depth'] + 1],
    'min_child_weight': [best_params_random['min_child_weight'] - 1, best_params_random['min_child_weight'], best_params_random['min_child_weight'] + 1],
    'subsample': [best_params_random['subsample']],
    'colsample_bytree': [best_params_random['colsample_bytree']],
    'gamma': [best_params_random['gamma']],
    'reg_alpha': [best_params_random['reg_alpha']],
    'reg_lambda': [best_params_random['reg_lambda']],
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=XGBClassifier(random_state=random_state), param_grid=param_grid_refined, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params_grid = grid_search.best_params_
print(f"Best parameters found by GridSearchCV: {best_params_grid}")

# Best estimator from GridSearchCV
best_xgb_model = grid_search.best_estimator_


Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best parameters found by GridSearchCV: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 13, 'min_child_weight': 4, 'n_estimators': 250, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 1.0}


### Evaluate Tuned Model

In [95]:
xgb = XGBClassifier(**best_params_grid)
model_assess(xgb)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.6, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=13, max_leaves=None,
              min_child_weight=4, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=250, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
Accuracy: 0.9134
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.90       208
           1       0.92      0.99      0.95       203
           2       0.84      0.90      0.

In [None]:
def complete_model_evaluation(): 
    pass