In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import os
import csv
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [6]:
# Load the preprocessed track table (path is relative to the working directory).
df = pd.read_csv('data/preprocessed_data.csv')

# Feature matrix: drop the target column and the id / name / artist / album columns,
# then store `explicit` as an integer column.
X = (
    df
    .drop(columns=['track_genre', 'track_id', 'track_name', 'artists', 'album_name'])
    .assign(explicit=lambda features: features['explicit'].astype(int))
)

# Target vector: the genre label of every row.
y = df['track_genre']

# Preview the first three feature rows.
X.head(3)

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode,explicit,time_signature,key
0,0.975633,-0.730859,-0.845908,-1.88998,-1.784744,-0.078993,1.831732,-0.504094,-0.591211,-0.79869,-1.489717,1,0,4,1
1,1.065299,-0.160332,-0.742186,-1.122669,-0.293288,-0.273826,-0.315499,-0.504112,-0.507167,-1.365688,-1.528312,1,0,4,0
2,1.692961,-0.243214,-1.733304,-2.312994,-2.039252,-0.457309,1.774593,-0.503883,-0.428376,-1.276974,1.987859,1,0,3,0


In [7]:
# Standardize every feature column with statistics computed from the full feature matrix.
# After this step `X` holds the transformed values returned by the scaler rather than
# the original DataFrame.
# NOTE(review): the scaler is fitted before the train/test split that follows, so test
# rows contribute to the scaling statistics — confirm this is intended.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)


In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate values explored by the random-forest hyperparameter search.
n_estimators_range = [50, 100, 150]
max_depth_range = [10, 20, 30]
criterion_range = ['gini', 'entropy']


In [10]:
# NOTE(review): this default, unfitted classifier is not used anywhere in this cell;
# it is kept only in case another cell refers to `RF`.
RF = RandomForestClassifier()

# Start below any finite score so that even a combination scoring exactly 0 is recorded
# (starting at 0 would leave `best_params` full of None in that case).
best_score = float('-inf')
best_params = {'n_estimators': None, 'max_depth': None, 'criterion': None}

# Exhaustive search over the hyperparameter grid, scoring each combination with
# four-fold cross-validation on the training set only (the test set is not touched here).
for n_estimators in n_estimators_range:
    for max_depth in max_depth_range:
        for criterion in criterion_range:
            # Fresh model for this combination; the fixed seed makes runs repeatable.
            model = RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           criterion=criterion,
                                           random_state=42)

            # Mean of the four per-fold scores is the figure of merit for this combination.
            scores = cross_val_score(model, X_train, y_train, cv=4)
            mean_score = scores.mean()

            # Report the current hyperparameters and their corresponding mean score.
            print(f'n_estimators: {n_estimators}, max_depth: {max_depth}, criterion: {criterion},\n mean_score: {mean_score}')

            # Strict '>' keeps the earliest combination when several tie.
            if mean_score > best_score:
                best_score = mean_score
                best_params = {'n_estimators': n_estimators,
                               'max_depth': max_depth,
                               'criterion': criterion}

# Report the winning combination and its cross-validated score.
print(f'Best hyperparameters: {best_params}')
print(f'Best score: {best_score}')

# Refit a model with the best hyperparameters on the entire training set.
final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)

# Evaluate the final model on the held-out test set.
y_pred = final_model.predict(X_test)

# accuracy_score takes (y_true, y_pred): compare predictions against the true test
# labels, not against the feature matrix.
test_score = accuracy_score(y_test, y_pred)

print(f'Test set score: {test_score}')


n_estimators: 50, max_depth: 10, criterion: gini,
 mean_score: 0.29844990453220777
n_estimators: 50, max_depth: 10, criterion: entropy,
 mean_score: 0.3056099889458346
n_estimators: 50, max_depth: 20, criterion: gini,
 mean_score: 0.3282082202793689
