## Imports

In [None]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt, seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, \
ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import IPython.display as ipd
import os
import zipfile
# Default size (width, height in inches) for every figure created in this notebook.
plt.rcParams.update({'figure.figsize': (10, 3)})

## Load Data

In [None]:
# Unzip the echonest and tracks metadata archives, skipping any whose CSV
# has already been extracted.
file_names = ["echonest", "tracks"]
extract_path = "../fma-metadata/"

for file_name in file_names:
    zip_path = f"../fma-metadata/{file_name}.zip"
    csv_path = os.path.join(extract_path, f"{file_name}.csv")

    # os.path.exists() does not expand wildcards, so check for the concrete
    # CSV file that the loading cell reads instead of a "name.*" pattern.
    if not os.path.exists(csv_path):
        # Open the zip file and extract all of its contents next to it.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

In [None]:
# Load the data from the multi-header CSV files.
# Forward slashes are used throughout: a backslash in the path is both an
# invalid string escape ("\e") and a wrong separator outside Windows.
echonest = pd.read_csv('../fma-metadata/echonest.csv', index_col=0, header=[0, 1, 2])
tracks = pd.read_csv('../fma-metadata/tracks.csv', index_col=0, header=[0, 1])

# Flatten the three-level echonest header to its innermost level, then keep
# only the audio-feature columns used as model inputs.
echonest.columns = [multicols[-1] for multicols in echonest.columns]
echonest = echonest[["acousticness", "danceability", "energy", "instrumentalness",
                     "liveness", "speechiness", "tempo", "valence"]]

# Pull the ("track", "genre_top") column out as a Series named "genre_top"
# so the merge below produces a flat column of that name.
tracks = tracks[("track", "genre_top")].rename("genre_top")

# Inner join on the track id index, then show the genre distribution
# (including missing values).
df = pd.merge(echonest, tracks, left_index=True, right_index=True)
df["genre_top"].value_counts(dropna=False)

## Data Prep

In [None]:
# Number of tracks per top-level genre (missing genres are not counted).
top_genres = df["genre_top"].value_counts()
top_genres

In [None]:
# Remove every track whose top genre is 'International'.
is_international = df['genre_top'] == 'International'
international_ids = df.loc[is_international].index
df = df.drop(index=international_ids)

In [None]:
# Replace each object-dtype column with integer codes, keeping the fitted
# encoder per column so the codes can be mapped back to the original values.
categorical_cols = df.select_dtypes(include=["object"]).columns
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [None]:
# Display the column dtypes to check the result of the encoding step.
df.dtypes

In [None]:
# Separate the features from the target, encode the target and split the
# data into training and test sets.
X = df.drop(columns=['genre_top'])

labels = df.loc[:, 'genre_top']
y = labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# The categorical-encoding cell has already turned genre_top into integer
# codes, so a Categorical built directly from `labels` would have those
# integers as its categories and the genre names would be lost in the plots
# and reports. Map the classes back through the stored encoder when there is
# one; otherwise the classes seen here are already the original values.
genre_encoder = label_encoders.get('genre_top')
if genre_encoder is not None:
    class_names = genre_encoder.inverse_transform(le.classes_)
else:
    class_names = le.classes_
# Category i is the name of encoded class i in y_encoded.
cat_y = pd.Categorical.from_codes(y_encoded, categories=class_names)

# train / test: 70 / 30 shuffled split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, shuffle=True, random_state=123
)

## Training the Model and Randomized Search

In [None]:
# Set up the randomized search: base classifier plus the candidate values
# that are sampled for each hyperparameter.
classifier = xgb.XGBClassifier()

params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
    "booster": ["dart", "gbtree"],
    # XGBoost's row-sampling parameter is spelled "subsample"; the previous
    # key "sub_sample" is not a recognised parameter, so it was never tuned.
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

In [None]:
# Randomized search over `params`: 5 sampled settings, each scored by
# accuracy with 5-fold cross-validation, using n_jobs=-1 for parallelism.
rs_model = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=params,
    n_iter=5,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=3,
)

In [None]:
# Model fitting: run the randomized hyperparameter search on the training split.
rs_model.fit(X_train,y_train)

In [None]:
# Keep the estimator with the best cross-validated score from the search.
model = rs_model.best_estimator_

In [None]:
# Fit the best model on the training data.
# NOTE(review): the search is constructed without a `refit` argument, so with
# scikit-learn's default the best estimator has presumably already been refit
# on the full training split and this call is likely redundant - confirm.
model.fit(X_train, y_train)

## Evaluate the Model

In [None]:
# Predict on the held-out test split and keep a separate copy of the true labels.
y_pred = model.predict(X_test)
y_true = y_test.copy()

In [None]:
# Overall accuracy and macro-averaged F1 on the test split.
accuracy = accuracy_score(y_true, y_pred)
macro_f1 = f1_score(y_true, y_pred, average='macro')
print('XGB Accuracy: ', accuracy)
print('XGB F1: ', macro_f1)

In [None]:
# Plot the confusion matrix with one subplot per true class; each row is
# drawn as its own heatmap.
class_names = cat_y.categories
n_classes = len(class_names)

# Compute the matrix once instead of once per row, and pin the label set to
# the encoded classes 0..n_classes-1 so its shape always matches the tick
# labels even if a class is absent from the test split.
cm = confusion_matrix(y_true, y_pred, labels=np.arange(n_classes))

# The number of subplots follows the number of classes rather than a
# hard-coded 10; squeeze=False keeps `axs` indexable for a single class.
fig, axs = plt.subplots(n_classes, 1, figsize=(10, 1.2 * n_classes),
                        sharex=True, squeeze=False)
axs = axs.ravel()
axs[0].set_title('Confusion Matrix (XGBoost)')
axs[-1].set_xlabel('Predicted labels')

for ax, row, name in zip(axs, cm, class_names):
    sns.heatmap(row.reshape(1, -1), annot=True, cmap='gray_r',
                xticklabels=class_names, yticklabels=[name], ax=ax)
plt.show()

In [None]:
# Print the per-class classification report, followed by the mapping from
# each category to the integer code used in the report.
print("XGB classification report:",'\n')
print(classification_report(y_true, y_pred))
# Enumerate the categories instead of zipping with a hard-coded range(10),
# so the mapping stays complete for any number of classes.
print({name: code for code, name in enumerate(cat_y.categories)})

In [None]:
# Plot the feature importances of the fitted model.
from xgboost import plot_importance

fig, ax = plt.subplots(figsize=(8, 12))
plot_importance(model, title='Feature Importance of XGBoost model', ax=ax)
plt.show()