# **Music Genre Classification**

## Downloading data

In [None]:
import kagglehub

# Kaggle handle of the GTZAN genre-classification dataset.
DATASET_HANDLE = "andradaolteanu/gtzan-dataset-music-genre-classification"

# Fetch the latest published version and keep the location kagglehub reports.
path = kagglehub.dataset_download(DATASET_HANDLE)

# NOTE(review): the cells below read audio from './Data/genres_original/...'
# and reassign `path`, so this download location is not used directly —
# confirm the files are copied or linked into ./Data before running them.
print("Path to dataset files:", path)

## Downloading required packages

In [None]:
!pip install librosa

## Import Libraries

In [None]:
import pandas as pd
import librosa as lb
import numpy as np
import joblib
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

## Load music

In [6]:
# Sample track used to walk through the feature-extraction steps below.
path = './Data/genres_original/blues/blues.00000.wav'

# `y` is the waveform (float32 samples) and `sr` the sampling rate it was loaded at.
y, sr = lb.load(path=path, sr=22050)
y

array([ 0.00732422,  0.01660156,  0.00762939, ..., -0.05560303,
       -0.06106567, -0.06417847], dtype=float32)

## Extracting features

### 1. MFCC

MFCC splits a song into n short frames, and each frame is converted into 20 coefficients (librosa's default `n_mfcc`), so the result has shape (20, n). We then average each coefficient over all n frames, which gives a single 20-value vector representing the whole song's MFCC.

In [7]:
# n_mfcc is spelled out so this walkthrough matches the batch-extraction loop
# and the 20 `mfcc_*` column names used later; the result has one row per
# coefficient and one column per frame.
mfcc = lb.feature.mfcc(y=y, sr=sr, n_mfcc=20)

# Average over frames (axis 1) to get one value per coefficient for the song.
mfcc_mean = np.mean(mfcc, axis=1)
mfcc_mean

array([-113.59882   ,  121.57067   ,  -19.162262  ,   42.36394   ,
         -6.362266  ,   18.621931  ,  -13.699734  ,   15.339802  ,
        -12.274304  ,   10.970944  ,   -8.326061  ,    8.802088  ,
         -3.6699412 ,    5.744675  ,   -5.162783  ,    0.7517065 ,
         -1.687854  ,   -0.40873003,   -2.3026767 ,    1.2224671 ],
      dtype=float32)

### 2. Chroma

In [8]:
# Chromagram of the sample track: one row per pitch class, one column per frame.
chroma = lb.feature.chroma_stft(y=y, sr=sr)

# Collapse the frame axis so the track is described by 12 averaged values.
chroma_mean = chroma.mean(axis=1)
chroma_mean

array([0.36224282, 0.36781707, 0.4398344 , 0.2446918 , 0.247862  ,
       0.3356164 , 0.36461878, 0.43568373, 0.29598564, 0.31508029,
       0.40700835, 0.3851007 ], dtype=float32)

### 3. Spectral contrast

In [9]:
# Spectral contrast per frame; averaging over frames leaves 7 values per track.
spr = lb.feature.spectral_contrast(y=y, sr=sr)
spr_mean = spr.mean(axis=1)
spr_mean

array([15.98611558, 15.09820328, 18.45414937, 18.36224256, 18.91743082,
       17.18981618, 39.65506588])

### 4. Tonnetz

In [10]:
# Tonnetz (tonal centroid) features per frame; the frame average has 6 values.
ton = lb.feature.tonnetz(y=y, sr=sr)
ton_mean = ton.mean(axis=1)
ton_mean

array([ 0.02654771,  0.01822477,  0.00486467, -0.01663463,  0.00977708,
       -0.00421312])

### 5. Zero-Crossing Rate

In [11]:
# Zero-crossing rate per frame, averaged into a single-element array.
zcr = lb.feature.zero_crossing_rate(y=y)
zcr_mean = zcr.mean(axis=1)
zcr_mean

array([0.08304482])

### 6. Spectral Centroid

In [12]:
# Spectral centroid per frame, averaged into a single-element array.
spc = lb.feature.spectral_centroid(y=y, sr=sr)
spc_mean = spc.mean(axis=1)
spc_mean

array([1784.12264126])

### 7. Spectral Bandwidth

In [13]:
# Spectral bandwidth per frame, averaged into a single-element array.
spb = lb.feature.spectral_bandwidth(y=y, sr=sr)
spb_mean = spb.mean(axis=1)
spb_mean

array([2002.41240727])

## Concatenate

Concatenate all features into a single 1D array.

In [14]:
# Order matters: it must match the column names assigned when the per-genre
# results are consolidated (MFCC, chroma, contrast, tonnetz, ZCR, centroid, bandwidth).
feature_parts = [
    mfcc_mean,
    chroma_mean,
    spr_mean,
    ton_mean,
    zcr_mean,
    spc_mean,
    spb_mean,
]

# Join the per-feature means end to end into one flat vector for the track.
features = np.hstack(feature_parts)
features

array([-1.13598824e+02,  1.21570671e+02, -1.91622620e+01,  4.23639412e+01,
       -6.36226606e+00,  1.86219311e+01, -1.36997337e+01,  1.53398018e+01,
       -1.22743044e+01,  1.09709444e+01, -8.32606125e+00,  8.80208778e+00,
       -3.66994119e+00,  5.74467516e+00, -5.16278315e+00,  7.51706481e-01,
       -1.68785405e+00, -4.08730030e-01, -2.30267668e+00,  1.22246706e+00,
        3.62242818e-01,  3.67817074e-01,  4.39834386e-01,  2.44691804e-01,
        2.47861996e-01,  3.35616410e-01,  3.64618778e-01,  4.35683727e-01,
        2.95985639e-01,  3.15080285e-01,  4.07008350e-01,  3.85100693e-01,
        1.59861156e+01,  1.50982033e+01,  1.84541494e+01,  1.83622426e+01,
        1.89174308e+01,  1.71898162e+01,  3.96550659e+01,  2.65477071e-02,
        1.82247680e-02,  4.86466895e-03, -1.66346344e-02,  9.77707730e-03,
       -4.21311936e-03,  8.30448207e-02,  1.78412264e+03,  2.00241241e+03])

## Repeat

Repeat the above process for every audio file.

In [None]:
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']


def extract_features(path, sr=22050, n_mfcc=20):
    """Return one flat feature vector for the audio file at `path`.

    The vector is the frame-wise mean of, in this order: MFCC, chroma,
    spectral contrast, tonnetz, zero-crossing rate, spectral centroid and
    spectral bandwidth (48 values with the default ``n_mfcc=20``). The order
    must stay in sync with the column names used when consolidating.

    Parameters
    ----------
    path : str
        Location of the audio file to load.
    sr : int
        Sampling rate passed to ``lb.load``.
    n_mfcc : int
        Number of MFCC coefficients to compute.

    Raises
    ------
    Exception
        Whatever ``lb.load`` or the feature functions raise; the caller
        decides whether to skip the file.
    """
    y, sr = lb.load(path, sr=sr)
    per_frame = (
        lb.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc),
        lb.feature.chroma_stft(y=y, sr=sr),
        lb.feature.spectral_contrast(y=y, sr=sr),
        lb.feature.tonnetz(y=y, sr=sr),
        lb.feature.zero_crossing_rate(y),
        lb.feature.spectral_centroid(y=y, sr=sr),
        lb.feature.spectral_bandwidth(y=y, sr=sr),
    )
    # Every matrix is (n_values, n_frames); average over frames, then join.
    return np.hstack([np.mean(matrix, axis=1) for matrix in per_frame])


# Process every genre (not a slice of the list) so that a fresh run produces
# all ten '<genre>.pkl' files that the consolidation step loads.
print(genres)
for genre in genres:
    genre_records = []
    for j in range(100):
        # File names carry a zero-padded, five-digit index (00000 .. 00099).
        path = f'./Data/genres_original/{genre}/{genre}.{j:05d}.wav'
        try:
            genre_records.append((extract_features(path), genre))
            if j % 10 == 0:
                print(f"{j}/100 files done!")
        except Exception as err:
            # Best effort: skip files that cannot be processed, but report why
            # so that missing or unreadable files are not silently ignored.
            print(f"{j} from {genre} genre skipped! ({type(err).__name__}: {err})")
    # One pickle per genre: a list of (feature_vector, genre_label) tuples.
    joblib.dump(genre_records, f'{genre}.pkl')

## Consolidate

Consolidate all genres' data and labels into a single df.

In [34]:
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

# Each '<genre>.pkl' holds a list of (feature_vector, genre_label) tuples.
data = {genre: joblib.load(f'{genre}.pkl') for genre in genres}

# Flatten the per-genre lists into parallel feature / label lists. Distinct
# loop names are used so the audio globals `y` and friends are not overwritten.
features = []
labels = []
for genre in genres:
    for vector, label in data[genre]:
        features.append(vector)
        labels.append(label)

# Column names, in the same order the feature vectors were concatenated:
# 20 MFCC + 12 chroma + 7 spectral contrast + 6 tonnetz + 3 scalar features.
mfcc_cols = [f'mfcc_{k+1}' for k in range(20)]
chroma_cols = [f'chroma_{k+1}' for k in range(12)]
contrast_cols = [f'spectral_contrast_{k+1}' for k in range(7)]
tonnetz_cols = [f'tonnetz_{k+1}' for k in range(6)]
scalar_cols = ['zero_crossing_rate', 'spectral_centroid', 'spectral_bandwidth']
columns = mfcc_cols + chroma_cols + contrast_cols + tonnetz_cols + scalar_cols

df = pd.DataFrame(features, columns=columns)
df['labels'] = labels
joblib.dump(df, 'data.pkl')

['data.pkl']

## Shuffle and Split Data

In [39]:
df = joblib.load('data.pkl')

# Single stratified 80/20 split, so every genre keeps the same share in both sets.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1 yields exactly one (train_idx, test_idx) pair; take it directly.
tr, te = next(split.split(df, df['labels']))
train = df.iloc[tr]
test = df.iloc[te]

# Persist both partitions without the index so they reload with clean columns.
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

## Separate Features and Labels

In [9]:
# Reload the training partition written by the split step.
train = pd.read_csv('train.csv')

# Target column on one side, every remaining column as model input on the other.
labels = train['labels'].copy()
features = train.drop(columns=['labels'])
features

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,...,spectral_contrast_7,tonnetz_1,tonnetz_2,tonnetz_3,tonnetz_4,tonnetz_5,tonnetz_6,zero_crossing_rate,spectral_centroid,spectral_bandwidth
0,-179.800003,147.759232,-20.459309,58.603146,-23.800596,2.462271,-3.193963,-9.311168,-5.763920,-0.422212,...,33.238849,-0.073368,0.031914,0.020945,0.052324,0.000572,0.002034,0.054847,1317.026971,1475.601230
1,-19.763226,76.646378,-4.826523,32.238724,7.710495,17.183760,-1.495102,9.149327,2.757267,9.145537,...,20.085741,0.155051,-0.123967,-0.018200,0.146415,0.005915,-0.028871,0.104563,2505.851671,2574.313232
2,-202.223450,75.231430,-14.236297,66.306335,-8.287852,24.383793,-16.236677,17.128633,-21.286032,3.962978,...,40.529965,0.166372,-0.045911,-0.057948,0.016806,0.003366,0.020358,0.139119,2511.714306,2245.075054
3,-174.720779,75.894302,-12.799819,52.615498,-29.407379,22.755056,-22.368855,18.625555,-24.709671,21.171358,...,41.681401,0.018884,0.009707,-0.010968,-0.020466,0.002517,0.007819,0.181372,2915.202078,2420.424422
4,-235.765472,144.767288,-22.573074,62.554108,-9.306330,2.031492,-0.334849,1.998136,5.536539,0.306475,...,17.888740,-0.064465,-0.036573,0.025228,0.037819,0.012620,0.020583,0.076491,1337.777104,1518.833999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,-169.225113,132.786789,-13.487547,31.186213,6.340470,-3.755665,-2.059001,-3.625316,-4.561240,1.280983,...,16.262128,0.178420,-0.020760,-0.021259,-0.114723,-0.011897,-0.003240,0.059815,1474.273987,1886.007128
795,-123.671570,94.753395,-9.552752,53.266991,-13.762559,17.746737,-19.215258,25.295883,-18.705511,15.396704,...,41.558311,0.067030,-0.061386,-0.007251,-0.049448,-0.005936,0.005715,0.119886,2414.161865,2339.259342
796,-29.043100,51.860497,-7.567608,21.507437,8.141378,18.118086,6.894575,6.569047,0.711364,11.159763,...,17.008133,0.045675,0.014822,-0.015426,0.045968,0.001940,-0.002976,0.188552,3292.704550,2768.039085
797,-46.762787,103.700768,22.126631,9.223701,2.258480,-4.794883,4.798995,0.951538,-9.411774,-1.066728,...,15.043172,0.141158,-0.027316,0.003248,0.010238,-0.001262,-0.004489,0.090227,2362.411632,2806.845504


## **Training**

## Cross Validation

Compare and select the best model using cross_val_score from sklearn.model_selection

### 1. Decision Trees

We use "accuracy" metric for classifiers. cv = 5 is fine for classifiers.

In [18]:
# Baseline: a single decision tree, seeded for repeatable results.
dt = DecisionTreeClassifier(random_state=42)

# One accuracy score per fold (5 folds), then summary statistics over the folds.
dt_acc = cross_val_score(dt, X=features, y=labels, cv=5, scoring="accuracy")
pd.Series(dt_acc).describe()

count    5.000000
mean     0.448019
std      0.035997
min      0.412500
25%      0.415094
50%      0.456250
75%      0.456250
max      0.500000
dtype: float64

### 2. Random forests

n_estimators is the number of trees for voting. More trees → more stable but slower.\
n_jobs: number of cpu cores which can be used. -1 stands for all cores.

In [17]:
# 300-tree forest, seeded for repeatability and allowed to use every CPU core.
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)

# One accuracy score per fold (5 folds), then summary statistics over the folds.
rf_acc = cross_val_score(rf, X=features, y=labels, cv=5, scoring="accuracy")
pd.Series(rf_acc).describe()

count    5.000000
mean     0.670810
std      0.046529
min      0.625000
25%      0.647799
50%      0.650000
75%      0.687500
max      0.743750
dtype: float64

### 3. Support vector machines (SVM)

If scaling is done before cross-validation, the test folds could “see” information from the training folds, leading to overly optimistic accuracy. A pipeline prevents this data leakage by applying scaling separately within each fold, while also simplifying the code, since you don’t need to manually transform the training and test sets.

In [21]:
# Scaling lives inside the pipeline so it is re-fitted on the training part of
# every fold and never sees that fold's held-out samples.
svm_steps = [
    ('scaler', StandardScaler()),
    ('SVM', SVC()),
]
pipeline = Pipeline(svm_steps)

# One accuracy score per fold (5 folds), then summary statistics over the folds.
svc_acc = cross_val_score(pipeline, X=features, y=labels, cv=5, scoring="accuracy")
pd.Series(svc_acc).describe()

count    5.000000
mean     0.647028
std      0.019772
min      0.622642
25%      0.637500
50%      0.643750
75%      0.656250
max      0.675000
dtype: float64

## Conclusion

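Decision Tree is disqualified (mean accuracy ≈ 0.45, far below the other two models).\
I would pick SVM over Random Forest. Its mean accuracy is not better than Random Forest's (≈ 0.647 vs. ≈ 0.671), but it is more stable across folds (std ≈ 0.020 vs. ≈ 0.047). It's like comparing Trading and Mutual Funds.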