## Genre Classifier - Find your genre based on your chosen features

### Imports

In [11]:
# Core libs
import numpy as np
import pandas as pd

# Scikit-learn – modelling & metrics
from sklearn.preprocessing   import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline        import Pipeline
from sklearn.metrics         import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# A quick, robust multi-class model
from sklearn.ensemble import RandomForestClassifier

from sklearn.utils import resample

In [2]:
bb = pd.read_csv('../../data/billboard_100_processed.csv')
bb.head()

Unnamed: 0,index,SongID,Performer,Song,spotify_genre,spotify_track_id,spotify_track_preview_url,spotify_track_duration_ms,spotify_track_explicit,spotify_track_album,...,Instance,Previous Week Position,Peak Position,Weeks on Chart,Year,Month,WeekStart,WeekIndex,Genre,inv_pos
0,0,......And Roses And RosesAndy Williams,Andy Williams,......And Roses And Roses,"['adult standards', 'brill building pop', 'eas...",3tvqPPpXyIgKrm4PR9HCf0,https://p.scdn.co/mp3-preview/cef4883cfd1e0e53...,166106.0,False,The Essential Andy Williams,...,1,,78,1,1965,April,1965-04-03,1,Easy Listening,0.012821
1,0,......And Roses And RosesAndy Williams,Andy Williams,......And Roses And Roses,"['adult standards', 'brill building pop', 'eas...",3tvqPPpXyIgKrm4PR9HCf0,https://p.scdn.co/mp3-preview/cef4883cfd1e0e53...,166106.0,False,The Essential Andy Williams,...,1,,78,1,1965,April,1965-04-03,1,Pop,0.012821
2,0,......And Roses And RosesAndy Williams,Andy Williams,......And Roses And Roses,"['adult standards', 'brill building pop', 'eas...",3tvqPPpXyIgKrm4PR9HCf0,https://p.scdn.co/mp3-preview/cef4883cfd1e0e53...,166106.0,False,The Essential Andy Williams,...,1,,78,1,1965,April,1965-04-03,1,Easy Listening,0.012821
3,0,......And Roses And RosesAndy Williams,Andy Williams,......And Roses And Roses,"['adult standards', 'brill building pop', 'eas...",3tvqPPpXyIgKrm4PR9HCf0,https://p.scdn.co/mp3-preview/cef4883cfd1e0e53...,166106.0,False,The Essential Andy Williams,...,1,,78,1,1965,April,1965-04-03,1,Easy Listening,0.012821
4,760,(The Bees Are For The Birds) The Birds Are For...,The Newbeats,(The Bees Are For The Birds) The Birds Are For...,['northern soul'],7eGLhn6AJNiXJUS7VawiUK,https://p.scdn.co/mp3-preview/c931d8624762f331...,133706.0,False,The Newbeats,...,1,,74,1,1965,April,1965-04-03,1,Soul,0.013514


In [3]:
bb.columns

Index(['index', 'SongID', 'Performer', 'Song', 'spotify_genre',
       'spotify_track_id', 'spotify_track_preview_url',
       'spotify_track_duration_ms', 'spotify_track_explicit',
       'spotify_track_album', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'spotify_track_popularity', 'url',
       'WeekID', 'Week Position', 'Instance', 'Previous Week Position',
       'Peak Position', 'Weeks on Chart', 'Year', 'Month', 'WeekStart',
       'WeekIndex', 'Genre', 'inv_pos'],
      dtype='object')

## Feature engineering, normalizing and balancing

In [15]:
# ------------------------------------------------------------
# 0.  DATA  ──  replace  `df`  by whatever you called the frame
# ------------------------------------------------------------
features = [
    "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "time_signature"
]
X = bb[features].copy()
y = bb["Genre"].copy()

# Label-encode the target into 0 … n-1 integers
le   = LabelEncoder()
y_le = le.fit_transform(y)

# Train/test with **stratification** so every genre is represented
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y_le, test_size=0.20, random_state=42, stratify=y_le
)

# ------------------------------------------------------------
# 1.  PIPELINE
#     • Min-Max scaling → puts every feature on 0-to-1
#     • RandomOverSampler → clones minority-class samples
#     • RandomForest (≥ 300 trees) with class_weight *inside* each bootstrap
# ------------------------------------------------------------
scaler = MinMaxScaler()

clf    = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced_subsample",   # extra help for rare genres
    n_jobs=-1,
    random_state=42
)

# --- fit scaler on *train* only -------------------------------
X_tr_scaled = scaler.fit_transform(X_tr)
X_te_scaled = scaler.transform(X_te)

X_tr_scaled_df = pd.DataFrame(X_tr_scaled, columns=features)


# --- balance only the training split --------------------------
def simple_oversample(X, y, random_state=42):
    """Return an array where every class matches the size of the majority class."""
    rng   = np.random.RandomState(random_state)
    dfs   = []
    Xy    = pd.concat([X, pd.Series(y, name="y")], axis=1)
    max_n = Xy["y"].value_counts().max()

    for cls, group in Xy.groupby("y"):
        dfs.append(
            resample(
                group,
                replace=True,
                n_samples=max_n,
                random_state=rng,
            )
        )
    up = pd.concat(dfs, axis=0).sample(frac=1, random_state=rng)  # shuffle
    return up[features].values, up["y"].values

X_bal, y_bal = simple_oversample(X_tr_scaled_df, y_tr)

# --- train -----------------------------------------------------
clf.fit(X_bal, y_bal)

        danceability    energy       key  loudness  mode  speechiness  \
0           0.538540  0.845447  0.818182  0.839715   0.0     0.046429   
1           0.541582  0.931756  0.090909  0.805415   1.0     0.050866   
2           0.900609  0.578491  0.090909  0.670426   0.0     0.066234   
3           0.811359  0.664800  0.181818  0.798061   1.0     0.035065   
4           0.406694  0.638706  0.727273  0.712872   0.0     0.076082   
...              ...       ...       ...       ...   ...          ...   
613677      0.444219  0.349671  0.363636  0.410969   1.0     0.041126   
613678      0.626775  0.755123  0.636364  0.757891   0.0     0.048810   
613679      0.735294  0.488167  0.727273  0.727911   0.0     0.039827   
613680      0.978702  0.450031  0.000000  0.422315   1.0     0.256494   
613681      0.473631  0.939784  0.181818  0.672240   1.0     0.036688   

        acousticness  instrumentalness  liveness   valence     tempo  \
0           0.091813          0.000000  0.018831  0

KeyboardInterrupt: 

### 2 Evaluation – overall & per-class accuracy

In [16]:
y_pred = clf.predict(X_te_scaled)

# Overall accuracy
overall_acc = accuracy_score(y_te, y_pred)
print(f"Overall accuracy: {overall_acc:.3f}\n")

# Per-class precision / recall / f1 (accuracy ≈ recall here)
print("Per-class metrics:")
print(
    classification_report(
        y_te,
        y_pred,
        target_names=le.classes_,
        digits=3,
        zero_division=0   # avoids warnings for classes with no test samples
    )
)

# A quick per-class *accuracy* table
cm = confusion_matrix(y_te, y_pred, labels=np.arange(len(le.classes_)))
per_class_acc = cm.diagonal() / cm.sum(axis=1)
acc_table = pd.DataFrame({
    "Genre": le.classes_,
    "Accuracy": per_class_acc.round(3)
}).sort_values("Accuracy", ascending=False)

display(acc_table)


  


AttributeError: 'list' object has no attribute 'take'