## GMM clustering

In [3]:
# gmm_model.py
import pandas as pd
import os
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture

# Audio attributes that make up the clustering feature set
features = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

# Resolve the dataset location: climb three directory levels up from the
# current working directory, then descend into data/spotify_cleaned.csv
parent_dir = os.getcwd()
parent_dir = os.path.dirname(parent_dir)  # one level up
parent_dir = os.path.dirname(parent_dir)  # two levels up
parent_dir = os.path.dirname(parent_dir)  # three levels up
file_path = os.path.join(parent_dir, "data", "spotify_cleaned.csv")

# Read the whole CSV into memory
train_data = pd.read_csv(file_path)



In [4]:
# Work on a 40% random sample of the rows; the fixed seed keeps the sample
# (and therefore the fitted model) reproducible between runs
train_data = train_data.sample(frac=0.4, random_state=42)

# Standardize the feature columns before fitting the mixture model
scaler = StandardScaler()
X = scaler.fit_transform(train_data[features])

# Fit a 10-component Gaussian Mixture Model and store each row's
# predicted component label in a new 'cluster' column
gmm_model = GaussianMixture(n_components=10, random_state=42)
train_data['cluster'] = gmm_model.fit_predict(X)

# Save the scaler and GMM model separately, in <three levels up>/models
model_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..', 'models'))
model_path = os.path.join(model_dir, 'gmm_model.pkl')
scaler_path = os.path.join(model_dir, 'StandardScaler.pkl')

# Make sure the target directory exists; otherwise joblib.dump raises
# FileNotFoundError after the (expensive) fit has already completed
os.makedirs(model_dir, exist_ok=True)

joblib.dump(scaler, scaler_path)
joblib.dump(gmm_model, model_path)

# Save the clustered training data to the current working directory
# NOTE(review): to_csv also writes the DataFrame index as the first column
# (pandas default); pass index=False if downstream readers do not need it
train_data.to_csv("gmm_model_data.csv")
train_data.head(5)

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,cluster
113185,Hillsong Worship,No Other Name,No Other Name,50,440247,False,0.369,0.598,7,-6.984,1,0.0304,0.00511,0.0,0.176,0.0466,148.014,4,world-music,3
42819,Internal Rot,Grieving Birth,Failed Organum,11,93933,False,0.171,0.997,7,-3.586,1,0.118,0.00521,0.801,0.42,0.0294,122.223,4,grindcore,4
59311,Zhoobin Askarieh;Ali Sasha,Noise A Noise 20.4-1,"Save the Trees, Pt. 1",0,213578,False,0.173,0.803,9,-10.071,0,0.144,0.613,0.00191,0.195,0.0887,75.564,3,iranian,8
90416,Billy Fury,A Thousand Stars,It's Only Make Believe,34,146706,False,0.419,0.382,9,-13.438,1,0.0322,0.32,0.0,0.106,0.462,84.185,4,rock-n-roll,3
61000,Nogizaka46,バレッタ TypeD,月の大きさ,57,236293,False,0.555,0.941,9,-3.294,0,0.0481,0.484,0.0,0.266,0.813,92.487,4,j-idol,7
