In [11]:
import importlib
import inspect
import logging
import math
import os
import sys
import tempfile
from inspect import getsource

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from mlflow import log_metric, log_param, log_artifact
from sklearn.ensemble import StackingClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import tensorflow as tf


In [2]:
# Make the project's `src` directory importable so the `data`, `train` and
# `predict` packages resolve. The path is relative to the notebook's working
# directory.
_src_dir = os.path.abspath(os.path.join('..', 'src'))
if _src_dir not in sys.path:  # do not stack duplicate entries when the cell is re-run
    sys.path.append(_src_dir)

# Step 1 - import the project modules as module objects so they can be reloaded.
import data.load_data
import data.split_data
import data.clean_data
import data.string_toInt_encoder
import data.oversample_minority
import data.undersample_majority
import data.dummies_encoder
import data.featur_target_pipline
import data.target_binarizer
import data.artist_popularity
import data.balance_simpler

import train.mlflow_experiment_runner

import predict.acuuracy

# Step 2 - reload every module so edits made to the .py files are picked up
# without restarting the kernel. The order matches the original reload order.
# (`%autoreload 2` is the IPython-native alternative to this manual reload.)
for _module in (
    data.load_data,
    data.split_data,
    data.clean_data,
    data.string_toInt_encoder,
    data.oversample_minority,
    data.undersample_majority,
    data.dummies_encoder,
    data.featur_target_pipline,
    data.target_binarizer,
    data.artist_popularity,
    data.balance_simpler,
    train.mlflow_experiment_runner,
    predict.acuuracy,
):
    importlib.reload(_module)

# Step 3 - bind the names used by the notebook only AFTER reloading.
# `importlib.reload` re-executes the module but does not update names that were
# already copied out of it with `from module import name`; importing them before
# the reload would leave the notebook holding the stale, pre-reload objects.
from data.load_data import load_data
from data.split_data import split_data
from data.clean_data import DataCleaner
from data.string_toInt_encoder import StringToIntEncoder
from data.oversample_minority import OversampleMinority
from data.undersample_majority import UndersampleMajority
from data.featur_target_pipline import FeatureTargetPipeline
from data.target_binarizer import TargetBinarizer
from data.dummies_encoder import SafeOneHotEncoder
from data.artist_popularity import ArtistPopularityEncoder
from data.balance_simpler import BalancedResampler

# ModelEvaluator is taken from predict.acuuracy (not train.model_evaluator).
from predict.acuuracy import ModelEvaluator

from train.mlflow_experiment_runner import MLflowExperimentRunner



<module 'predict.acuuracy' from '/Users/level3/mlops_spotify/spotify-1million/src/predict/acuuracy.py'>

In [69]:
# Data directory, relative to the notebook's working directory. It is reused
# further down as the root for the exported CSV files.
data_folder = '../data'
# `load_data` is the project helper imported from data.load_data. According to
# the recorded output of this cell it downloads the dataset from Kaggle and then
# reads the CSV; the returned object is used as a pandas DataFrame below.
df = load_data(data_folder)


Loading data...
Downloading dataset from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks
Download complete.
Loading CSV data...
Data loaded successfully.


In [70]:
# Remove the "Unnamed: 0" column, then separate the rows whose year is 2023
# (kept in `df_2023`) from everything else (kept in `df`).
# NOTE: `df` is rebound here, so re-running this cell without re-running the
# load cell raises KeyError (the column has already been dropped).
df = df.drop(columns="Unnamed: 0")
df_2023 = df.loc[df['year'].eq(2023)]
df = df.loc[df['year'].ne(2023)]



In [71]:
# Write the 2023 rows to <data_folder>/data_2023/data_2023.csv, creating the
# target directory first if it does not exist yet.
_dir_2023 = os.path.join(data_folder, "data_2023")
os.makedirs(_dir_2023, exist_ok=True)
path_2023 = os.path.join(_dir_2023, "data_2023.csv")
df_2023.to_csv(path_2023, index=False)

In [72]:
# Target: the raw popularity column run through the project's TargetBinarizer
# with threshold=50 (presumably 1 for tracks at/above the threshold - confirm
# the exact comparison in data/target_binarizer.py).
target = df['popularity']
_binarizer = TargetBinarizer(threshold=50)
y = _binarizer.fit_transform(target)

# Features: every column except the target.
X = df.drop(columns='popularity')

# `split_data` returns six pieces: train / validation / test for X, then for y.
_splits = split_data(X, y)
X_train, X_val, X_test, y_train, y_val, y_test = _splits

In [73]:
# Sanity check: show the shapes of the test features and test labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(168168, 18)
(168168,)


In [74]:
# Give the test features and labels a fresh positional index, discarding the
# old one, so both can be joined column-wise afterwards.
X_test = X_test.reset_index(drop=True)
# Wrap the labels in a Series named "popularity" before resetting its index.
y_test = pd.Series(y_test, name="popularity")
y_test = y_test.reset_index(drop=True)

In [75]:
# Ensure the labels are a Series named "popularity" (the previous cell already
# wraps them the same way), then append them to the features as the last column.
y_test = pd.Series(y_test, name="popularity")
test_df = pd.concat([X_test, y_test], axis="columns")
test_df

Unnamed: 0,artist_name,track_name,track_id,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,popularity
0,David Arkenstone,Relax And Release,5RlGLoIk2F194gbBb2M146,2019,new-age,0.144,0.11100,2,-23.697,1,0.0386,0.160000,0.885000,0.4770,0.0274,143.354,510520,3,0
1,One True God,Dirty,6FPO395HkAYuM60gB6Crcp,2019,club,0.769,0.84600,1,-2.980,1,0.0419,0.000188,0.257000,0.1100,0.5580,123.998,129677,4,0
2,Mallrat,Better,3wkel51RjTQKywZYA7Knov,2018,electro,0.549,0.70300,1,-5.519,1,0.0265,0.110000,0.000000,0.0957,0.1450,103.949,193924,4,0
3,Armagedda,Likvaka,2CqRSE8NUOWQPeLiIILe8S,2020,black-metal,0.430,0.97300,0,-6.862,1,0.1040,0.000030,0.165000,0.7540,0.0498,119.964,431188,4,0
4,Disfiguring The Goddess,Black Earth Child,1eag1Ij89Mt7iXCIjTCiVA,2013,death-metal,0.406,0.93500,11,-2.953,1,0.1170,0.000230,0.818000,0.3460,0.1990,115.060,176987,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168163,Marcell,Takkan Terganti,0T4t1PywlNmJGcveGH5spB,2011,jazz,0.358,0.27500,5,-8.324,0,0.0277,0.704000,0.000001,0.1100,0.1170,103.581,241763,4,1
168164,Chris Coco,Albatross - Christian J Mix,1PXXiobCAz7IHhQTNBSMnR,2002,chill,0.619,0.93500,6,-10.518,1,0.0406,0.005920,0.754000,0.8740,0.5120,134.422,467023,4,0
168165,Piano Peace,A Clear Mind,2H7IS6HjM5vW9lNjEJEk4a,2018,piano,0.194,0.00346,5,-31.752,1,0.0370,0.990000,0.799000,0.1180,0.0522,69.002,258125,4,0
168166,M. S. Sheela,Ta Tha Tadhimi - Brindavani - Adi,0kt7AfnDYvMm86YQ51MudY,2006,indian,0.567,0.57000,8,-9.597,0,0.0882,0.803000,0.000000,0.1020,0.7800,150.206,196200,4,0


In [76]:
# Write the held-out test set to <data_folder>/test/test.csv, creating the
# target directory first if it does not exist yet.
_test_dir = os.path.join(data_folder, "test")
os.makedirs(_test_dir, exist_ok=True)
test_path = os.path.join(_test_dir, "test.csv")
test_df.to_csv(test_path, index=False)


In [77]:
# Shared preprocessing pipeline that is handed to every experiment configuration.
# NOTE: StandardScaler comes from sklearn.preprocessing and is imported in the
# notebook's import cell at the top. Previously it was only imported in a later
# cell, so this cell raised NameError on a fresh kernel (Restart & Run All).
base_preprocessor = Pipeline(steps=[
    # Project transformer from data.artist_popularity.
    ('artist_encoder', ArtistPopularityEncoder()),
    # Project transformer from data.clean_data.
    ('data_cleaner', DataCleaner()),
    # Project one-hot encoder applied to the listed categorical columns.
    ('encoder', SafeOneHotEncoder(columns=['genre', 'key', 'time_signature'])),
    # scikit-learn standardisation of the resulting feature matrix.
    ('scaler', StandardScaler()),
])

In [78]:
# Example of how to use the MLflowExperimentRunner class with your experiment configuration.
# Every library name used here is imported in the notebook's import cell at the
# top, so this cell no longer re-imports anything.

# 1. Create an instance of the experiment runner.
# The tracking server defaults to the local MLflow instance; set the standard
# MLFLOW_TRACKING_URI environment variable to point the run somewhere else.
runner = MLflowExperimentRunner(
    experiment_name="Spotify_Popularity",
    tracking_uri=os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"),
    evaluator_class=ModelEvaluator,  # Set default evaluator class
)

# 2. Define your experiment configurations (one dict per run).
experiments = [
    {
        "model": XGBClassifier(),
        "sampler": BalancedResampler,  # Your custom resampler class
        "preprocessor": base_preprocessor,  # Your preprocessing pipeline
        "X_train": X_train,
        "y_train": y_train,
        "X_val": X_val,
        "y_val": y_val,
        "model_params": {
            'n_estimators': 300,
            'max_depth': 8,
            'learning_rate': 0.1,
            'scale_pos_weight': 2
        },
        "sampler_params": {},
        "dataset_version": "2024-03-v2",
        # Fixed seed so the logged input example is the same on every run.
        "input_example": X_train.sample(5, random_state=42),
        "metadata": {
            "business_impact": "high",
            "owner": "data-team",
            "description": "XGBoost with class balancing"
        },
        "tags": {
            "stage": "production-candidate",
            "data_source": "spotify-api"
        },
        # "registered_model_name": "SpotifyPopularityClassifier",  # Uncomment to register
        "log_artifacts": True
    },
    # Append further configuration dicts here (for example a
    # RandomForestClassifier baseline) to compare several models in one go.
]

# 3. Run all experiments at once and print the metrics returned for each.
# (A single configuration can also be run with `runner.run_experiment(**exp)`.)
results = runner.run_experiments(experiments)
for run_number, result in enumerate(results, start=1):
    print(f"Experiment {run_number} results: {result}")



🏃 View run unruly-fox-524 at: http://localhost:5000/#/experiments/400003216354571829/runs/f7fa6f8c9dff488c85c579136d328d47
🧪 View experiment at: http://localhost:5000/#/experiments/400003216354571829
Experiment 1 results: {'accuracy': 0.8372460872460873, 'precision': 0.2029771175272553, 'recall': 0.8992834394904459, 'f1': 0.33119929625647543, 'roc_auc': np.float64(0.9401932955160185)}
