In [None]:
# %pip install numpy pandas scikit-learn xgboost matplotlib seaborn
# %pip install joblib

# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import os

# For data preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import PredefinedSplit

# For regression models
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# For evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from category_encoders import TargetEncoder


In [3]:
df = pd.read_csv('spotify_tracks_dataset_processed.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,73,230666,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,55,149610,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,57,210826,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,71,201933,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,82,198853,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [4]:
# Create a new target column for classification
df['popularity_class'] = df['popularity'].apply(lambda x: 0 if x <= 70 else 1)

# Separate features and target for classification
X = df.drop(['popularity', 'popularity_class'], axis=1)  # Do not use popularity for training
y = df[['popularity', 'popularity_class']]  # Target variable for (Classification + Regresssion)
y1 = df['popularity_class'] # Target variable for Classification

# Train-test split for just classification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y1_train = y_train['popularity_class']
y1_test = y_test['popularity_class']

# Target encoding for 'track_genre'
encoder = TargetEncoder(cols=['track_genre'])
X_train['track_genre'] = encoder.fit_transform(X_train['track_genre'], y1_train)
X_test['track_genre'] = encoder.transform(X_test['track_genre'])

# Drop 'Unnamed: 0' column
X_train.drop(['Unnamed: 0'], axis=1, inplace=True)
X_test.drop(['Unnamed: 0'], axis=1, inplace=True)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA
pca = PCA(n_components=0.95, random_state=42)  # Retain 95% of the variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

joblib.dump(pca, 'pca_model.joblib')

['pca_model.joblib']

In [5]:
X_train_pca.shape

(57556, 13)

In [6]:
y1_test[y1_test == 1]

44015    1
44107    1
36443    1
44058    1
22050    1
        ..
15069    1
33238    1
14981    1
33875    1
2161     1
Name: popularity_class, Length: 472, dtype: int64

In [13]:
from scipy.stats import randint
param_dist = {
    'n_estimators': randint(100, 1000),            # Number of trees between 100 and 1000
    'max_depth': randint(5, 50),                   # Maximum depth between 5 and 50
    'min_samples_split': randint(2, 11),           # Minimum samples split between 2 and 10
    'min_samples_leaf': randint(1, 11),            # Minimum samples leaf between 1 and 10
    'max_features': ['auto', 'sqrt', 'log2'],      # Categorical options
    'class_weight': [{0:1, 1:10}]     # Emphasize class 1 or balanced
}

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
from sklearn.model_selection import StratifiedKFold

# 10. Define cross-validation strategy
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 11. Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=1500,                # Number of parameter settings sampled
    cv=[(slice(None), slice(None))],
    scoring='f1',              # Focused on F1-score for balanced precision and recall
    random_state=42,
    n_jobs=-1,                 # Utilize all available cores
    verbose=2,                 # Verbosity mode
    return_train_score=True
)

# 12. Fit RandomizedSearchCV to the training data
print("\nStarting Randomized Search...")
random_search.fit(X_train_pca, y1_train)

# 13. Retrieve the best parameters and estimator
best_params = random_search.best_params_
print("\nBest Parameters Found:")
print(best_params)

best_rf_classifier = random_search.best_estimator_


Starting Randomized Search...
Fitting 1 folds for each of 500 candidates, totalling 500 fits

Best Parameters Found:
{'class_weight': {0: 1, 1: 10}, 'max_depth': 39, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 589}


In [17]:
from joblib import dump, load

In [18]:
# rf_classifier = RandomForestClassifier(**best_params)

# 4. Train the Model
# rf_classifier.fit(X_train_pca, y1_train)

# 5. Make Predictions
y_train_pred = best_rf_classifier.predict(X_train_pca)
y_test_pred = best_rf_classifier.predict(X_test_pca)
    
# Calculate evaluation metrics for classification
train_accuracy = accuracy_score(y1_train, y_train_pred)
train_precision = precision_score(y1_train, y_train_pred, average='weighted', zero_division=0)
train_recall = recall_score(y1_train, y_train_pred, average='weighted', zero_division=0)
train_f1 = f1_score(y1_train, y_train_pred, average='weighted', zero_division=0)

test_accuracy = accuracy_score(y1_test, y_test_pred)
test_precision = precision_score(y1_test, y_test_pred, average='weighted', zero_division=0)
test_recall = recall_score(y1_test, y_test_pred, average='weighted', zero_division=0)
test_f1 = f1_score(y1_test, y_test_pred, average='weighted', zero_division=0)

print(f"Train Accuracy: {train_accuracy:.4f}, Train Precision: {train_precision:.4f}, Train Recall: {train_recall:.4f}, Train F1 Score: {train_f1:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1 Score: {test_f1:.4f}")


Train Accuracy: 0.9996, Train Precision: 0.9996, Train Recall: 0.9996, Train F1 Score: 0.9996
Test Accuracy: 0.9671, Test Precision: 0.9533, Test Recall: 0.9671, Test F1 Score: 0.9544


In [19]:
dump(best_rf_classifier, 'classifier.joblib')

['classifier.joblib']

In [20]:
len(y_test_pred)

14389

In [21]:
# Convert X_test_pca to a DataFrame if it's not already
X_test_pca_df = pd.DataFrame(X_test_pca)

# Create a DataFrame with predicted and actual labels
test_results = X_test_pca_df.copy()
test_results['Predicted_Class'] = y_test_pred
test_results['Actual_Popularity'] = y_test['popularity'].values

filtered_results = test_results[test_results['Predicted_Class'] == 1]

In [22]:
test_results[test_results['Predicted_Class'] == 1].info()

<class 'pandas.core.frame.DataFrame'>
Index: 55 entries, 143 to 14205
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   0                  55 non-null     float64
 1   1                  55 non-null     float64
 2   2                  55 non-null     float64
 3   3                  55 non-null     float64
 4   4                  55 non-null     float64
 5   5                  55 non-null     float64
 6   6                  55 non-null     float64
 7   7                  55 non-null     float64
 8   8                  55 non-null     float64
 9   9                  55 non-null     float64
 10  10                 55 non-null     float64
 11  11                 55 non-null     float64
 12  12                 55 non-null     float64
 13  Predicted_Class    55 non-null     int64  
 14  Actual_Popularity  55 non-null     int64  
dtypes: float64(13), int64(2)
memory usage: 6.9 KB


In [23]:
filtered_results.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55 entries, 143 to 14205
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   0                  55 non-null     float64
 1   1                  55 non-null     float64
 2   2                  55 non-null     float64
 3   3                  55 non-null     float64
 4   4                  55 non-null     float64
 5   5                  55 non-null     float64
 6   6                  55 non-null     float64
 7   7                  55 non-null     float64
 8   8                  55 non-null     float64
 9   9                  55 non-null     float64
 10  10                 55 non-null     float64
 11  11                 55 non-null     float64
 12  12                 55 non-null     float64
 13  Predicted_Class    55 non-null     int64  
 14  Actual_Popularity  55 non-null     int64  
dtypes: float64(13), int64(2)
memory usage: 6.9 KB


In [32]:
len(filtered_results[filtered_results['Actual_Popularity'] >= 70])

29

In [25]:
def regression_accuracy(y_true, y_pred, tolerance=10):
    """
    Calculates the percentage of predictions within a specified tolerance.
    
    Parameters:
    - y_true: Actual target values
    - y_pred: Predicted target values
    - tolerance: The maximum allowable difference between actual and predicted values
    
    Returns:
    - accuracy_percentage: Percentage of predictions within the tolerance
    """
    correct_predictions = np.abs(y_true - y_pred) <= tolerance
    accuracy_percentage = np.mean(correct_predictions) * 100
    return accuracy_percentage

# Define p_index_analysis function (if it is a custom function)
def p_index_analysis(y_true, y_pred):
    # Avoid division by zero
    non_zero = y_true != 0
    p_index = np.mean(np.abs(y_true[non_zero] - y_pred[non_zero]) / y_true[non_zero]) * 100
    return p_index

In [26]:
X = filtered_results.drop(['Predicted_Class', 'Actual_Popularity'], axis=1)
Y = filtered_results['Actual_Popularity']

In [27]:
reg = load('best_RandomForestRegressor.joblib')

y_pred_final = reg.predict(X)

rmse = np.sqrt(mean_squared_error(Y, y_pred_final))
mae = mean_absolute_error(Y, y_pred_final)
mape = mean_absolute_percentage_error(Y, y_pred_final)
r2 = r2_score(Y, y_pred_final)
accuracy = regression_accuracy(Y, y_pred_final, tolerance=5)
p_index = p_index_analysis(Y, y_pred_final)

print(f"Test RMSE: {rmse:.4f}, Test MAE: {mae:.4f}, Test MAPE: {mape:.2f}%, Test R2: {r2:.4f}, Test Accuracy: {accuracy:.2f}%, Test P Index: {p_index:.2f}")

Test RMSE: 36.3999, Test MAE: 26.5042, Test MAPE: 58333993249787400.00%, Test R2: -0.1717, Test Accuracy: 18.18%, Test P Index: 449.81
