In [9]:
# %pip install numpy pandas scikit-learn xgboost matplotlib seaborn
# %pip install joblib

# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import os

# For data preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import PredefinedSplit

# For regression models
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# For evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from category_encoders import TargetEncoder


In [10]:
df = pd.read_csv('spotify_tracks_dataset_processed.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,73,230666,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,55,149610,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,57,210826,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,71,201933,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,82,198853,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [11]:
# Create a new target column for classification
df['popularity_class'] = df['popularity'].apply(lambda x: 0 if x <= 70 else 1)

# Separate features and target for classification
X = df.drop(['popularity', 'popularity_class'], axis=1)  # Do not use popularity for training
y = df[['popularity', 'popularity_class']]  # Target variable for (Classification + Regresssion)
y1 = df['popularity_class'] # Target variable for Classification

# Train-test split for just classification
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)

# Target encoding for 'track_genre'
encoder = TargetEncoder(cols=['track_genre'])
X_train['track_genre'] = encoder.fit_transform(X_train['track_genre'], y1_train)
X_test['track_genre'] = encoder.transform(X_test['track_genre'])

# Drop 'Unnamed: 0' column
X_train.drop(['Unnamed: 0'], axis=1, inplace=True)
X_test.drop(['Unnamed: 0'], axis=1, inplace=True)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA
pca = PCA(n_components=0.95, random_state=42)  # Retain 95% of the variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Save the PCA model for future use
joblib.dump(pca, 'models/pca_model.joblib')
print("Saved PCA model to models/pca_model.joblib")

# The dataset is now ready for classification model training
# Example:
# best_model.fit(X_train_pca, y_train)
# y_pred = best_model.predict(X_test_pca)

# Evaluate model performance on test set
# Example metrics:
# from sklearn.metrics import classification_report
# print(classification_report(y_test, y_pred))


Saved PCA model to models/pca_model.joblib


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier


# Dictionary of classification models and their respective hyperparameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(random_state=42),
        'params': {
            'penalty': ['l1', 'l2', 'elasticnet', None],
            'C': np.logspace(-4, 4, 20),
            'solver': ['liblinear', 'saga'],
            'max_iter': [100, 200, 500, 1000]
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5],
            'max_features': ['auto', 'sqrt', 'log2', None]
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5],
            'max_features': ['auto', 'sqrt', 'log2']
        }
    },
    'SVC': {
        'model': SVC(random_state=42),
        'params': {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'gamma': ['scale', 'auto']
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'minkowski']
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': np.logspace(-9, -1, 10)
        }
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200, 300, 400, 500],
            'learning_rate': [0.001, 0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5, 6],
            'subsample': [0.6, 0.8, 1.0],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5]
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200, 300, 400, 500],
            'learning_rate': [0.001, 0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5, 6],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'gamma': [0, 0.1, 0.2, 0.3],
            'reg_alpha': [0, 0.1, 0.5],
            'reg_lambda': [1, 1.5, 2.0]
        }
    },
    'MLPClassifier': {
        'model': MLPClassifier(random_state=42),
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 100)],
            'activation': ['tanh', 'relu'],
            'solver': ['sgd', 'adam'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'adaptive']
        }
    },
    'BaggingClassifier': {
        'model': BaggingClassifier(random_state=42),
        'params': {
            'n_estimators': [10, 20, 30, 50, 100],
            'max_samples': [0.5, 0.7, 1.0],
            'max_features': [0.5, 0.7, 1.0],
            'bootstrap': [True, False],
            'bootstrap_features': [True, False]
        }
    },
    'VotingClassifier': {
        'model': VotingClassifier(estimators=[]),
        'params': {
            'voting': ['hard', 'soft'],
            'weights': [[1, 1, 1], [1, 2, 1], [2, 1, 1]]
        }
    }
}


In [None]:
# -----------------------------
# 4. Hyperparameter Tuning and Model Evaluation for Classification
# -----------------------------

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Initialize a list to store summary results for classification
classification_summary_results = []

# Create directories to store cross-validation results, models, and plots if they don't exist
os.makedirs('cv_results_classification', exist_ok=True)
os.makedirs('models_classification', exist_ok=True)
os.makedirs('plots_classification', exist_ok=True)

# Iterate over each classification model for hyperparameter tuning
for model_name, mp in models.items():
    print(f"\n\nProcessing {model_name}...\n")
    model = mp['model']
    params = mp['params']
    
    # Initialize RandomizedSearchCV for classification models
    n_iter = 600  # Adjust based on available computational power and time
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=n_iter,
        scoring='accuracy',  # Primary metric for classification
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1,
    )
    
    # Fit the model with hyperparameter tuning
    try:
        search.fit(X_train_pca, y1_train)
    except Exception as e:
        print(f"Error during fitting {model_name}: {e}")
        continue  # Skip to the next model if there's an error
    
    # Save cross-validation results to CSV
    cv_results = pd.DataFrame(search.cv_results_)
    cv_results.to_csv(f'cv_results_classification/{model_name}_cv_results.csv', index=False)
    print(f"Saved cross-validation results for {model_name} to cv_results_classification/{model_name}_cv_results.csv")
    
    # Best estimator after tuning
    best_model = search.best_estimator_
    print(f"Best Parameters for {model_name}: {search.best_params_}")
    
    # Save the best model
    joblib.dump(best_model, f"models_classification/best_{model_name}.joblib")
    print(f"Saved the best {model_name} model to models_classification/best_{model_name}.joblib")
    
    # Predict on training and testing data
    y_train_pred = best_model.predict(X_train_pca)
    y_test_pred = best_model.predict(X_test_pca)
    
    # Calculate evaluation metrics for classification
    train_accuracy = accuracy_score(y1_train, y_train_pred)
    train_precision = precision_score(y1_train, y_train_pred, average='weighted', zero_division=0)
    train_recall = recall_score(y1_train, y_train_pred, average='weighted', zero_division=0)
    train_f1 = f1_score(y1_train, y_train_pred, average='weighted', zero_division=0)
    
    test_accuracy = accuracy_score(y1_test, y_test_pred)
    test_precision = precision_score(y1_test, y_test_pred, average='weighted', zero_division=0)
    test_recall = recall_score(y1_test, y_test_pred, average='weighted', zero_division=0)
    test_f1 = f1_score(y1_test, y_test_pred, average='weighted', zero_division=0)
    
    # Append the results to the summary list
    classification_summary_results.append({
        'Model': model_name,
        'Best Parameters': search.best_params_,
        'Train Accuracy': train_accuracy,
        'Train Precision': train_precision,
        'Train Recall': train_recall,
        'Train F1 Score': train_f1,
        'Test Accuracy': test_accuracy,
        'Test Precision': test_precision,
        'Test Recall': test_recall,
        'Test F1 Score': test_f1,
    })
    
    print(f"Evaluation Metrics for {model_name}:")
    print(f"Train Accuracy: {train_accuracy:.4f}, Train Precision: {train_precision:.4f}, Train Recall: {train_recall:.4f}, Train F1 Score: {train_f1:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1 Score: {test_f1:.4f}")




Processing LogisticRegression...

Fitting 5 folds for each of 600 candidates, totalling 3000 fits




Saved cross-validation results for LogisticRegression to cv_results_classification/LogisticRegression_cv_results.csv
Best Parameters for LogisticRegression: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 200, 'C': np.float64(0.0018329807108324356)}
Saved the best LogisticRegression model to models_classification/best_LogisticRegression.joblib
Evaluation Metrics for LogisticRegression:
Train Accuracy: 0.9685, Train Precision: 0.9540, Train Recall: 0.9685, Train F1 Score: 0.9543
Test Accuracy: 0.9673, Test Precision: 0.9543, Test Recall: 0.9673, Test F1 Score: 0.9525


Processing DecisionTreeClassifier...

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Saved cross-validation results for DecisionTreeClassifier to cv_results_classification/DecisionTreeClassifier_cv_results.csv
Best Parameters for DecisionTreeClassifier: {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'entropy'}
Saved the best DecisionTreeClassifie

In [None]:
# -----------------------------
# 5. Save Summary of All Models
# -----------------------------

# Save the summary results to a file
classification_summary_df = pd.DataFrame(classification_summary_results)
classification_summary_df.to_csv('classification_summary_results.csv', index=False)
print("\nSaved classification summary results to classification_summary_results.csv")

# Save the best results to a separate file
best_classification_results = classification_summary_df.sort_values(by='Test Accuracy', ascending=False).head(1)
best_classification_results.to_csv('best_classification_results.csv', index=False)
print("\nSaved best classification result to best_classification_results.csv")

In [None]:
# -----------------------------
# 6. (Optional) Visualization of Results
# -----------------------------

# Example: Plot Test CE by Model
plt.figure(figsize=(12, 6))
sns.barplot(x='Model', y='Test Accuracy', data=results_df)
plt.xticks(rotation=45)
plt.title('Test Accuracy by Model')
plt.tight_layout()
plt.savefig('plots/Test_Accuracy_by_Model.png')
plt.show()

In [None]:
# Example: Plot Test R² by Model
plt.figure(figsize=(12, 6))
sns.barplot(x='Model', y='Test Precision', data=results_df)
plt.xticks(rotation=45)
plt.title('Test Precision by Model')
plt.tight_layout()
plt.savefig('plots/Test_Precision_by_Model.png')
plt.show()

In [None]:
# Example: Plot Test R² by Model
plt.figure(figsize=(12, 6))
sns.barplot(x='Model', y='Test Recall', data=results_df)
plt.xticks(rotation=45)
plt.title('Test Recall by Model')
plt.tight_layout()
plt.savefig('plots/Test_Recall_by_Model.png')
plt.show()

In [None]:
# Example: Plot Test R² by Model
plt.figure(figsize=(12, 6))
sns.barplot(x='Model', y='Test F1 Score', data=results_df)
plt.xticks(rotation=45)
plt.title('TTest F1 Score by Model')
plt.tight_layout()
plt.savefig('plots/Test_F1_Score_by_Model.png')
plt.show()

In [None]:
# -----------------------------
# 7. Now based on the classification results, create a new dataframe and store on the rows corrwesponding to the predicted popularity class = 1
# -----------------------------

# Convert X_test_pca to a DataFrame if it's not already
X_test_pca_df = pd.DataFrame(X_test_pca)

# Create a DataFrame with predicted and actual labels
test_results = X_test_pca_df.copy()
test_results['Predicted_Class'] = y_test_pred
test_results['Actual_Class'] = y1_test.values  # Ensure y1_test is aligned as a Series
test_results['Actual_Popularity'] = popularity.values  # Ensure popularity is aligned as a Series

# Filter rows where the predicted class is 1
filtered_results = test_results[test_results['Predicted_Class'] == 1]

# Save or use the filtered results
print(filtered_results.head())  # Display the first few rows