### Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import sys
import os

# Specify the directory containing your module
module_dir = '/Users/gabrielvictorgomesferreira/artificial_intelligence/isu_classes/modules'

# Add the directory to sys.path if not already present
if module_dir not in sys.path:
    sys.path.append(module_dir)
    
from help_functions import *

# import statsmodels.api as sm
from sklearn.feature_selection import SelectKBest, f_regression, SelectFromModel
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score, accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Lasso, BayesianRidge, LassoCV, ElasticNetCV, RidgeCV, LogisticRegression
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.mixture import GaussianMixture
# from lightgbm import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
# from scipy.linalg import svd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer, f1_score, recall_score

### Config

In [16]:
data_loc = "/Users/gabrielvictorgomesferreira/Library/Mobile Documents/com~apple~CloudDocs/Work/ISU Classes/Data/"
file_name = "spotify_songs.csv"
models_loc = "../../models/"

### Import Dataset

In [24]:
songs = pd.read_csv(data_loc + file_name)
rows, columns = songs.shape
print(f"The dataset contains {rows:,} rows and {columns} columns")
songs.head(1)

The dataset contains 32,833 rows and 23 columns


Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754


### Data Preprocessing

In [25]:
def preprocesse_songs(df):
    df.drop(columns=['playlist_name', 'playlist_id'], inplace=True)
    df.drop_duplicates(subset=['track_id'], inplace=True)
    df = df[(df.duration_ms > df.duration_ms.quantile(0.01))]
    df.dropna(inplace=True)
    df['track_album_release_date'] = standardize_date(df['track_album_release_date'])
    df['release_year'] = df['track_album_release_date'].dt.year
    df = df.drop(columns=['track_album_release_date'])

    return df

def assign_mood(row):
    if row['valence'] > 0.6 and row['energy'] > 0.6:
        return 'Happy'
    elif row['energy'] > 0.6 and row['danceability'] > 0.5:
        return 'Energetic'
    elif row['energy'] <= 0.6 and row['acousticness'] > 0.13:
        return 'Relaxed'
    elif row['valence'] <= 0.4 and row['energy'] <= 0.4:
        return 'Melancholic'
    else:
        return 'Neutral'  # or other fallback label

In [5]:
from sklearn.preprocessing import LabelEncoder

# Label encoder
label_encoder = LabelEncoder()

# Call functions
songs = preprocesse_songs(songs)
songs['mood'] = songs.apply(assign_mood, axis=1)
songs['mood_numeric'] = label_encoder.fit_transform(songs['mood'])
songs.head(1)

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,release_year,mood,mood_numeric
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,2019,Energetic,0


In [21]:
dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))

{'Energetic': 0, 'Happy': 1, 'Melancholic': 2, 'Neutral': 3, 'Relaxed': 4}

### Modeling

#### Input Features

In [18]:
# Input Data
X = songs[['danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo','release_year']]
y = songs['mood_numeric']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)

print(f'Training set dimensions: {X_train.shape}, {y_train.shape}')
print(f'Testing set dimensions: {X_test.shape}, {y_test.shape}')
print("Unique labels in y_train:", (y_train.nunique()))
print("Unique labels in y_test:", (y_test.nunique()))

Training set dimensions: (22454, 12), (22454,)
Testing set dimensions: (5614, 12), (5614,)
Unique labels in y_train: 5
Unique labels in y_test: 5


### Logistic Regression

In [8]:
# Sore model results
results = []

# Scale input features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# Initialize the model
logit_model = LogisticRegression()
logit_model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = logit_model.predict(X_test_scaled)

# Evaluate model
lg_f1_score = f1_score(y_test, y_pred, average='weighted')
lg_recall_score = recall_score(y_test, y_pred, average='weighted')

# Cross validation using 5-fold cross-validation
lg_f1_cv_score = cross_val_score(logit_model, X_train_scaled, y_train, cv=5, scoring='f1_weighted').mean()
lg_recall_cv_score = cross_val_score(logit_model, X_train_scaled, y_train, cv=5, scoring='recall_weighted').mean()

results.append({"test_size": 0.2,
        "lg_f1_score": lg_f1_score,
        "lg_recall_score": lg_recall_score,
        "lg_f1_cv_score": lg_f1_cv_score,
        "lg_recall_cv_score": lg_recall_cv_score})

# Transform results into DF
results_df = pd.DataFrame(results)

# Display
results_df.head()

Unnamed: 0,test_size,lg_f1_score,lg_recall_score,lg_f1_cv_score,lg_recall_cv_score
0,0.2,0.812623,0.815283,0.823507,0.825287


In [9]:
# Applying the Lasso method for feature selection

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create the Lasso model and SelectFromModel
lasso = Lasso(alpha=0.01, random_state=38)
selector = SelectFromModel(lasso)

# Fit the model to the scaled data
selector.fit(X_scaled, y)

# Transform the features to select the important ones
X_selected = selector.transform(X_scaled)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=38)

# Get the selected features
selected_features_mask = selector.get_support() 
selected_features = X.columns[selected_features_mask]

# Get the fitted Lasso model and coefficients
lasso_model = selector.estimator_
lasso_coefficients = np.abs(lasso_model.coef_)

# Rank the features based on their absolute coefficients
ranking = np.argsort(lasso_coefficients)[::-1]  # Sort in descending order

# Create a DataFrame to display feature names along with their coefficients
feature_importance_df = pd.DataFrame({
    'Feature': X.columns[ranking],
    'Coefficient': lasso_coefficients[ranking]
})

# Filter to show only features with coefficients greater than 0.02
non_zero_features_df = feature_importance_df[feature_importance_df['Coefficient'] > 0.02]

# Display the DataFrame of ranked features and their coefficients
print("Ranked Features with by Coefficients:")
print(non_zero_features_df)

Ranked Features with by Coefficients:
            Feature  Coefficient
0            energy     0.998232
1      danceability     0.433179
2           valence     0.154090
3      acousticness     0.112739
4       speechiness     0.039312
5      release_year     0.032062
6  instrumentalness     0.024416


In [10]:
# Split data into features (X) and target variable (y)
X = X[selected_features]

# Initialize lists to store the metrics
results = []

# Define models
models = {
    # "NaiveBayes": GaussianNB(),
    # "MLP": MLPClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(), 
    # "LightGBM": LGBMClassifier(verbose=-1), 
    "SVC": SVC(probability=True), 
    "RandomForest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    # "QDA": QuadraticDiscriminantAnalysis(),
    "GradientBoosting": GradientBoostingClassifier()
}

# Test set sizes to iterate over
test_sizes = [0.3]
i=0
# Iterate over each test size
for test_size in test_sizes:
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=45)

    # Scaling the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)  
    X_test_scaled = scaler.transform(X_test)

    # Iterate over each model
    for model_name, model in models.items():
        
        # Train the model on the scaled training set
        model.fit(X_train_scaled, y_train)

        # Predictions on training set
        y_train_pred = model.predict(X_train_scaled)

        # Predictions on test set
        y_test_pred = model.predict(X_test_scaled)

        # Accuracy Scores for training and test sets
        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)
        
        # Cross-validation using Accuracy Scores
        cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
        cv_accuracy_mean = cv_scores.mean()  
        cv_accuracy_std = cv_scores.std()    

        # Store the results in a dictionary
        results.append({
            'Model': model_name,
            'Test_Size': test_size,
            'Train_Accuracy': round(train_acc,4),
            'Test_Accuracy': round(test_acc,4),
            'CV_Accuracy_Mean': round(cv_accuracy_mean,4),
            'CV_Accuracy_Std': round(cv_accuracy_std,4)
        })

        print(results[i])
        i+=1

# Create a DataFrame to display the metrics
metrics_df = pd.DataFrame(results)

# Display the metrics sorted by Test Accuracy
metrics_df.sort_values(by='Test_Accuracy', ascending=False, inplace=True)

# Show the resulting DataFrame
metrics_df

{'Model': 'AdaBoost', 'Test_Size': 0.3, 'Train_Accuracy': 0.4367, 'Test_Accuracy': 0.425, 'CV_Accuracy_Mean': 0.4367, 'CV_Accuracy_Std': 0.0046}
{'Model': 'LinearDiscriminantAnalysis', 'Test_Size': 0.3, 'Train_Accuracy': 0.8103, 'Test_Accuracy': 0.8184, 'CV_Accuracy_Mean': 0.8099, 'CV_Accuracy_Std': 0.0029}
{'Model': 'SVC', 'Test_Size': 0.3, 'Train_Accuracy': 0.9727, 'Test_Accuracy': 0.9629, 'CV_Accuracy_Mean': 0.9609, 'CV_Accuracy_Std': 0.0021}
{'Model': 'RandomForest', 'Test_Size': 0.3, 'Train_Accuracy': 1.0, 'Test_Accuracy': 0.9996, 'CV_Accuracy_Mean': 0.9995, 'CV_Accuracy_Std': 0.0004}
{'Model': 'KNN', 'Test_Size': 0.3, 'Train_Accuracy': 0.9366, 'Test_Accuracy': 0.8969, 'CV_Accuracy_Mean': 0.8868, 'CV_Accuracy_Std': 0.0035}
{'Model': 'GradientBoosting', 'Test_Size': 0.3, 'Train_Accuracy': 1.0, 'Test_Accuracy': 1.0, 'CV_Accuracy_Mean': 0.9999, 'CV_Accuracy_Std': 0.0001}


Unnamed: 0,Model,Test_Size,Train_Accuracy,Test_Accuracy,CV_Accuracy_Mean,CV_Accuracy_Std
5,GradientBoosting,0.3,1.0,1.0,0.9999,0.0001
3,RandomForest,0.3,1.0,0.9996,0.9995,0.0004
2,SVC,0.3,0.9727,0.9629,0.9609,0.0021
4,KNN,0.3,0.9366,0.8969,0.8868,0.0035
1,LinearDiscriminantAnalysis,0.3,0.8103,0.8184,0.8099,0.0029
0,AdaBoost,0.3,0.4367,0.425,0.4367,0.0046


### Select the best model

In [19]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)

# Create and fit the gb model
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

# Make predictions
gb_pred = gb_model.predict(X_test)
gb_pred_train = gb_model.predict(X_train)

# Evaluate the model with original categorical values
gb_accuracy = accuracy_score(y_test, gb_pred)
precision = precision_score(y_test, gb_pred, average='weighted')
recall = recall_score(y_test, gb_pred, average='weighted')
f1 = f1_score(y_test, gb_pred, average='weighted')
gb_clf_report_original = classification_report(y_test, gb_pred)

# Display results
print('Best Model - gb Test Scores:')
print(f'Accuracy: {gb_accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(' ')
print(gb_clf_report_original)

Best Model - gb Test Scores:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2857
           1       1.00      1.00      1.00      2529
           2       1.00      1.00      1.00        42
           3       1.00      1.00      1.00      1486
           4       1.00      1.00      1.00      1507

    accuracy                           1.00      8421
   macro avg       1.00      1.00      1.00      8421
weighted avg       1.00      1.00      1.00      8421



### Save Trained Model

In [20]:
import pickle

# Define the file path
mood_encoder_path = models_loc + "mood_encoder_model.pkl"
mood_gbmodel_path = models_loc + "mood_gb_model.pkl"


# Save the model
with open(mood_encoder_path, 'wb') as file:
    pickle.dump(label_encoder, file)

with open(mood_gbmodel_path, 'wb') as file:
    pickle.dump(gb_model, file)

### Clustering

In [27]:
from sklearn.cluster import KMeans

In [28]:
data_loc = "/Users/gabrielvictorgomesferreira/Library/Mobile Documents/com~apple~CloudDocs/Work/ISU Classes/Data/"
file_name = "spotify_songs.csv"
models_loc = "../../models/"

In [29]:
songs = pd.read_csv(data_loc + file_name)
rows, columns = songs.shape
print(f"The dataset contains {rows:,} rows and {columns} columns")
songs.head(1)

The dataset contains 32,833 rows and 23 columns


Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754


In [30]:
def preprocesse_songs(df):
    df.drop(columns=['playlist_name', 'playlist_id'], inplace=True)
    df.drop_duplicates(subset=['track_name','track_artist'], inplace=True)
    df = df[(df.duration_ms > df.duration_ms.quantile(0.01))]
    df.dropna(inplace=True)
    df['track_album_release_date'] = standardize_date(df['track_album_release_date'])
    df['release_year']  = df['track_album_release_date'].dt.year
    df = df.drop(columns=['track_album_release_date'])
    encoder = LabelEncoder()
    df['track_artist_label'] = encoder.fit_transform(df['track_artist'])
    df['track_album_id_label'] = encoder.fit_transform(df['track_album_id'])
    df['artist_track'] = df.apply(lambda x: f"{x['track_artist']} - {x['track_name']}", axis=1)
 
    return df

In [32]:
songs = preprocesse_songs(songs)
clustering_data =  songs[['danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo','track_artist_label','release_year']]

In [33]:
kmeans = KMeans(n_clusters=8)
songs.loc[:, 'kmeans_labels'] = kmeans.fit_predict(clustering_data)
clustering_data.loc[:, 'kmeans_labels'] = kmeans.fit_predict(clustering_data)


python(37309) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [34]:
clustering_data

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_artist_label,release_year,kmeans_labels
0,0.748,0.916,6,-2.634,1,0.0583,0.102000,0.000000,0.0653,0.5180,122.036,2759,2019,3
1,0.726,0.815,11,-4.969,1,0.0373,0.072400,0.004210,0.3570,0.6930,99.972,6036,2019,0
2,0.675,0.931,1,-3.432,0,0.0742,0.079400,0.000023,0.1100,0.6130,124.008,10350,2019,5
3,0.718,0.930,7,-3.778,1,0.1020,0.028700,0.000009,0.2040,0.2770,121.956,9153,2019,2
4,0.650,0.833,1,-4.672,1,0.0359,0.080300,0.000000,0.0833,0.7250,123.976,5361,2019,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32828,0.428,0.922,2,-1.814,1,0.0936,0.076600,0.000000,0.0668,0.2100,128.170,5680,2014,0
32829,0.522,0.786,0,-4.462,1,0.0420,0.001710,0.004270,0.3750,0.4000,128.041,9040,2013,2
32830,0.529,0.821,6,-4.899,0,0.0481,0.108000,0.000001,0.1500,0.4360,127.989,8688,2014,2
32831,0.626,0.888,2,-3.361,1,0.1090,0.007920,0.127000,0.3430,0.3080,128.008,6092,2014,0


In [35]:
import pickle

# Define the file path
kmeans_path = models_loc + "kmeans_model.pkl"


# Save the model
with open(kmeans_path, 'wb') as file:
    pickle.dump(kmeans, file)