In [1]:
# Import Dependencies
import os
import numpy as np
import pandas as pd
import utils as utils
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import and read the data
df_raw = pd.read_csv("data/top_10000_1960-now.csv")
df_raw.head()

Unnamed: 0,Track URI,Track Name,Artist URI(s),Artist Name(s),Album URI,Album Name,Album Artist URI(s),Album Artist Name(s),Album Release Date,Album Image URL,...,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Album Genres,Label,Copyrights
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,Justified & Ancient - Stand by the Jams,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,spotify:album:4MC0ZjNtVP1nDD5lsLxFjc,Songs Collection,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,1992-08-03,https://i.scdn.co/image/ab67616d0000b27355346b...,...,0.048,0.0158,0.112,0.408,0.504,111.458,4.0,,Jams Communications,"C 1992 Copyright Control, P 1992 Jams Communic..."
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,I Know You Want Me (Calle Ocho),spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,spotify:album:5xLAcbvbSAlRtPXnKkggXA,Pitbull Starring In Rebelution,spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,2009-10-23,https://i.scdn.co/image/ab67616d0000b27326d73a...,...,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0,,Mr.305/Polo Grounds Music/J Records,"P (P) 2009 RCA/JIVE Label Group, a unit of Son..."
2,spotify:track:70XtWbcVZcpaOddJftMcVi,From the Bottom of My Broken Heart,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,spotify:album:3WNxdumkSMGMJRhEgK80qx,...Baby One More Time (Digital Deluxe Version),spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,1999-01-12,https://i.scdn.co/image/ab67616d0000b2738e4986...,...,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0,,Jive,P (P) 1999 Zomba Recording LLC
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,Apeman - 2014 Remastered Version,spotify:artist:1SQRv42e4PjEYfPhS0Tk9E,The Kinks,spotify:album:6lL6HugNEN4Vlc8sj0Zcse,"Lola vs. Powerman and the Moneygoround, Pt. On...",spotify:artist:1SQRv42e4PjEYfPhS0Tk9E,The Kinks,2014-10-20,https://i.scdn.co/image/ab67616d0000b2731e7c53...,...,0.259,0.568,5.1e-05,0.0384,0.833,75.311,4.0,,Sanctuary Records,"C © 2014 Sanctuary Records Group Ltd., a BMG C..."
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,You Can't Always Get What You Want,spotify:artist:22bE4uQ6baNwSHPVcDxLCe,The Rolling Stones,spotify:album:0c78nsgqX6VfniSNWIxwoD,Let It Bleed,spotify:artist:22bE4uQ6baNwSHPVcDxLCe,The Rolling Stones,1969-12-05,https://i.scdn.co/image/ab67616d0000b27373d927...,...,0.0687,0.675,7.3e-05,0.289,0.497,85.818,4.0,,Universal Music Group,"C © 2002 ABKCO Music & Records Inc., P ℗ 2002 ..."


In [3]:
# list columns for features and target
df_raw.columns

Index(['Track URI', 'Track Name', 'Artist URI(s)', 'Artist Name(s)',
       'Album URI', 'Album Name', 'Album Artist URI(s)',
       'Album Artist Name(s)', 'Album Release Date', 'Album Image URL',
       'Disc Number', 'Track Number', 'Track Duration (ms)',
       'Track Preview URL', 'Explicit', 'Popularity', 'ISRC', 'Added By',
       'Added At', 'Artist Genres', 'Danceability', 'Energy', 'Key',
       'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Time Signature', 'Album Genres',
       'Label', 'Copyrights'],
      dtype='object')

In [4]:
# Drop unnecessary columns
df_data = df_raw.drop([#'Track URI',
                       'Track Name',
                       'Artist URI(s)',
                       'Artist Name(s)',
                       'Album URI',
                       'Album Name',
                       'Album Artist URI(s)',
                       'Album Artist Name(s)',
                       'Album Release Date',
                       'Album Image URL',
                       'Disc Number',
                       'Track Number',
                       'Track Duration (ms)',
                       'Track Preview URL',
                       #'Explicit',
                       'Popularity',
                       'ISRC',
                       'Added By',
                       'Added At',
                       #'Artist Genres',
                       #'Danceability',
                       #'Energy',
                       #'Key',
                       #'Loudness',
                       'Mode',
                       #'Speechiness',
                       #'Acousticness',
                       #'Instrumentalness',
                       #'Liveness',
                       #'Valence',
                       #'Tempo',
                       #'Time Signature',
                       #'Album Genres',
                       'Label',
                       'Copyrights'],
                       axis=1)

In [5]:
# Review remaining column names
df_data.columns 

Index(['Track URI', 'Explicit', 'Artist Genres', 'Danceability', 'Energy',
       'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Time Signature', 'Album Genres'],
      dtype='object')

In [6]:
# Columns renamed to follow convention
df_data = df_data.rename(columns={
                   'Track URI': 'track_uri',
                   #'Track Name': 'track_name',
                   #'Artist Name(s)': 'artist_name',
                   #'Album Name': 'album_name',
                   #'Album Release Date': 'release_date',
                   'Album Image URL': 'image',
                   'Explicit': 'explicit',
                   'Popularity': 'popularity',
                   'Artist Genres': 'artist_genres',
                   'Danceability': 'danceability',
                   'Energy': 'energy',
                   'Key': 'key',
                   'Speechiness': 'speechiness',
                   'Acousticness': 'acousticness',
                   'Instrumentalness': 'instrumentalness',
                   'Liveness': 'liveness',
                   'Valence': 'valence',
                   'Tempo': 'tempo',
                   'Time Signature': 'time_signature',
                   'Label': 'label'
       })


In [7]:
# Verify Update
df_data.columns

Index(['track_uri', 'explicit', 'artist_genres', 'danceability', 'energy',
       'key', 'Loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'Album Genres'],
      dtype='object')

In [None]:
# Dropping null columns
df_data = df_data.dropna(how="any")

In [None]:
# Reset index on dataframe
df_data = df_data.reset_index()

utils.eda(df_data)

In [None]:
# utils.plot_correlation_heatmap(df_data)

In [None]:
# utils.plot_numeric_distributions(df_data)

#### Cleaning and encoding the ['Artist Genres'] column

In [None]:
## Cleaning and encoding the 'artist genres' column
df_data.columns

In [None]:
## Cleaning and encoding the 'artist genres' column
df_data['artist_genres'].value_counts()

In [None]:
## Cleaning and encoding the 'artist genres' column
# how many unique genre combos are there?
df_data['artist_genres'].nunique()

In [None]:
## Cleaning and encoding the 'artist genres' column
# Add a space after any commas if one is not already present
df_data['artist_genres'] = df_data['artist_genres'].str.replace(
    r',(?=\S)', ', ', regex=True
    )


In [None]:
## Cleaning and encoding the 'artist genres' column
# Verify spaces added
df_data['artist_genres'].value_counts()

In [None]:
## Cleaning and encoding the 'artist genres' column
# replace spaces with and underscore where a letter character is on either side
df_data['artist_genres'] = df_data['artist_genres'].str.replace(
    r'(?<=[a-zA-Z]) (?=[a-zA-Z])', '_', regex=True
    )

In [None]:
## Cleaning and encoding the 'artist genres' column
# Verify underscores inserted
df_data['artist_genres'].value_counts()

In [None]:
## check for any numbers

In [None]:
## Cleaning and encoding the 'artist genres' column
# Split the ['artist_genres'] stings into lists
df_data['artist_genres'] = df_data['artist_genres'].str.split(', ')

In [None]:
## Cleaning and encoding the 'artist genres' column
# count unique values in ['artist_genres']
# Flatten the lists in the 'artist genres' column into a single list
flattened_list = [genre for sublist in df_data['artist_genres'] for genre in sublist]

# Convert the flattened list to a set to get unique values
unique_genres = set(flattened_list)

# Count the number of unique values
unique_genres_count = len(unique_genres)

print(f"Number of unique values in 'artist genres' column: {unique_genres_count}")


In [None]:
## Cleaning and encoding the 'artist genres' column
# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [None]:
## Cleaning and encoding the 'artist genres' column
# fit and transform 'Artist Genres' column
encoded_genres = mlb.fit_transform(df_data['artist_genres'])
encoded_genres_df = pd.DataFrame(encoded_genres)
encoded_genres_df.head()

In [None]:
# reduce dimensionality before concatenating (too many columns to run efficiently later)
# Scale data with Standard Scaler
scaler = StandardScaler()

scaled_data = scaler.fit_transform(encoded_genres_df)

# call PCA
pca = PCA(n_components=5)

# fit and apply
genres_pca = pca.fit_transform(scaled_data)

# Create DataFrame with PCA results
genres_pca_df = pd.DataFrame(
    genres_pca,
    columns=['genre_pca_1', 'genre_pca_2', 'genre_pca_3', 'genre_pca_4', 'genre_pca_5']
)

genres_pca_df

In [None]:
## Cleaning and encoding the 'artist genres' column
# concatinate back into the original DataFrame
df_encoded = pd.concat([df_data.drop(columns=['artist_genres']), genres_pca_df], axis=1)

# Handle missing values (if any)
df_encoded.fillna(0, inplace=True)


In [None]:
utils.eda(df_encoded)

In [None]:
# Drop Extra index column created from Concatenating
df_encoded.drop(columns=['index'])

In [None]:
# Encode the ['Explicit'] column
df_encoded['explicit'] = df_encoded['explicit'].map({True: 1, False: 0})
df_encoded.head()

In [None]:
# Check column datatype

print(df_encoded['explicit'].dtype)

In [None]:
df_encoded.columns

In [None]:
# Drop all non-feature columns
df_encoded = df_encoded.drop(columns=['track_name',
                                      'artist_name',
                                      'album_name',
                                      'release_date',
                                      'image'],
                                      axis=1)


In [None]:
## Reduce the 'artist_genres' dimensionality with PCA
# create a reference list for the columns
df_encoded.columns

In [None]:
df_cleaned.head(5)

In [None]:
# Select only numeric columns for modeling
numeric_features = df_cleaned.select_dtypes(include=['float64', 'int64']).columns

# Create feature matrix X
X = df_cleaned[numeric_features]

# Optional: Create new features
# Example: Combining features or creating ratios
X['energy_valence_ratio'] = X['Energy'] / X['Valence']

In [None]:
X.head()

In [None]:
# Remove the problematic energy_valence_ratio column if it exists
if 'energy_valence_ratio' in X.columns:
    X = X.drop('energy_valence_ratio', axis=1)

# Create the ratio feature with handling for zero values
X['energy_valence_ratio'] = X['Energy'] / X['Valence'].replace(0, 0.0001)  # Replace zeros with small value

# Now scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
# PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Keep 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Or t-SNE for non-linear dimensionality reduction
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

In [None]:
# Using IQR method to detect outliers
def remove_outliers(df, columns):
    df_clean = df.copy()
    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
    return df_clean

# Apply outlier removal
X_no_outliers = remove_outliers(X_scaled, X_scaled.columns)

In [None]:
df_cleaned.head()

In [None]:
#unique_genres = set(df_cleaned['Artist Genres'].str.split(',').explode().value_counts())
unique_genres = df_cleaned['Artist Genres'].str.split(',').explode().value_counts()
print(len(unique_genres))
print(unique_genres.head(20))

In [None]:
# Looking at the error message and available columns, we see that 'Album Genres' doesn't exist
# Let's use 'Artist Genres' instead since we already have that data

# Get genre counts from the already exploded Artist Genres
genre_counts = df_cleaned['Artist Genres'].value_counts()

# Select top N genres (e.g., top 20)
top_n_genres = 20
top_genres = genre_counts.head(top_n_genres).index

# Create dummies only for top genres
genre_dummies = pd.get_dummies(
    df_cleaned['Artist Genres'].where(df_cleaned['Artist Genres'].isin(top_genres), 'other'),
    prefix='genre'
)

# Group by index and join with original dataframe
genre_dummies = genre_dummies.groupby(df_cleaned.index).sum()
df_with_top_genres = pd.concat([df_cleaned, genre_dummies], axis=1)

# No need to drop 'Album Genres' since it doesn't exist
# df_with_top_genres = df_with_top_genres.drop('Album Genres', axis=1)

print("\nShape with top genres only:", df_with_top_genres.shape)
print("\nTop genre columns:", genre_dummies.columns.tolist())

In [None]:
df_with_top_genres.info()

In [None]:
df_with_top_genres.head()

In [None]:
# Test the models
# Unsupervised models K-means, Gaussian 

In [None]:
# visualize model accuracy
# the elbow thing
# mushroom pizza
# 

In [None]:
# Build a GUI to display input/output
#
# 