In [1]:
# Import Dependencies
import os
import numpy as np
import pandas as pd
import utils as utils
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import and read the data
df_raw = pd.read_csv("data/top_10000_1960-now.csv")
df_raw.head()

Unnamed: 0,Track URI,Track Name,Artist URI(s),Artist Name(s),Album URI,Album Name,Album Artist URI(s),Album Artist Name(s),Album Release Date,Album Image URL,...,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Album Genres,Label,Copyrights
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,Justified & Ancient - Stand by the Jams,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,spotify:album:4MC0ZjNtVP1nDD5lsLxFjc,Songs Collection,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,1992-08-03,https://i.scdn.co/image/ab67616d0000b27355346b...,...,0.048,0.0158,0.112,0.408,0.504,111.458,4.0,,Jams Communications,"C 1992 Copyright Control, P 1992 Jams Communic..."
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,I Know You Want Me (Calle Ocho),spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,spotify:album:5xLAcbvbSAlRtPXnKkggXA,Pitbull Starring In Rebelution,spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,2009-10-23,https://i.scdn.co/image/ab67616d0000b27326d73a...,...,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0,,Mr.305/Polo Grounds Music/J Records,"P (P) 2009 RCA/JIVE Label Group, a unit of Son..."
2,spotify:track:70XtWbcVZcpaOddJftMcVi,From the Bottom of My Broken Heart,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,spotify:album:3WNxdumkSMGMJRhEgK80qx,...Baby One More Time (Digital Deluxe Version),spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,1999-01-12,https://i.scdn.co/image/ab67616d0000b2738e4986...,...,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0,,Jive,P (P) 1999 Zomba Recording LLC
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,Apeman - 2014 Remastered Version,spotify:artist:1SQRv42e4PjEYfPhS0Tk9E,The Kinks,spotify:album:6lL6HugNEN4Vlc8sj0Zcse,"Lola vs. Powerman and the Moneygoround, Pt. On...",spotify:artist:1SQRv42e4PjEYfPhS0Tk9E,The Kinks,2014-10-20,https://i.scdn.co/image/ab67616d0000b2731e7c53...,...,0.259,0.568,5.1e-05,0.0384,0.833,75.311,4.0,,Sanctuary Records,"C © 2014 Sanctuary Records Group Ltd., a BMG C..."
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,You Can't Always Get What You Want,spotify:artist:22bE4uQ6baNwSHPVcDxLCe,The Rolling Stones,spotify:album:0c78nsgqX6VfniSNWIxwoD,Let It Bleed,spotify:artist:22bE4uQ6baNwSHPVcDxLCe,The Rolling Stones,1969-12-05,https://i.scdn.co/image/ab67616d0000b27373d927...,...,0.0687,0.675,7.3e-05,0.289,0.497,85.818,4.0,,Universal Music Group,"C © 2002 ABKCO Music & Records Inc., P ℗ 2002 ..."


In [3]:
# list columns for features and target
df_raw.columns

Index(['Track URI', 'Track Name', 'Artist URI(s)', 'Artist Name(s)',
       'Album URI', 'Album Name', 'Album Artist URI(s)',
       'Album Artist Name(s)', 'Album Release Date', 'Album Image URL',
       'Disc Number', 'Track Number', 'Track Duration (ms)',
       'Track Preview URL', 'Explicit', 'Popularity', 'ISRC', 'Added By',
       'Added At', 'Artist Genres', 'Danceability', 'Energy', 'Key',
       'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Time Signature', 'Album Genres',
       'Label', 'Copyrights'],
      dtype='object')

In [4]:
# Drop unnecessary columns
# all columns listed, columns to keep are commented out.
df_data = df_raw.drop([#'Track URI',
                       'Track Name',
                       'Artist URI(s)',
                       'Artist Name(s)',
                       'Album URI',
                       'Album Name',
                       'Album Artist URI(s)',
                       'Album Artist Name(s)',
                       'Album Release Date',
                       'Album Image URL',
                       'Disc Number',
                       'Track Number',
                       'Track Duration (ms)',
                       'Track Preview URL',
                       #'Explicit',
                       'Popularity',
                       'ISRC',
                       'Added By',
                       'Added At',
                       #'Artist Genres',
                       #'Danceability',
                       #'Energy',
                       #'Key',
                       'Loudness',
                       'Mode',
                       #'Speechiness',
                       #'Acousticness',
                       #'Instrumentalness',
                       #'Liveness',
                       #'Valence',
                       #'Tempo',
                       #'Time Signature',
                       'Album Genres',
                       'Label',
                       'Copyrights'],
                       axis=1)

In [5]:
# Review remaining column names
df_data.columns 

Index(['Track URI', 'Explicit', 'Artist Genres', 'Danceability', 'Energy',
       'Key', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Time Signature'],
      dtype='object')

In [6]:
# Columns renamed to follow convention
df_data = df_data.rename(columns={
                   'Track URI': 'track_uri',
                   'Album Image URL': 'image',
                   'Explicit': 'explicit',
                   'Popularity': 'popularity',
                   'Artist Genres': 'artist_genres',
                   'Danceability': 'danceability',
                   'Energy': 'energy',
                   'Key': 'key',
                   'Speechiness': 'speechiness',
                   'Acousticness': 'acousticness',
                   'Instrumentalness': 'instrumentalness',
                   'Liveness': 'liveness',
                   'Valence': 'valence',
                   'Tempo': 'tempo',
                   'Time Signature': 'time_signature'
       })


In [7]:
# Verify Update
df_data.columns

Index(['track_uri', 'explicit', 'artist_genres', 'danceability', 'energy',
       'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature'],
      dtype='object')

In [8]:
# Dropping null columns
df_data = df_data.dropna(how="any")

In [9]:
# Reset index on dataframe
df_data = df_data.reset_index(drop=True)


In [10]:
# utils.plot_correlation_heatmap(df_data)

In [11]:
# utils.plot_numeric_distributions(df_data)

#### Cleaning and encoding the ['Artist Genres'] column

In [12]:
## Cleaning and encoding the 'artist genres' column
# Explore the values
df_data['artist_genres'].value_counts()

artist_genres
dance pop,pop                                                       254
australian rock                                                     243
pop                                                                 229
australian pop,australian talent show                                86
australian pop                                                       73
                                                                   ... 
australian dance,australian pop,nyc rap                               1
edm,house,indietronica,uk dance,art pop,metropopolis,nz pop,pop       1
uk contemporary r&b,uk pop,new jersey underground rap,trap queen      1
classic country pop,country,country rock,soft rock                    1
disco house,vocal house,dance pop,europop,new wave pop                1
Name: count, Length: 2815, dtype: int64

In [13]:
## Cleaning and encoding the 'artist genres' column
# how many unique genre combos are there?
# Explore the values
df_data['artist_genres'].nunique()

2815

In [14]:
## Cleaning and encoding the 'artist genres' column
# Add a space after any commas if one is not already present
df_data['artist_genres'] = df_data['artist_genres'].str.replace(
    r',(?=\S)', ', ', regex=True
    )


In [15]:
## Cleaning and encoding the 'artist genres' column
# Verify spaces added
df_data['artist_genres'].value_counts()

artist_genres
dance pop, pop                                                            254
australian rock                                                           243
pop                                                                       229
australian pop, australian talent show                                     86
australian pop                                                             73
                                                                         ... 
australian dance, australian pop, nyc rap                                   1
edm, house, indietronica, uk dance, art pop, metropopolis, nz pop, pop      1
uk contemporary r&b, uk pop, new jersey underground rap, trap queen         1
classic country pop, country, country rock, soft rock                       1
disco house, vocal house, dance pop, europop, new wave pop                  1
Name: count, Length: 2815, dtype: int64

In [16]:
## Cleaning and encoding the 'artist genres' column
# replace spaces with and underscore where a letter character is on either side
df_data['artist_genres'] = df_data['artist_genres'].str.replace(
    r'(?<=[a-zA-Z]) (?=[a-zA-Z])', '_', regex=True
    )

In [17]:
## Cleaning and encoding the 'artist genres' column
# Verify underscores inserted
df_data['artist_genres'].value_counts()

artist_genres
dance_pop, pop                                                            254
australian_rock                                                           243
pop                                                                       229
australian_pop, australian_talent_show                                     86
australian_pop                                                             73
                                                                         ... 
australian_dance, australian_pop, nyc_rap                                   1
edm, house, indietronica, uk_dance, art_pop, metropopolis, nz_pop, pop      1
uk_contemporary_r&b, uk_pop, new_jersey_underground_rap, trap_queen         1
classic_country_pop, country, country_rock, soft_rock                       1
disco_house, vocal_house, dance_pop, europop, new_wave_pop                  1
Name: count, Length: 2815, dtype: int64

In [18]:
## Cleaning and encoding the 'artist genres' column
# Split the ['artist_genres'] stings into lists
df_data['artist_genres'] = df_data['artist_genres'].str.split(', ')

In [19]:
df_data.head()

Unnamed: 0,track_uri,explicit,artist_genres,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,False,"[acid_house, ambient_house, big_beat, hip_house]",0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458,4.0
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,False,"[dance_pop, miami_hip_hop, pop]",0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0
2,spotify:track:70XtWbcVZcpaOddJftMcVi,False,"[dance_pop, pop]",0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,False,"[album_rock, art_rock, british_invasion, class...",0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311,4.0
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,False,"[album_rock, british_invasion, classic_rock, r...",0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818,4.0


In [20]:
## Cleaning and encoding the 'artist genres' column
# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [21]:
## Cleaning and encoding the 'artist genres' column
# fit and transform 'Artist Genres' column
encoded_genres = mlb.fit_transform(df_data['artist_genres'])
df_encoded_genres = pd.DataFrame(encoded_genres)
df_encoded_genres.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,898,899,900,901,902,903,904,905,906,907
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
## Cleaning and encoding the 'artist genres' column
# concatenate back into the original DataFrame
df_encoded = pd.concat([df_data.drop(columns=['artist_genres']), df_encoded_genres], axis=1)

# Handle missing values (if any)
#df_encoded.fillna(0, inplace=True)


In [23]:
# Encode the ['explicit'] column
df_encoded['explicit'] = df_encoded['explicit'].map({True: 1, False: 0})
df_encoded.head()

Unnamed: 0,track_uri,explicit,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,...,898,899,900,901,902,903,904,905,906,907
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,0,0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,...,0,0,0,0,0,0,0,0,0,0
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,0,0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,...,0,0,0,0,0,0,0,0,0,0
2,spotify:track:70XtWbcVZcpaOddJftMcVi,0,0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,...,0,0,0,0,0,0,0,0,0,0
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,0,0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,...,0,0,0,0,0,0,0,0,0,0
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,0,0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,...,0,0,0,0,0,0,0,0,0,0


In [24]:
df_encoded.dtypes

track_uri        object
explicit          int64
danceability    float64
energy          float64
key             float64
                 ...   
903               int32
904               int32
905               int32
906               int32
907               int32
Length: 920, dtype: object

In [25]:
# Create features dataframe
# Set column names as strings
df_x = df_encoded.drop(columns='track_uri')
df_x.columns = df_x.columns.astype(str)

In [30]:
df_data.head()

Unnamed: 0,track_uri,explicit,artist_genres,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,False,"[acid_house, ambient_house, big_beat, hip_house]",0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458,4.0
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,False,"[dance_pop, miami_hip_hop, pop]",0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0
2,spotify:track:70XtWbcVZcpaOddJftMcVi,False,"[dance_pop, pop]",0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,False,"[album_rock, art_rock, british_invasion, class...",0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311,4.0
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,False,"[album_rock, british_invasion, classic_rock, r...",0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818,4.0


In [34]:
# Running pca without genres column
# # Scale data with Standard Scaler
scaler = StandardScaler()

df_test = df_data.drop(columns=['track_uri', 'explicit', 'artist_genres'])

#scaled_data = scaler.fit_transform(df_test)

# call PCA
pca = PCA(n_components=2)

# fit and apply
genres_pca = pca.fit_transform(df_test)

# Create DataFrame with PCA results
# genres_pca_df = pd.DataFrame(
#     genres_pca,
#     columns=['genre_pca_1',
#              'genre_pca_2'
#              ])

genres_pca_df

Unnamed: 0,genre_pca_1,genre_pca_2
0,0.946568,-0.865272
1,1.805102,0.610497
2,-0.587550,1.457076
3,0.455224,1.826989
4,-2.100471,-0.153940
...,...,...
9444,0.033262,-0.672429
9445,1.549117,-0.230894
9446,1.239010,-0.323031
9447,0.811548,-1.373733


In [35]:
pca.explained_variance_ratio_

array([0.98140791, 0.01820349])

In [27]:
# Scale data with Standard Scaler
scaler = StandardScaler()

scaled_data = scaler.fit_transform(df_x)

# call PCA
pca = PCA(n_components=20)

# fit and apply
genres_pca = pca.fit_transform(scaled_data)

# Create DataFrame with PCA results
genres_pca_df = pd.DataFrame(
    genres_pca,
    columns=['genre_pca_1',
             'genre_pca_2',
             'genre_pca_3',
             'genre_pca_4',
             'genre_pca_5',
             'genre_pca_6',
             'genre_pca_7',
             'genre_pca_8',
             'genre_pca_9',
             'genre_pca_10',
             'genre_pca_11',
             'genre_pca_12',
             'genre_pca_13',
             'genre_pca_14',
             'genre_pca_15',
             'genre_pca_16',
             'genre_pca_17',
             'genre_pca_18',
             'genre_pca_19',
             'genre_pca_20'
             ])

genres_pca_df

Unnamed: 0,genre_pca_1,genre_pca_2,genre_pca_3,genre_pca_4,genre_pca_5,genre_pca_6,genre_pca_7,genre_pca_8,genre_pca_9,genre_pca_10,genre_pca_11,genre_pca_12,genre_pca_13,genre_pca_14,genre_pca_15,genre_pca_16,genre_pca_17,genre_pca_18,genre_pca_19,genre_pca_20
0,-0.392433,-2.151775,-0.129742,-0.247454,-1.247140,1.099648,-0.534957,-0.266260,-0.304624,0.953635,0.379765,0.166742,-2.957952,-0.174637,2.409179,-1.107043,-2.770851,-3.287700,-2.306844,0.934316
1,-1.658729,-0.024572,-0.036570,-0.100449,-0.281529,-0.139698,-0.370680,-0.209656,0.298801,-0.147761,0.118259,0.017860,-0.613220,0.133350,-0.236727,0.064304,-0.535948,-0.318632,0.574316,-0.706447
2,-0.626919,-0.462173,-0.200438,0.285767,0.052729,-0.091732,-0.759085,-0.067279,0.063422,-0.569329,-0.016208,-0.096880,-0.270644,0.100545,-0.194888,0.048647,-0.345947,0.058273,0.471527,-0.383449
3,6.196573,5.623548,1.605647,-2.039624,1.993895,0.429693,0.739134,0.092709,0.597412,-0.712488,-0.092005,-0.181286,1.151414,0.160241,1.490883,-0.288724,-0.898651,-1.230617,0.446767,-0.780633
4,2.855155,1.507154,0.365705,-0.488822,0.398825,0.109090,0.345616,0.174464,0.171802,-0.556628,-0.112830,-0.106583,0.549787,0.029087,0.254279,-0.036693,-0.215754,-0.173873,0.162557,-0.305057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9444,-2.052801,-3.371932,2.059204,-3.739789,3.715892,-0.103684,0.082756,-0.013483,-0.139989,0.132338,-0.078751,-0.028160,0.428314,0.184362,-0.565549,0.161724,-0.966830,-0.078158,1.102119,-2.428606
9445,-0.687546,-1.437333,0.469151,-1.071023,0.565443,-0.153396,-0.047364,-0.138300,-0.133746,0.471333,0.087676,0.068830,-0.364784,0.133066,0.029833,-0.027050,-0.300316,-0.223075,-0.097410,-0.000540
9446,-0.892368,-1.759512,0.644882,-1.352215,0.477992,-0.115533,-0.230244,-0.387529,-0.269076,1.588013,0.377534,0.297807,-2.121838,0.010540,0.252684,-0.180520,-1.180866,-0.993585,-0.550750,-0.805724
9447,-5.820127,-6.289800,6.764660,-11.772134,12.542165,0.141643,3.004481,0.503178,-0.338018,2.257952,-0.432456,0.299126,4.984940,0.625584,-1.919030,0.814614,3.156314,3.104148,-0.917857,3.901049


In [28]:
pca.explained_variance_ratio_

array([0.00759061, 0.00597436, 0.00559223, 0.00553696, 0.00531186,
       0.00496896, 0.00475971, 0.00459946, 0.0045389 , 0.00438235,
       0.00435556, 0.00435299, 0.00426923, 0.00408657, 0.00406792,
       0.00396636, 0.00390044, 0.00386659, 0.00375386, 0.00364939])

In [29]:
# Sum the explained variance ratios
total_explained_variance = pca.explained_variance_ratio_.sum()
# Print the total explained variance
print(f"Total Explained Variance: {total_explained_variance}")

Total Explained Variance: 0.09352432095015713


In [None]:
# # Select only numeric columns for modeling
# numeric_features = df_cleaned.select_dtypes(include=['float64', 'int64']).columns

# # Create feature matrix X
# X = df_cleaned[numeric_features]

# # Optional: Create new features
# # Example: Combining features or creating ratios
# X['energy_valence_ratio'] = X['energy'] / X['valence']

In [None]:
# # Remove the problematic energy_valence_ratio column if it exists
# if 'energy_valence_ratio' in X.columns:
#     X = X.drop('energy_valence_ratio', axis=1)

# # Create the ratio feature with handling for zero values
# X['energy_valence_ratio'] = X['energy'] / X['valence'].replace(0, 0.0001)  # Replace zeros with small value

# # Now scale the features
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
# # PCA for dimensionality reduction
# pca = PCA(n_components=0.95)  # Keep 95% of variance
# X_pca = pca.fit_transform(X_scaled)

# # Or t-SNE for non-linear dimensionality reduction
# tsne = TSNE(n_components=2, random_state=42)
# X_tsne = tsne.fit_transform(X_scaled)

In [None]:
# Using IQR method to detect outliers
def remove_outliers(df, columns):
    df_clean = df.copy()
    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
    return df_clean

# Apply outlier removal
X_no_outliers = remove_outliers(X_scaled, X_scaled.columns)

In [None]:
#unique_genres = set(df_cleaned['Artist Genres'].str.split(',').explode().value_counts())
unique_genres = df_cleaned['artist__genres'].str.split(',').explode().value_counts()
print(len(unique_genres))
print(unique_genres.head(20))

In [None]:
# Looking at the error message and available columns, we see that 'Album Genres' doesn't exist
# Let's use 'artist__genres' instead since we already have that data

# Get genre counts from the already exploded artist__genres
genre_counts = df_cleaned['artist__genres'].value_counts()

# Select top N genres (e.g., top 20)
top_n_genres = 20
top_genres = genre_counts.head(top_n_genres).index

# Create dummies only for top genres
genre_dummies = pd.get_dummies(
    df_cleaned['artist__genres'].where(df_cleaned['artist__genres'].isin(top_genres), 'other'),
    prefix='genre'
)

# Group by index and join with original dataframe
genre_dummies = genre_dummies.groupby(df_cleaned.index).sum()
df_with_top_genres = pd.concat([df_cleaned, genre_dummies], axis=1)

# No need to drop 'Album Genres' since it doesn't exist
# df_with_top_genres = df_with_top_genres.drop('Album Genres', axis=1)

print("\nShape with top genres only:", df_with_top_genres.shape)
print("\nTop genre columns:", genre_dummies.columns.tolist())

In [None]:
df_with_top_genres.info()

In [None]:
df_with_top_genres.head()

In [None]:
# Test the models
# Unsupervised models K-means, Gaussian 

In [None]:
# visualize model accuracy
# the elbow thing
# mushroom pizza
# 

In [None]:
# Build a GUI to display input/output
#
# 