# Spotify: Popularity Prediction

In [None]:
import pandas as pd

# Load the dataset

df = pd.read_csv('data/Spotify_Songs.csv')

## Preprocessing

### General Info

In [7]:
# General Info
print('Dataset Info:')
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   index             114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  livene

### Irrelevant Features

In [8]:
# How do the textual attributes look like?
print('Textual Info:')
df.describe(include='object')

# Remove "artists", "album_name" and "track_name"  features (KEEP index, track_id)
df = df.drop(columns=['artists', 'album_name', 'track_name', 'index'])

# # Verify that irrelevant features are removed
# print('Dataset Info without Irrelevant Features:')
df.info()

Textual Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          114000 non-null  object 
 1   popularity        114000 non-null  int64  
 2   duration_ms       114000 non-null  int64  
 3   explicit          114000 non-null  bool   
 4   danceability      114000 non-null  float64
 5   energy            114000 non-null  float64
 6   key               114000 non-null  int64  
 7   loudness          114000 non-null  float64
 8   mode              114000 non-null  int64  
 9   speechiness       114000 non-null  float64
 10  acousticness      114000 non-null  float64
 11  instrumentalness  114000 non-null  float64
 12  liveness          114000 non-null  float64
 13  valence           114000 non-null  float64
 14  tempo             114000 non-null  float64
 15  time_signature    114000 non-null  int64  
 16  track_

### Missing Values 

In [9]:
# Remove rows with Missing Values
df = df.dropna()

# # Verify that missing values are removed
# print('Dataset Info without Missing Values:')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          114000 non-null  object 
 1   popularity        114000 non-null  int64  
 2   duration_ms       114000 non-null  int64  
 3   explicit          114000 non-null  bool   
 4   danceability      114000 non-null  float64
 5   energy            114000 non-null  float64
 6   key               114000 non-null  int64  
 7   loudness          114000 non-null  float64
 8   mode              114000 non-null  int64  
 9   speechiness       114000 non-null  float64
 10  acousticness      114000 non-null  float64
 11  instrumentalness  114000 non-null  float64
 12  liveness          114000 non-null  float64
 13  valence           114000 non-null  float64
 14  tempo             114000 non-null  float64
 15  time_signature    114000 non-null  int64  
 16  track_genre       11

### Genre Summarization

In [10]:
# keep original_track_genre
# add new superior_track_genre

# dictionary with key as keywords for super genre mapping
# 15 superior genres -> list of source tags
category_keywords = {
    "Pop": [
        "pop","power-pop","synth-pop","indie-pop","k-pop","j-pop","mandopop","cantopop","j-idol"
    ],
    "Rock": [
        "rock","rock-n-roll","rockabilly","alt-rock","alternative","indie","emo","grunge",
        "psych-rock","punk","punk-rock","hard-rock","goth","garage","j-rock"
    ],
    "Hip-Hop/Rap": [
        "hip-hop"
    ],
    "Electronic/Dance": [
        "electronic","edm","electro","dance","club","house","deep-house","progressive-house",
        "chicago-house","techno","detroit-techno","minimal-techno","trance","hardstyle",
        "drum-and-bass","breakbeat","dubstep","idm","j-dance","ambient","trip-hop","industrial"
    ],
    "Metal": [
        "metal","heavy-metal","black-metal","death-metal","metalcore","grindcore","hardcore"
    ],
    "Latin": [
        "latin","latino","reggaeton","salsa","samba","tango","pagode","forro","mpb","sertanejo"
    ],
    "R&B/Soul/Funk": [
        "r-n-b","soul","funk","gospel","groove"
    ],
    "Jazz/Blues": [
        "jazz","blues"
    ],
    "Country/Americana": [
        "country","honky-tonk","bluegrass"
    ],
    "Folk/Acoustic/Singer-Songwriter": [
        "folk","acoustic","singer-songwriter","songwriter","guitar"
    ],
    "Classical/Opera": [
        "classical","opera","piano"
    ],
    "Reggae/Ska/Dub": [
        "reggae","ska","dub","dancehall"
    ],
    "Soundtrack/Showtunes": [
        "show-tunes","disney","anime","pop-film"
    ],
    "World/International": [
        "world-music","brazil","french","german","spanish","swedish","turkish","malay",
        "iranian","indian","british"
    ],
    "Mood/Functional/Other": [
        "chill","happy","party","romance","sad","sleep","study","children","kids","new-age","comedy"
    ],
}

# Invert to tag -> super-genre map
super_genre_map = {
    tag: cat
    for cat, tags in category_keywords.items()
    for tag in tags
}


### Duplicate Values

In [11]:
# find duplicates in df[track_id]
print("Duplicates track_id", sum(df["track_id"].duplicated()))
print("Duplicates whole dataframe", sum(df.duplicated()))

print("Duplicates track_id & track_genre", sum(df[["track_id", "track_genre"]].duplicated()))

# to this for all combinations to show, that these are real duplicates and not (superficial duplicates as with the track_genre)
print("Duplicates track_id & track_genre & 3rd", sum(df[["track_id", "track_genre", "time_signature"]].duplicated()))

# --> 24259 duplicates in "track_id", 450 in whole dataset, combination of track_id and track_genre again 450
# --> Consequently, 450 true duplicates, rest alternations of track_genre

# find all superior genres for one track_id (a df with columns track_id and superior_track_genre)
print("Unique genres:", df["track_genre"].unique())
print("Number of unique genres: ", len(df["track_genre"].unique()))


Duplicates track_id 24259
Duplicates whole dataframe 450
Duplicates track_id & track_genre 450
Duplicates track_id & track_genre & 3rd 450
Unique genres: ['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive

In [12]:
# df with track_id and track_genre
df_track_id_genre = df[["track_id", "track_genre"]]

# df with track_id, track_genre and genre_super
# Apply (single-tag per row). For unknown tags fall back to World/International or Other.
df_super_long = df_track_id_genre
df_super_long["genre_super"] = df_track_id_genre["track_genre"].map(super_genre_map).fillna("World/International")

# df with track_id and genre_super
df_super_long = df_super_long.drop(columns=["track_genre"])
print(df_super_long.head(5))


                 track_id                      genre_super
0  5SuOikwiRyPMVoIQDJUgSV  Folk/Acoustic/Singer-Songwriter
1  4qPNDBW1i3p13qLCt0Ki3A  Folk/Acoustic/Singer-Songwriter
2  1iJBSr7s7jYXzM8EGcbK5b  Folk/Acoustic/Singer-Songwriter
3  6lfxq3CG4xtTiEg7opyCyx  Folk/Acoustic/Singer-Songwriter
4  5vjLSffimiIP26QG5WcN2K  Folk/Acoustic/Singer-Songwriter


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_super_long["genre_super"] = df_track_id_genre["track_genre"].map(super_genre_map).fillna("World/International")


We decided to not use the OneHotEncoder from the sklearn.preprocessing package since our data is multi-label. OneHotEncoder expects one label per category but we have multiple labels per category (aka track_id). So we would have to explode the genres and then aggregate anyway. Additionally our dataframe is static and we are not building a full ML Pipeline for automated retraining. Hence, we stick with the manual solution with pandas. Regardless the technique, we will have a dataframe with genres as columns and binary values indicating to which genres a song belongs.

In [13]:
# remove exact duplicates
df_super_long = df_super_long.drop_duplicates()

# create the one hot encoding for multi-label as a wide dataframe
df_wide = (
    df_super_long.assign(val=1)
    .pivot_table(index="track_id", columns="genre_super", values="val",
                 aggfunc="max", fill_value=0)
    .astype("int8")
)

df_wide.columns = [f"genre__{c.replace(' ', '_').replace('/', '_')}" for c in df_wide.columns]
df_wide.index.name = "track_id"  # ensure the index has the right name
df_wide = df_wide.reset_index()  # now it becomes a proper column

print(df_wide.head(5))

                 track_id  genre__Classical_Opera  genre__Country_Americana  \
0  0000vdREvCVMxbQTkS888c                       0                         0   
1  000CC8EParg64OmTxVnZ0p                       0                         0   
2  000Iz0K615UepwSJ5z2RE5                       0                         0   
3  000RDCYioLteXcutOjeweY                       0                         0   
4  000qpdoc97IMTBvF8gwcpy                       0                         0   

   genre__Electronic_Dance  genre__Folk_Acoustic_Singer-Songwriter  \
0                        0                                       0   
1                        1                                       0   
2                        1                                       0   
3                        0                                       0   
4                        1                                       0   

   genre__Hip-Hop_Rap  genre__Jazz_Blues  genre__Latin  genre__Metal  \
0                   0           

In [14]:
# join back to original dataframe df
base = df.drop(columns=["track_genre"]).copy()
base = base.drop_duplicates(subset=["track_id"])

# quick sanity check before merging
print("Length base:", len(base))
print("Length df_wide:", len(df_wide))

# merge with how=left to guarantee all base rows will be kept
df_final = base.merge(df_wide, on="track_id", how="left")

print("Length final dataset:", len(df_final))

Length base: 89741
Length df_wide: 89741
Length final dataset: 89741


In [15]:
# Quick Sanity checks
# 1) Uniqueness before merge (protects against row explosion)
assert base["track_id"].is_unique, "base has duplicate track_id"
assert df_wide["track_id"].is_unique, "df_wide has duplicate track_id"

# 2) No unexpected NaNs in the multi-hot columns after merge
genre_cols = [c for c in df_final.columns if c.startswith("genre__")]
na_counts = df_final[genre_cols].isna().sum().sum()
print("NaNs in genre columns:", na_counts)

# If any NaNs (shouldnâ€™t happen if pivot used fill_value=0), fix:
# df_final[genre_cols] = df_final[genre_cols].fillna(0).astype("int8")

# 3) All values are 0/1
bad_vals = set(df_final[genre_cols].stack().unique()) - {0, 1}
print("Non {0,1} values present?:", bool(bad_vals), bad_vals)

# 4) No track has all-zero vector (would indicate unmapped genre)
all_zero_tracks = (df_final[genre_cols].sum(axis=1) == 0).sum()
print("Tracks with no super-genre assigned:", all_zero_tracks)

df_final["amount_genres"] = df_final[genre_cols].sum(axis=1)
print("Maximum of genres a song belonged:", max(df_final["amount_genres"]))

# 5) Basic distribution & sparsity (quick glance)
col_sums = df_final[genre_cols].sum().sort_values(ascending=False)
print(col_sums.head(10))

# 6) Optional: memory/dtype tidy-up
print(df_final[genre_cols].dtypes.unique())
# If needed:
# df_final[genre_cols] = df_final[genre_cols].astype("int8")


NaNs in genre columns: 0
Non {0,1} values present?: False set()
Tracks with no super-genre assigned: 0
Maximum of genres a song belonged: 6
genre__Electronic_Dance         19123
genre__World_International      12894
genre__Rock                     12011
genre__Mood_Functional_Other    10554
genre__Pop                       8742
genre__Latin                     8022
genre__Metal                     6572
genre__R&B_Soul_Funk             4700
genre__Reggae_Ska_Dub            3954
genre__Soundtrack_Showtunes      3916
dtype: int64
[dtype('int8')]


In [16]:
# set df_final to df
df = df_final

### Outliers

In [17]:
# Choose rows with a duration of <1min & >10min, a time signature and tempo of 0
drop_clause = (df['duration_ms'] < 60000) | (df['duration_ms'] > 600000) | (df['time_signature'] == 0) | (df['tempo'] == 0)

# Find the index of that condition in our dataset
drop_index = df[drop_clause].index

# Drop those rows
df = df.drop(drop_index)


# Verify that outliers are removed
print('Dataset without Outliers:')
df.info()

Dataset without Outliers:
<class 'pandas.core.frame.DataFrame'>
Index: 88266 entries, 0 to 89740
Data columns (total 32 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   track_id                                88266 non-null  object 
 1   popularity                              88266 non-null  int64  
 2   duration_ms                             88266 non-null  int64  
 3   explicit                                88266 non-null  bool   
 4   danceability                            88266 non-null  float64
 5   energy                                  88266 non-null  float64
 6   key                                     88266 non-null  int64  
 7   loudness                                88266 non-null  float64
 8   mode                                    88266 non-null  int64  
 9   speechiness                             88266 non-null  float64
 10  acousticness                         