## Spotify Genre Encoding

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
import os
import pandas as pd
import time
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer
import ast

# Load variables from a .env file (python-dotenv) so the Spotify credentials
# are available through the process environment instead of being hardcoded.
load_dotenv()
client_id = os.environ.get('SPOTIFY_CLIENT_ID')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')

# Build the credentials manager and hand it to the Spotify API client.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


In [2]:
# Read the track metadata; malformed CSV lines are skipped instead of
# aborting the load.
tracks_df = pd.read_csv(
    '../data/4_spotify_million_tracks.csv',
    on_bad_lines='skip',
)
tracks_df.head()

Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,album_cover,genres
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008-03-14,14,150040,False,https://i.scdn.co/image/ab67616d0000b2739e6b95...,"['chanson', 'french pop', 'french rock', 'nouv..."
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004-03-21,1,253000,False,https://i.scdn.co/image/ab67616d0000b27398d445...,"['chanson', 'french pop']"
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011-02-01,3,240400,False,https://i.scdn.co/image/ab67616d0000b27353a906...,['medieval']
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007-06-12,1,138760,True,https://i.scdn.co/image/ab67616d0000b273e6d949...,"['canadian metal', 'canadian post-hardcore', '..."
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022-10-07,0,199986,False,https://i.scdn.co/image/ab67616d0000b27349ea4d...,['pops orchestra']


In [3]:
# Inspect the raw genres column: at this point each value is still text read
# from the CSV (or NaN when no genres are present), not a Python list.
tracks_df['genres']

0       ['chanson', 'french pop', 'french rock', 'nouv...
1                               ['chanson', 'french pop']
2                                            ['medieval']
3       ['canadian metal', 'canadian post-hardcore', '...
4                                      ['pops orchestra']
                              ...                        
9206                                                  NaN
9207             ['irish rock', 'permanent wave', 'rock']
9208                                                  NaN
9209    ['manguebeat', 'nova mpb', 'nova musica pernam...
9210                                                  NaN
Name: genres, Length: 9211, dtype: object

In [4]:
# Turn each text-encoded genre list into a real Python list; tracks without
# genres (NaN) get an empty list.
# NOTE(review): this cell rewrites tracks_df in place and is not safe to run
# twice in the same kernel (genres is already parsed and the genre columns are
# already attached) — restart the kernel before re-running.
tracks_df['genres'] = tracks_df['genres'].apply(
    lambda raw: [] if pd.isna(raw) else ast.literal_eval(str(raw))
)

# Multi-hot encode the lists: one 0/1 indicator column per distinct genre,
# aligned on the same index as tracks_df.
mlb = MultiLabelBinarizer()
genres_encoded = pd.DataFrame(
    mlb.fit_transform(tracks_df['genres']),
    index=tracks_df.index,
    columns=mlb.classes_,
)

# Append the indicator columns to the right of the track metadata.
tracks_df = pd.concat([tracks_df, genres_encoded], axis="columns")

# Quick summary of what the encoder produced.
print("Number of unique genres:", genres_encoded.shape[1])
print("\nFirst few genre columns:", genres_encoded.columns[:10].tolist())
print("\nSample of encoded data:")
print(genres_encoded.head())

Number of unique genres: 1693

First few genre columns: ['21st century classical', 'abstract', 'abstract hip hop', 'accordeon', 'accordion', 'acid house', 'acid jazz', 'acid rock', 'acid trance', 'acousmatic']

Sample of encoded data:
   21st century classical  abstract  abstract hip hop  accordeon  accordion  \
0                       0         0                 0          0          0   
1                       0         0                 0          0          0   
2                       0         0                 0          0          0   
3                       0         0                 0          0          0   
4                       0         0                 0          0          0   

   acid house  acid jazz  acid rock  acid trance  acousmatic  ...  yacht rock  \
0           0          0          0            0           0  ...           0   
1           0          0          0            0           0  ...           0   
2           0          0          0            

In [5]:
# Sanity-check the encoding: report how many songs carry each genre.
genre_columns = genres_encoded.columns

# Each indicator column holds 0/1, so its column sum is the number of songs
# tagged with that genre.
for genre, songs_in_genre in genres_encoded.sum().items():
    print(f"{genre}: {songs_in_genre} songs")


21st century classical: 3 songs
abstract: 1 songs
abstract hip hop: 27 songs
accordeon: 5 songs
accordion: 14 songs
acid house: 12 songs
acid jazz: 4 songs
acid rock: 10 songs
acid trance: 1 songs
acousmatic: 10 songs
acoustic blues: 136 songs
acoustic pop: 9 songs
acoustic rock: 1 songs
adult standards: 78 songs
african-american classical: 1 songs
afro-cuban percussion: 26 songs
afrobeat: 1 songs
afrobeat fusion: 1 songs
afrobeats: 1 songs
afropop: 19 songs
aggrotech: 8 songs
alaska indie: 1 songs
alberta country: 1 songs
album rock: 200 songs
alternative americana: 6 songs
alternative country: 19 songs
alternative dance: 31 songs
alternative hip hop: 60 songs
alternative metal: 152 songs
alternative metalcore: 6 songs
alternative pop: 1 songs
alternative r&b: 2 songs
alternative rock: 275 songs
alternative roots rock: 12 songs
ambient: 1 songs
ambient black metal: 1 songs
ambient dub: 5 songs
ambient idm: 3 songs
ambient psychill: 8 songs
ambient worship: 3 songs
american choir: 2 so

In [6]:

# Spot-check a single genre by listing the tracks flagged with it.
# The first genre column serves as the example; assign any other genre column
# name to genre_to_check to inspect a different one.
print("\nExample of songs in a specific genre:")
genre_to_check = genre_columns[0]
songs_in_genre = tracks_df.loc[
    tracks_df[genre_to_check] == 1,
    ['original_title', 'original_artist', genre_to_check],
]
print(f"\nSongs in '{genre_to_check}' genre:")
print(songs_in_genre.head())



Example of songs in a specific genre:

Songs in '21st century classical' genre:
                   original_title original_artist  21st century classical
3859  Acension Final Chord Rising     Terry Riley                       1
7134                  peace dance     terry riley                       1
8025                  peace dance     terry riley                       1


In [7]:

# Distribution of the number of genres attached to each song: the row sum of
# the 0/1 indicators is that song's genre count.
print("\nNumber of genres per song:")
genre_counts = genres_encoded.sum(axis="columns")
print(genre_counts.value_counts().sort_index())



Number of genres per song:
0     1465
1     1940
2     1835
3     1494
4      890
5      650
6      397
7      181
8      217
9       70
10      32
11      36
12       1
13       3
Name: count, dtype: int64


# Date adjustment

In [8]:
# Preview the first ten raw release_date values before extracting the year.
print("Sample of release_dates:", tracks_df['release_date'].head(10), sep="\n")

Sample of release_dates:
0    2008-03-14
1    2004-03-21
2    2011-02-01
3    2007-06-12
4    2022-10-07
5    1992-09-25
6    2024-09-03
7    1986-11-15
8    1998-07-20
9    2004-06-28
Name: release_date, dtype: object


In [9]:
def extract_year(date_str):
    """Return the release year contained in ``date_str`` as an ``int``.

    Handles a bare year ("1992", 1992, 1992.0), a full date string
    ("2008-03-14") and, as a last resort, any text whose first four
    characters are digits (e.g. a date pandas cannot represent or parse).

    Parameters
    ----------
    date_str : str, int, float, datetime-like or None
        Raw ``release_date`` value.

    Returns
    -------
    int or None
        The year, or ``None`` when the value is missing or no year can be
        recovered from it.
    """
    if pd.isna(date_str):
        return None

    # A bare number handed to pd.to_datetime is read as nanoseconds since the
    # epoch, so 2008.0 would come back as 1970. Treat integral floats as the
    # integer they represent and work on the text form from here on.
    if isinstance(date_str, float) and date_str.is_integer():
        date_str = int(date_str)
    text = str(date_str).strip()

    # Year-only value (exactly four digits).
    if len(text) == 4 and text.isdigit():
        return int(text)

    # Full date: let pandas parse it. Catch Exception rather than using a bare
    # except so KeyboardInterrupt / SystemExit still propagate.
    try:
        parsed = pd.to_datetime(text)
    except Exception:
        parsed = None
    # An unparsable-but-accepted value yields NaT, whose .year is NaN; treat
    # it as "not parsed" so the function only ever returns int or None.
    if parsed is not None and not pd.isna(parsed):
        return int(parsed.year)

    # Fallback: use the leading four characters when they are all digits.
    prefix = text[:4]
    try:
        return int(prefix) if prefix.isdigit() else None
    except ValueError:
        # isdigit() accepts some characters int() rejects (e.g. superscripts).
        return None

In [10]:
# Add release_year (derived from release_date via extract_year) and discard
# the original release_date column in one step.
# NOTE(review): re-running this cell in the same kernel fails because
# release_date no longer exists after the first run.
tracks_df = tracks_df.assign(
    release_year=tracks_df['release_date'].apply(extract_year)
).drop(columns='release_date')

# Show the earliest years and how many tracks fall in each.
year_counts = tracks_df['release_year'].value_counts().sort_index()
print("\nYear distribution:")
print(year_counts.head())


Year distribution:
release_year
1898     1
1900    11
1926     1
1927     1
1928     6
Name: count, dtype: int64


In [11]:
# Inspect the frame after the genre indicator columns were appended and
# release_date was replaced by release_year.
tracks_df

Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,popularity,duration_ms,explicit,album_cover,genres,...,ye ye,yodeling,zarzuela,zilizopendwa,zolo,zouglou,zouk,zouk riddim,zydeco,release_year
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,14,150040,False,https://i.scdn.co/image/ab67616d0000b2739e6b95...,"[chanson, french pop, french rock, nouvelle ch...",...,0,0,0,0,0,0,0,0,0,2008
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,1,253000,False,https://i.scdn.co/image/ab67616d0000b27398d445...,"[chanson, french pop]",...,0,0,0,0,0,0,0,0,0,2004
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,3,240400,False,https://i.scdn.co/image/ab67616d0000b27353a906...,[medieval],...,0,0,0,0,0,0,0,0,0,2011
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,1,138760,True,https://i.scdn.co/image/ab67616d0000b273e6d949...,"[canadian metal, canadian post-hardcore, kings...",...,0,0,0,0,0,0,0,0,0,2007
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",0,199986,False,https://i.scdn.co/image/ab67616d0000b27349ea4d...,[pops orchestra],...,0,0,0,0,0,0,0,0,0,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9206,one about heaven,brent lamb,One About Heaven,Brent Lamb,Reflections Of A Simple Man,0,237160,False,https://i.scdn.co/image/ab67616d0000b273645e58...,[],...,0,0,0,0,0,0,0,0,0,2005
9207,october,u2,October - Remastered 2008,U2,October (Deluxe Edition Remastered),34,141040,False,https://i.scdn.co/image/ab67616d0000b2731a2cc6...,"[irish rock, permanent wave, rock]",...,0,0,0,0,0,0,0,0,0,1981
9208,comin home,zo2,Comin' Home,ZO2,Ain't It Beautiful,0,346520,False,https://i.scdn.co/image/ab67616d0000b273e309c4...,[],...,0,0,0,0,0,0,0,0,0,2007
9209,pode me chamar,eddie,Pode Me Chamar,Banda Eddie,Original Olinda Style,36,238600,False,https://i.scdn.co/image/ab67616d0000b27370f301...,"[manguebeat, nova mpb, nova musica pernambucana]",...,0,0,0,0,0,0,0,0,0,2003


In [12]:
# Persist the full frame (track metadata, parsed genres, genre indicator
# columns, release_year) to CSV; the row index is not written.
tracks_df.to_csv('../data/5_spotify_tracks_encoded.csv', index=False)

## Numeric-only dataset


In [13]:
# Keep only the int64, bool and float64 columns and cast them all to int
# (booleans such as `explicit` become 0/1).
# NOTE(review): astype(int) raises if any selected column contains NaN, e.g.
# release_year when extract_year could not recover a year — confirm the data
# has no such rows.
numeric_df = tracks_df.select_dtypes(include=['int64', 'bool', 'float64']).astype(int)
numeric_columns = numeric_df.columns
numeric_df

Unnamed: 0,popularity,duration_ms,explicit,21st century classical,abstract,abstract hip hop,accordeon,accordion,acid house,acid jazz,...,ye ye,yodeling,zarzuela,zilizopendwa,zolo,zouglou,zouk,zouk riddim,zydeco,release_year
0,14,150040,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2008
1,1,253000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2004
2,3,240400,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2011
3,1,138760,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2007
4,0,199986,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9206,0,237160,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2005
9207,34,141040,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1981
9208,0,346520,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2007
9209,36,238600,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2003


In [14]:
# Persist the integer-only feature table to CSV; the row index is not written.
numeric_df.to_csv('../data/6_spotify_tracks_numeric.csv', index=False)