# Data Collection

One goal of this project is to work with the Spotify API and collect data on my own playlists.

In [3]:
# import important libraries
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import math
import time
import pandas as pd

Safely load credentials to access the API so I have a connection.

In [2]:
# load environment variables with python-dotenv before the credentials are read
load_dotenv()

True

In [3]:
# build the OAuth manager first; client id, secret and redirect URI are read
# from environment variables rather than written into the notebook
spotify_auth = SpotifyOAuth(
    scope="playlist-read-private playlist-read-collaborative",
    cache_path=".spotify_cache",
    client_id=os.getenv("SPOTIFY_CLIENT_ID"),
    client_secret=os.getenv("SPOTIFY_SECRET_ID"),
    redirect_uri=os.getenv("SPOTIFY_REDIRECT_URI"),
)

# client object used for every API call in this notebook
sp = spotipy.Spotify(auth_manager=spotify_auth)

Gather some data regarding my playlists. 

In [4]:
# first page of my playlists; the response itself is kept because it also
# carries the overall 'total'
all_playlists = sp.current_user_playlists()
all_playlists_items = list(all_playlists['items'])

# the endpoint is paginated, so follow the 'next' links to pick up any
# playlists that did not fit on the first page
playlists_page = all_playlists
while playlists_page and playlists_page.get('next'):
    playlists_page = sp.next(playlists_page)
    if playlists_page:
        all_playlists_items.extend(playlists_page['items'])

Define a function to grab tracks from one playlist.

In [5]:
def get_all_tracks(playlist_id, playlist_name, playlist_owner=None, batch_size=100, pause=0.5):
    """Fetch the available tracks of one playlist as a list of flat dicts.

    The first full-size page doubles as the metadata request: its 'total'
    field tells how many more pages are needed, so no separate throwaway
    request is made just to count the tracks.

    Args:
        playlist_id: Spotify id of the playlist to read.
        playlist_name: Name copied into every row and used in the messages.
        playlist_owner: Optional owner id copied into every row.
        batch_size: Number of tracks requested per API call (at least 1).
        pause: Seconds to sleep after every page request, failed or not.

    Returns:
        A list with one dict per track holding the keys 'playlist_id',
        'playlist', 'playlist_owner', 'track_id', 'track_name',
        'track_duration', 'explicit', 'artist' (list of artist names),
        'album' and 'added_at'. Items without a track or without a track id
        are skipped. An empty list is returned when the first page cannot be
        retrieved; a later page that fails is reported and skipped.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # define empty list to store data in
    all_tracks = []

    # the first page is real data and also reports the playlist's total size
    try:
        first_page = sp.playlist_tracks(playlist_id, limit=batch_size, offset=0)
        total_tracks = first_page['total']
    except spotipy.SpotifyException as e:
        print(f"Failed to retrieve playlist metadata for '{playlist_name}': {e}")
        return []

    # calculate number of batches to do since each batch receives up to batch_size tracks
    num_batches = math.ceil(total_tracks / batch_size)

    for batch in range(num_batches):

        if batch == 0:
            # already fetched above
            track_page = first_page
        else:
            # tells which track the previous batch left off on
            offset = batch * batch_size
            try:
                track_page = sp.playlist_tracks(playlist_id, limit=batch_size, offset=offset)
            except spotipy.SpotifyException as e:
                print(f"Error retrieving batch {batch + 1} of playlist '{playlist_name}': {e}")
                # still pause, so a failing API is not hit again immediately
                time.sleep(pause)
                continue

        for item in track_page['items']:

            # ensure loop does not stop if track is not available
            track = item.get('track')
            if not track or not track.get('id'):
                continue

            all_tracks.append({
                'playlist_id': playlist_id,
                'playlist': playlist_name,
                'playlist_owner': playlist_owner,
                'track_id': track['id'],
                'track_name': track['name'],
                'track_duration': track['duration_ms'],
                'explicit': track['explicit'],
                'artist': [artist['name'] for artist in track['artists']],
                'album': track['album']['name'],
                'added_at': item['added_at']
            })

        # shows progress after every page; counts requested positions, so
        # skipped (unavailable) items are included in the number
        processed = min((batch + 1) * batch_size, total_tracks)
        print(f"[{playlist_name}] Processed {processed} / {total_tracks} tracks")

        time.sleep(pause)

    return all_tracks

In [6]:
# number of playlists reported in the 'total' field of the API response
total_playlists = all_playlists['total']
total_playlists

14

In [7]:
# accumulator for the track rows collected from every playlist
tracks_from_all_playlists = []

In [8]:
# use function now to loop over all my playlists
# start from an empty list so that re-running this cell replaces the results
# instead of appending every track a second time
tracks_from_all_playlists = []

for playlist in all_playlists_items:
    playlist_id = playlist['id']
    playlist_name = playlist['name']
    playlist_owner = playlist['owner']['id']

    print(f"Fetching Playlist: {playlist_name}")
    tracks_from_all_playlists.extend(get_all_tracks(playlist_id, playlist_name, playlist_owner))

    # short pause between playlists to go easy on the API
    time.sleep(0.5)

    print("Moving onto Next Playlist...\n")

print(f"\nSuccess!!! All {len(tracks_from_all_playlists)} Tracks Have Been Extracted.")

Fetching Plaliyst: CHOW MEIN CHICKEN FRIED RICE EGG ROLL BANGERS ONLYYYYYY
[CHOW MEIN CHICKEN FRIED RICE EGG ROLL BANGERS ONLYYYYYY] Processed 23 / 23 tracks
Moving onto Next Playlist...

Fetching Plaliyst: Eeee deeee emmmmm
[Eeee deeee emmmmm] Processed 12 / 12 tracks
Moving onto Next Playlist...

Fetching Plaliyst: warm soufflé pancakes after yakiniku on a cold winter day
[warm soufflé pancakes after yakiniku on a cold winter day] Processed 100 / 155 tracks
[warm soufflé pancakes after yakiniku on a cold winter day] Processed 155 / 155 tracks
Moving onto Next Playlist...

Fetching Plaliyst: a ham sandwich, apple slices, and a caprisun at lunch time
[a ham sandwich, apple slices, and a caprisun at lunch time] Processed 100 / 162 tracks
[a ham sandwich, apple slices, and a caprisun at lunch time] Processed 162 / 162 tracks
Moving onto Next Playlist...

Fetching Plaliyst: chicken wraps in east los angeles
[chicken wraps in east los angeles] Processed 64 / 64 tracks
Moving onto Next Play

In [9]:
# keep only the rows that come from playlists owned by me
all_tracks = [
    track_row
    for track_row in tracks_from_all_playlists
    if track_row['playlist_owner'] == 'chrishui'
]

In [10]:
# number of track rows left after filtering by playlist owner
len(all_tracks)

901

In [12]:
# turn the list of track dicts into a DataFrame (the name all_tracks is rebound
# from a list to a DataFrame here)
all_tracks = pd.DataFrame(all_tracks)

Rename the playlists to short labels for easier analysis.

In [13]:
# distinct playlist titles present in the collected data
all_tracks['playlist'].unique()

array(['CHOW MEIN CHICKEN FRIED RICE EGG ROLL BANGERS ONLYYYYYY',
       'Eeee deeee emmmmm',
       'warm soufflé pancakes after yakiniku on a cold winter day',
       'a ham sandwich, apple slices, and a caprisun at lunch time',
       'chicken wraps in east los angeles',
       'tv dinners while playing neopets and lizzie mcguirre in the background',
       'cinammon sticks, freshly baked cookies, and nutmeg with your nose burning from the cold',
       'creamy carbonara buldak ramen',
       'baked pork chop with tomato sauce and rice at the hk diner ',
       'beef and broccoli, orange chicken, and burrito bowls at the university dorms',
       "succulent chinese meals served by the rudest 4'11 chinese woman imagineable"],
      dtype=object)

In [14]:
# long playlist title -> short label used in the analysis
# (one title ends with a space on purpose: it matches the playlist's real name)
di = dict([
    ("CHOW MEIN CHICKEN FRIED RICE EGG ROLL BANGERS ONLYYYYYY", "OldChinese"),
    ("Eeee deeee emmmmm", "EDM"),
    ("warm soufflé pancakes after yakiniku on a cold winter day", "Jpop"),
    ("a ham sandwich, apple slices, and a caprisun at lunch time", "Pop"),
    ("chicken wraps in east los angeles", "Rap"),
    ("tv dinners while playing neopets and lizzie mcguirre in the background", "OldPop"),
    ("cinammon sticks, freshly baked cookies, and nutmeg with your nose burning from the cold", "Christmas"),
    ("creamy carbonara buldak ramen", "Kpop"),
    ("baked pork chop with tomato sauce and rice at the hk diner ", "Cpop"),
    ("beef and broccoli, orange chicken, and burrito bowls at the university dorms", "Uni"),
    ("succulent chinese meals served by the rudest 4'11 chinese woman imagineable", "Chinese"),
])

In [15]:
# swap the long playlist titles for their short labels; a title with no entry
# in `di` (including a label written by an earlier run of this cell) is kept
# as it is instead of turning into NaN
all_tracks['playlist'] = all_tracks['playlist'].map(di).fillna(all_tracks['playlist'])

**Please note: I initially intended to use the Spotify API to pull audio features for all tracks in my playlists. After a few failed attempts, I searched online for suggestions and came across this [Spotify Web API update](https://developer.spotify.com/blog/2024-11-27-changes-to-the-web-api). Unfortunately, this means my original plan of using the `sp.audio_features()` endpoint would always fail, since that endpoint was deprecated. Alternatives include the Reccobeats or SoundNet APIs, both of which require either downloading the songs (which may be illegal) or downloading Spotify's 30-second song previews (access to which through the API was also deprecated). The workaround, unfortunately, was to use older datasets that include these audio features, perform an inner join, and hope the sample size is large enough.**

The most comprehensive dataset is found [here](https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset?select=dataset.csv) and was used.

In [17]:
# load the downloaded tracks dataset; dataset.csv is expected in the working directory
tracks_sample = pd.read_csv("dataset.csv")

In [18]:
# list the columns that came with the dataset
tracks_sample.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

In [19]:
# drop the leftover 'Unnamed: 0' index column by name; unlike slicing off the
# first column by position, re-running this cell cannot strip a real column
tracks_sample = tracks_sample.drop(columns='Unnamed: 0', errors='ignore')

In [20]:
# inner join keeps only the playlist tracks that also appear in the dataset.
# the lookup table is reduced to one row per track_id first: a track_id that
# occurs several times on the right side would otherwise duplicate the
# matching playlist rows (the first occurrence is the one kept)
available_tracks = all_tracks.merge(
    tracks_sample.drop_duplicates(subset='track_id'),
    how='inner',
    on='track_id',
)

In [None]:
# (rows, columns) of the joined table
available_tracks.shape

**The resulting dataset gives 431 songs instead of the 901 I planned to initially work with. Although not ideal, this dataset is still rich and diverse enough to work with (hopefully).**

In [21]:
# save the joined table for the later analysis; index=False leaves the row
# index out of the file
available_tracks.to_csv("Data.csv", index = False)