### Collect data from Spotify API using spotipy library

In [1]:
%load_ext autoreload

In [2]:
from cap_package import SpotipyCollect as sc
from dotenv import load_dotenv
import numpy as np
import os
from pathlib import Path
%autoreload 2

Downloading emoji data ...
... OK (Got response in 0.18 seconds)
Writing emoji data to C:\Users\Administrator\.demoji\codes.json ...
... OK


In [3]:
# Load key/value pairs from a local .env file into the process environment so
# that credentials do not have to be hard-coded in the notebook.
load_dotenv()
# Spotify app credentials and account name. os.getenv returns None for any
# variable that is not set, so a missing entry only surfaces where it is used.
CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')
REDIRECT_URI = os.getenv('REDIRECT_URI')
USERNAME = os.getenv('SPOTIFY_USERNAME')
# Authorization scope passed to sc.spotipy_userauth below.
SCOPE = 'playlist-read-private'
# Dataset directory as a string; it is wrapped in pathlib.Path further down.
PATH = os.getenv('PATH_DATASET1.2')

Hierarchy:
- spotipy_userauth
- create_dataset > arg(get_folder_analysis)
                 > get_playlist_analysis 
                 > get_segments > arg(extract_tracks_analysis) > arg(extract_tracks)

  - get_segments > arg(track_analysis), track_analysis_to_df, convert_time

     - track_analysis_to_df > arg(track_analysis) or > extract_track_analysis(arg(track_id))

  - extract_track_analysis > arg(tracksid), spotipy.audio_analysis
  - extract_tracks > arg(playlist_id), spotipy.playlist_tracks
   
- Using USER's playlist: get_pl_details >  playlist_id_url > arg(extract_playlists,)

   
Redundant - tracks_analysis, track_genre

In [4]:
# Get Spotify authorization for USERNAME with the requested SCOPE; the returned
# object `sp` is passed to the sc.* collection calls in the following cells.
sp = sc.spotipy_userauth(USERNAME, SCOPE, CLIENT_ID, CLIENT_SECRET, REDIRECT_URI)

In [5]:
# Get the user's playlist details as four sequences: names, ids, URLs and total
# number of tracks (presumably index-aligned -- confirm in SpotipyCollect).
pl_Name_, pl_ID_, pl_URL_, pltot_Tracks_ = sc.get_pl_details(sp, USERNAME)
# Print only the first three names to keep the output short.
print('Sample playlist names:', pl_Name_[:3])

Sample playlist names: ["Today's Top Hits", 'Deep house', 'Progressive House']


In [6]:
# Filter, sort and choose relevant playlists.
# NOTE(review): start=1, pl_range=18 and the [2:-2] slice (which drops the first
# two and the last two entries of the returned sequence) look tuned to one
# account's playlist list -- confirm before reusing with another user.
filsort_pl = sc.filtersort_playlists(pl_Name_, pl_ID_, pl_URL_, pltot_Tracks_, start=1, pl_range = 18)[2:-2]
print("Let's see a sample:\n")
# Display the first selected entry; this raises IndexError if nothing was selected.
filsort_pl[0]

Let's see a sample:



(50,
 'That familiar trance',
 '3PH2J5HkKhhMoxWj3W0jk8',
 'https://api.spotify.com/v1/playlists/3PH2J5HkKhhMoxWj3W0jk8/tracks')

In [7]:
# Get the audio analysis of all selected playlists.
# The result is indexed by playlist name and then by track key (see the cells below).
# NOTE(review): presumably issues Spotify API requests per track, so this cell
# may be slow to re-run -- confirm.
folder_analysis_dict = sc.get_folder_analysis(sp, filsort_pl)

Within each playlist, tracks are stored under keys that combine the track name with the first 3 characters of the contributing artist's name, joined by an underscore (e.g. `London_Kat`).

In [8]:
# List the track keys collected for the 'Progressive 1' playlist.
folder_analysis_dict['Progressive 1'].keys()

dict_keys(['London_Kat', 'Burns - Lane 8 Club Mix_Geo', 'Boxed Out_Cub', 'Sunday Maybe_Way', 'Tokyo Night Train - RMX_Cla', 'Kids - RMX_PRO', 'Chanunpa_Sou', 'Breathe_Vin', 'Ascension_Vin', 'Come Home - Mixed_Vin', '8 Bit Eclipse_Qui', 'Iridescent - Forerunners Dub Remix_Sou', "Cristiano - Peter Illias 'Back to Love' Remix_Tim", 'Told You_Cla', 'Campfire 2017 - Sons of Maria Retouch_Din', 'Mimi_Esk', 'Kiwi_Har'])

For each track we have two DataFrames: 
- Tempo : consisting of a single value
- Segments : consisting of 100 rows for 100 segments chosen based on certain criteria.

Displayed below is the segments DataFrame of one track.

In [9]:
# Element [1] of a track's entry is its segments DataFrame.
folder_analysis_dict['Progressive 1']['London_Kat'][1]

Unnamed: 0,start,start_minute,duration,confidence,pitches,timbre
3,0.54567,00:00:55,0.37737,1.000,"[0.099, 0.135, 0.145, 0.212, 0.289, 0.555, 0.8...","[44.69, 72.975, -59.469, 39.529, 15.246, -114...."
5,1.02172,00:01:02,0.51043,0.870,"[0.11, 0.144, 0.139, 0.197, 0.268, 0.539, 0.83...","[43.2, 74.587, -21.919, 47.153, 28.937, -46.45..."
6,1.53215,00:01:53,0.35451,0.986,"[0.743, 1.0, 0.319, 0.137, 0.083, 0.043, 0.033...","[42.67, 29.638, -49.407, 39.926, -16.55, -139...."
10,2.51320,00:02:51,0.36576,1.000,"[0.711, 1.0, 0.322, 0.136, 0.084, 0.048, 0.031...","[41.307, 13.913, -57.769, 67.452, -24.724, -13..."
26,4.97488,00:04:97,0.34834,1.000,"[0.717, 1.0, 0.322, 0.134, 0.077, 0.035, 0.027...","[39.028, -28.523, -92.945, 109.523, -32.092, -..."
...,...,...,...,...,...,...
1619,285.53516,04:45:54,0.25610,0.968,"[1.0, 0.187, 0.062, 0.066, 0.241, 0.781, 0.125...","[39.551, 179.46, -10.525, -0.761, 101.805, -17..."
1648,290.70804,04:50:71,0.25034,1.000,"[0.694, 1.0, 0.482, 0.195, 0.155, 0.083, 0.046...","[39.112, -35.085, -152.116, 138.637, -28.081, ..."
1652,291.44482,04:51:44,0.25002,0.991,"[1.0, 0.291, 0.119, 0.119, 0.219, 0.61, 0.161,...","[32.912, 313.575, -71.716, 17.621, 50.437, -41..."
1680,297.84244,04:57:84,0.34844,0.707,"[0.373, 0.411, 0.426, 0.518, 0.469, 0.588, 0.5...","[31.054, 331.479, 38.327, 84.965, 88.189, -35...."


In [10]:
# Confirm that no track has fewer than the expected number of segments.
EXPECTED_SEGMENTS = 100  # each segments DataFrame should hold 100 rows

# One (playlist name, track key, segment count) tuple per offending track.
# Element [1] of a track's entry is its segments DataFrame.
mis_segs = [
    (playlist_name, track_key, len(track_data[1]))
    for playlist_name, tracks in folder_analysis_dict.items()
    for track_key, track_data in tracks.items()
    if len(track_data[1]) < EXPECTED_SEGMENTS
]
# Should be empty
mis_segs

[]

In [11]:
# Define path to the dataset directory.
# PATH comes from os.getenv and is None when 'PATH_DATASET1.2' is missing from
# the environment/.env file. Path(None) would raise an opaque TypeError, so
# raise the same exception type with a message that says what to fix.
if PATH is None:
    raise TypeError(
        "PATH is None: environment variable 'PATH_DATASET1.2' is not set. "
        "Add it to your .env file (or the environment) and re-run the configuration cell."
    )
path = Path(PATH)

In [12]:
# Create the dataset from the collected analysis and save it as parquet files;
# `path` is the dataset directory defined in the previous cell.
sc.create_dataset(folder_analysis_dict, path)