### Collect data from the Spotify API using the spotipy library

In [1]:
# Load IPython's autoreload extension; the reload mode itself is selected by the
# `%autoreload 2` line at the end of the import cell.
%load_ext autoreload

In [2]:
from cap_package import SpotipyCollect as sc
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pandas import json_normalize
from pathlib import Path
import seaborn as sns
# Mode 2: reload imported modules before each cell runs, so edits to the local
# cap_package are picked up without restarting the kernel.
%autoreload 2

Downloading emoji data ...
... OK (Got response in 0.39 seconds)
Writing emoji data to C:\Users\Administrator\.demoji\codes.json ...
... OK


In [3]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [4]:
# Read credentials and the dataset location from the environment. load_dotenv()
# first merges variables from a local .env file (if any) into os.environ.
load_dotenv()
CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')
REDIRECT_URI = os.getenv('REDIRECT_URI')
USERNAME = os.getenv('SPOTIFY_USERNAME')
SCOPE = 'playlist-read-private'  # scope string handed to the authorisation call
PATH = os.getenv('PATH_DATASET1.2')

# os.getenv returns None for an unset variable; Path(None) in the next cell would
# then fail with an uninformative TypeError. Fail here with an actionable message.
if PATH is None:
    raise RuntimeError(
        "Environment variable 'PATH_DATASET1.2' is not set; "
        "define it in the environment or in the .env file."
    )

Hierarchy:
- spotipy_userauth
- create_dataset > arg(get_folder_analysis)
                 > get_playlist_analysis 
                 > get_segments > arg(extract_tracks_analysis) > arg(extract_tracks)

  - get_segments > arg(track_analysis), track_analysis_to_df, convert_time

     - track_analysis_to_df > arg(track_analysis) or > extract_track_analysis(arg(track_id))

  - extract_track_analysis > arg(tracksid), spotipy.audio_analysis
  - extract_tracks > arg(playlist_id), spotipy.playlist_tracks
   
- Using USER's playlist: get_pl_details >  playlist_id_url > arg(extract_playlists,)

   
Redundant - tracks_analysis, track_genre

In [5]:
# Obtain the Spotify handle used as the first argument of the sc.* calls below.
sp = sc.spotipy_userauth(
    USERNAME,
    SCOPE,
    CLIENT_ID,
    CLIENT_SECRET,
    REDIRECT_URI,
)
# Dataset root directory, wrapped as a pathlib.Path for joinpath() later on.
path = Path(PATH)

In [6]:
# Fetch the user's playlist details as four sequences:
# names, ids, URLs and total track counts.
pl_details = sc.get_pl_details(sp, USERNAME)
pl_Name_, pl_ID_, pl_URL_, pltot_Tracks_ = pl_details
print(f'Sample playlist names: {pl_Name_[:3]}')

Sample playlist names: ["Today's Top Hits", 'Deep house', 'Progressive House']


In [7]:
# Filter and sort the playlists, then keep everything except the first two and
# last two entries of the sorted result.
ranked_playlists = sc.filtersort_playlists(
    pl_Name_,
    pl_ID_,
    pl_URL_,
    pltot_Tracks_,
    start=1,
    pl_range=18,
)
filsort_pl = ranked_playlists[2:-2]
print("Let's see a sample:\n")
filsort_pl

Let's see a sample:



[(50,
  'That familiar trance',
  '3PH2J5HkKhhMoxWj3W0jk8',
  'https://api.spotify.com/v1/playlists/3PH2J5HkKhhMoxWj3W0jk8/tracks'),
 (49,
  'House-Trance',
  '2RnR5cw9kJUc9onu4WSRrW',
  'https://api.spotify.com/v1/playlists/2RnR5cw9kJUc9onu4WSRrW/tracks'),
 (36,
  'Our old school trance',
  '7HUclibRF0h57pVvXr3g9v',
  'https://api.spotify.com/v1/playlists/7HUclibRF0h57pVvXr3g9v/tracks'),
 (35,
  'Classic progressive',
  '7k5wrlGBdrXCtAQ1vNGC7J',
  'https://api.spotify.com/v1/playlists/7k5wrlGBdrXCtAQ1vNGC7J/tracks'),
 (32,
  'Progressive 4',
  '53kl8WegufR0IqMgyklXEL',
  'https://api.spotify.com/v1/playlists/53kl8WegufR0IqMgyklXEL/tracks'),
 (27,
  'Deep house',
  '6aQz1t74RjRrUAK6NcPGcf',
  'https://api.spotify.com/v1/playlists/6aQz1t74RjRrUAK6NcPGcf/tracks'),
 (26,
  'Progressive 2',
  '2IlSKDObaH9jwLJQcqiv2Y',
  'https://api.spotify.com/v1/playlists/2IlSKDObaH9jwLJQcqiv2Y/tracks'),
 (23,
  'Our old school trance 2',
  '43sMbiw98RFTH2WjmbTidr',
  'https://api.spotify.com/v1/playlist

In [8]:
# Collect features for every selected playlist. Later cells treat the result as a
# mapping of playlist name -> DataFrame (indexed by name, iterated with .items()).
feat_dic = sc.get_folder_features(sp, filsort_pl)

In [27]:
# Inspect the raw 'key' column of one playlist before it is one-hot encoded.
feat_dic['Classic progressive']['key']

0     11
1     11
2      6
3      2
4      5
5      7
6      9
7      1
8      5
9      5
10     2
11     7
12     9
13     4
14     9
15     2
16     0
17     0
18     0
19     7
20     1
21     9
22     1
23     6
24     5
25     6
26     0
27     1
28     3
29    11
30    11
31     7
32    10
33     3
34    10
Name: key, dtype: int64

In [25]:
# Scale and encode the feature table of every playlist.
#   nulls            -> list of (playlist name, index of rows containing missing values)
#   folder_transform -> dict of playlist name -> transformed DataFrame
path_ = path.joinpath('user_pl_feat')
nulls = []
folder_transform = {}

# Continuous features rescaled with MinMaxScaler. The scaler is refitted for each
# playlist, so values are relative to that playlist's own min/max.
scaled_cols = ['danceability', 'energy', 'loudness', 'speechiness',
               'acousticness', 'instrumentalness', 'valence', 'tempo']
# 'key' is one-hot encoded over the fixed values 0-11 so that every playlist gets
# the same twelve key_* columns regardless of which keys it contains.
key_cols = ['key_{}'.format(i) for i in range(12)]
# Names that must not be repeated in the passthrough part of the header.
consumed_cols = set(scaled_cols) | set(key_cols) | {'key'}

# The transformer definition is identical for every playlist, so build it once;
# fit_transform() refits it from scratch on each call.
column_trans = make_column_transformer(
    (MinMaxScaler(), scaled_cols),
    # handle_unknown='ignore' encodes a key outside 0-11 as an all-zero row instead
    # of raising. Spotify documents key = -1 for "no key detected"
    # (assumes these are Spotify audio features - confirm).
    (OneHotEncoder(categories=[list(range(12))], handle_unknown='ignore'), ['key']),
    remainder='passthrough')

for pl, feat_df in feat_dic.items():

    # Record the playlist name and the row labels that contain missing values.
    null_rows = feat_df.isnull().any(axis=1)
    if null_rows.any():
        nulls.append((pl, feat_df[null_rows].index))
        print('Found missing values')

    # Output column order of the transformer: scaled columns, one-hot key columns,
    # then the untouched (passthrough) columns in their original order.
    df_col_name = [x for x in feat_df.columns if x not in consumed_cols]
    col_names = scaled_cols + key_cols + df_col_name

    # Passthrough columns make the stacked array object-typed; infer_objects()
    # restores proper numeric dtypes on the columns that hold only numbers.
    std_feat_df = pd.DataFrame(column_trans.fit_transform(feat_df),
                               columns=col_names).infer_objects()

    folder_transform[pl] = std_feat_df
    # save dataframe as parquet
    #std_feat_df.to_parquet(path_.joinpath('{}_features.parquet'.format(pl)), engine='pyarrow')


In [26]:
# Display the transformed frames, keyed by playlist name.
folder_transform

{'That familiar trance':    danceability      energy   loudness  speechiness acousticness  \
 0       0.47619    0.525988   0.146379    0.0769851     0.797351   
 1      0.624542     0.81289   0.228043    0.0374849     0.465007   
 2      0.369963    0.923077    0.41094    0.0729545  4.45828e-05   
 3      0.802198    0.775468   0.519569     0.415558     0.465007   
 4      0.412088    0.629938   0.374422    0.0552197     0.169139   
 5      0.377289    0.825364   0.478274    0.0137042    0.0349448   
 6      0.459707    0.923077   0.566564    0.0116888     0.266816   
 7       0.47619    0.864865   0.558552     0.154373    0.0101406   
 8      0.531136    0.370062   0.252851     0.044337      0.49743   
 9       0.71978    0.841996   0.616179    0.0644901    0.0278521   
 10     0.626374   0.0893971   0.311248    0.0209593   0.00228993   
 11     0.644689           0  0.0620955     0.118904     0.137526   
 12     0.650183    0.706861   0.619569    0.0580411     0.107939   
 13     0.

In [None]:
# Correlation coefficients between eight columns of std_feat_df, which at this
# point is the frame left over from the last iteration of the transform loop.
# NOTE(review): with the column order built in that loop (8 scaled features, then
# key_0..key_11), positions 12:20 are key_4..key_11 - confirm this is the intended
# selection rather than the eight scaled audio features at positions 0:8.
selected_block = std_feat_df.iloc[:, 12:20]
selected_block = selected_block.astype('float32')
cov_mat = np.corrcoef(selected_block, rowvar=False)

In [None]:
# Annotated heatmap of the matrix produced by np.corrcoef in the previous cell.
# np.corrcoef returns Pearson correlation coefficients, so the title says
# "correlation" rather than "covariance".
sns.set(font_scale=1.2)

# Tick labels come from the same positional slice that produced cov_mat, so the
# labels stay aligned with the matrix rows and columns.
cols = list(std_feat_df.iloc[:, 12:20].columns)

fig, ax = plt.subplots(figsize=(8, 8))
hm = sns.heatmap(cov_mat,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={'size': 12},
                 cmap='coolwarm',
                 yticklabels=cols,
                 xticklabels=cols,
                 ax=ax)
ax.set_title('Correlation matrix (Pearson correlation coefficients)', size=15)
fig.tight_layout()
plt.show()