///-------------------------------------------------------------------------------------------------<br>
// File: Dataset Preparation.ipynb<br>
//<br>
// Author: Dakshvir Singh Rehill<br>
// Date: 14/10/2020<br>
//<br>
// Summary:	This notebook is used to generate the dataset from DJMag Top 100 DJs and Spotify API<br>
///-------------------------------------------------------------------------------------------------
***

## Get Top 100 DJs from DJMag
***
1. Import requests package
2. Use requests to fetch Top 100 page from DJMag
3. Import beautifulsoup package
4. Use beautifulsoup to get DJ name
***

In [1]:
import requests
from bs4 import BeautifulSoup
mScrapeURL = 'https://djmag.com/top100dj?year=2019'
mPageHTML = requests.get(mScrapeURL)
mParsedObject = BeautifulSoup(mPageHTML.content, 'html.parser')
mDJNameElemList = mParsedObject.find_all('div',class_='top100dj-name')
mDJNamesList = []
for aDJNameElem in mDJNameElemList:
    mDJNamesList.append(aDJNameElem.find('a').text)

## Get DJ Popularity, ID from Spotify API
***
1. Import spotipy package
2. Use spotipy to set up App Credentials
3. Search for each artist
5. Import pandas package
4. Create DataFrame with Artist Details
***

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as mNP
import pandas as mPandasObj
from IPython.display import display
mSpotifyClient = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="7f839484fcf4450db86dcc9126cec5e1",\
                                client_secret="80e9b5a936a44b2880354a719ef54019", requests_timeout = None))

In [3]:
mArtistList = []
for aArtistName in mDJNamesList:
    aSelectedArtist = {'id' : '','name' : '','popularity' : 0}
    aSearchResult = mSpotifyClient.search(q=aArtistName,type='artist',market='CA')
    for aArtist in aSearchResult['artists']['items']:
        if aSelectedArtist['popularity'] <= aArtist['popularity']:
            aSelectedArtist['id'] = aArtist['id']
            aSelectedArtist['name'] = aArtist['name']
            aSelectedArtist['popularity'] = aArtist['popularity']
    mArtistList.append(aSelectedArtist)
mArtistDataset = mPandasObj.DataFrame(mArtistList)
mArtistDataset.shape

(100, 3)

## Get Top 150 songs of each Artist
***
1. Search for type track using artist name
2. Get ID, Name, Popularity of Track and concatenate into Artist DataFrame
3. Store each dataframe in list
4. Concatenate all dataframes into one dataset
***

In [4]:
mAllArtistDFList = []
for aArtist in mArtistList:
    aArtistDFList = []
    for aCurOffset in range(0,150,50):
        aTopSongSearchResult = mSpotifyClient.search(q=aArtist['name'],type='track',market='CA',offset=aCurOffset,limit=50)
        aTopSongDF =mPandasObj.DataFrame(aTopSongSearchResult['tracks']['items'],columns=['id','name','popularity'])
        aArtistDFList.append(aTopSongDF)
    aArtistTopSongDF = mPandasObj.concat(aArtistDFList)
    aArtistTopSongDF['artist_id'] = aArtist['id']
    mAllArtistDFList.append(aArtistTopSongDF)
mTopDJSongsDF = mPandasObj.concat(mAllArtistDFList)
mTopDJSongsDF.shape

(14617, 4)

## Get Audio Features of All songs in TopSongs DF
***
1. Use Audio Features function to get all audio features
2. Get 'id', 'key', 'mode', 'time_signature', 'acousticness', 'danceability', 'energy', 'instrumentalness' ,'liveness' ,'loudness' ,'speechiness' ,'valence' ,'tempo' ,'duration_ms' and concatenate into one DF
3. Store each dataframe in list
4. Concatenate all dataframes into one dataset
***

In [6]:
mAllArtistFeatureDFList = []
for aArtistDF in mAllArtistDFList:
    aIDList = aArtistDF[:100]['id'].tolist()
    aAudioFeatures = mSpotifyClient.audio_features(tracks = aIDList)
    aFeatureDictList = []
    for aAF in aAudioFeatures:
        if aAF is not None:
            aReqValF = {'id':aAF['id'],'key':aAF['key'],'mode':aAF['mode'],'time_signature':aAF['time_signature'],\
                        'acousticness':aAF['acousticness'], 'danceability':aAF['danceability'],'energy':aAF['energy'],\
                       'instrumentalness':aAF['instrumentalness'],'liveness':aAF['liveness'],'loudness':aAF['loudness'],\
                       'speechiness':aAF['speechiness'],'valence':aAF['valence'],'tempo':aAF['tempo'],'duration_ms':aAF['duration_ms']}
            aFeatureDictList.append(aReqValF)
    aIDList = aArtistDF[-50:]['id'].tolist()
    aAudioFeatures = mSpotifyClient.audio_features(tracks = aIDList)
    for aAF in aAudioFeatures:
        if aAF is not None:
            aReqValF = {'id':aAF['id'],'key':aAF['key'],'mode':aAF['mode'],'time_signature':aAF['time_signature'],\
                        'acousticness':aAF['acousticness'], 'danceability':aAF['danceability'],'energy':aAF['energy'],\
                       'instrumentalness':aAF['instrumentalness'],'liveness':aAF['liveness'],'loudness':aAF['loudness'],\
                       'speechiness':aAF['speechiness'],'valence':aAF['valence'],'tempo':aAF['tempo'],'duration_ms':aAF['duration_ms']}
            aFeatureDictList.append(aReqValF)
    aFeatureDF = mPandasObj.DataFrame(aFeatureDictList)
    mAllArtistFeatureDFList.append(aFeatureDF)
mSongFeaturesDF = mPandasObj.concat(mAllArtistFeatureDFList)
mSongFeaturesDF.shape

(14917, 14)

## Remove Songs without Audio Features
***
1. Songs that have missing audio features can't be used in Dataset so will be removed
***

In [7]:
mSongFeaturesDF.drop_duplicates(inplace=True)
mTopDJSongsDF = mTopDJSongsDF[mTopDJSongsDF['id'].isin(mSongFeaturesDF['id'])]
mTopDJSongsDF.drop_duplicates(inplace=True)

In [8]:
mSongFeaturesDF.shape

(13959, 14)

In [9]:
mTopDJSongsDF.shape

(14608, 4)

## Create Datasets for Audio Analysis of Songs
***
1. Use track ID from TopSongs to fetch Audio Analysis of Songs
2. Create various tables for all Audio Analysis values
3. Store and save as Datasets
***

In [10]:
def AddSongsToAnalysisDF(pSongIDs,pSegmentID):
    mMissingAnalysisIDs = []
    mSongTimeFrameDF = []
    mSongSectionsDF = []
    mSongSegmentsDF = []
    mSongPitchesDF = []
    mSongTimbreDF = []
    for aSongID in pSongIDs:
        aSongAnalysisObj = None
        try:
            aSongAnalysisObj = mSpotifyClient.audio_analysis(aSongID)
        except:
            aSongAnalysisObj = None
            print("Can't find Song Analysis for Song", aSongID)
        if aSongAnalysisObj is None:
            mMissingAnalysisIDs.append(aSongID)
            continue
        aTempList = []
        for aBars in aSongAnalysisObj['bars']:
            aReqBarVal = {'song_id':aSongID,'start':aBars['start'],'duration':aBars['duration'],'confidence':aBars['confidence'],\
                         'type':'Bar'}
            aTempList.append(aReqBarVal)
        for aBeats in aSongAnalysisObj['beats']:
            aReqBeatVal = {'song_id':aSongID,'start':aBeats['start'],'duration':aBeats['duration'],'confidence':aBeats['confidence'],\
                         'type':'Beat'}
            aTempList.append(aReqBeatVal)
        for aTatums in aSongAnalysisObj['tatums']:
            aReqTatumVal = {'song_id':aSongID,'start':aTatums['start'],'duration':aTatums['duration'],'confidence':aTatums['confidence'],\
                         'type':'Tatum'}
            aTempList.append(aReqTatumVal)
        mSongTimeFrameDF.append(mPandasObj.DataFrame(aTempList))
        aTempList = []
        for aSections in aSongAnalysisObj['sections']:
            aReqSectVal = {'song_id':aSongID, 'start':aSections['start'], 'duration':aSections['duration'],\
                           'confidence':aSections['confidence'], 'loudness':aSections['loudness'], 'tempo':aSections['tempo'],\
                           'tempo_confidence':aSections['tempo_confidence'], 'key':aSections['key'],\
                           'key_confidence':aSections['key_confidence'], 'mode':aSections['mode'],\
                           'mode_confidence':aSections['mode_confidence'], 'time_signature':aSections['time_signature'],\
                           'time_signature_confidence':aSections['time_signature_confidence']}
            aTempList.append(aReqSectVal)
        mSongSectionsDF.append(mPandasObj.DataFrame(aTempList))
        aTempList = []
        aTimbreList = []
        aPitchList = []
        for aSegments in aSongAnalysisObj['segments']:
            aReqSegVal = {'song_id':aSongID,'id':pSegmentID, 'start':aSegments['start'], 'duration':aSegments['duration'],\
                           'confidence':aSegments['confidence'], 'loudness_start':aSegments['loudness_start'],\
                         'loudness_max':aSegments['loudness_max'], 'loudness_max_time':aSegments['loudness_max_time'],\
                         'loudness_end':aSegments['loudness_end']}
            aTempList.append(aReqSegVal)
            aReqPitchVal = {'segment_id':pSegmentID, 'I':aSegments['pitches'][0] ,'II':aSegments['pitches'][1] ,\
                            'III':aSegments['pitches'][2] ,'IV':aSegments['pitches'][3] ,'V':aSegments['pitches'][4] ,\
                            'VI':aSegments['pitches'][5] ,'VII':aSegments['pitches'][6] ,'VIII':aSegments['pitches'][7] ,\
                            'IX':aSegments['pitches'][8] ,'X':aSegments['pitches'][9] ,'XI':aSegments['pitches'][10] ,\
                            'XII':aSegments['pitches'][11] }
            aPitchList.append(aReqPitchVal)
            aReqTimbreVal = {'segment_id':pSegmentID, 'I':aSegments['timbre'][0] ,'II':aSegments['timbre'][1] ,\
                            'III':aSegments['timbre'][2] ,'IV':aSegments['timbre'][3] ,'V':aSegments['timbre'][4] ,\
                            'VI':aSegments['timbre'][5] ,'VII':aSegments['timbre'][6] ,'VIII':aSegments['timbre'][7] ,\
                            'IX':aSegments['timbre'][8] ,'X':aSegments['timbre'][9] ,'XI':aSegments['timbre'][10] ,\
                            'XII':aSegments['timbre'][11] }
            aTimbreList.append(aReqTimbreVal)
            pSegmentID = pSegmentID + 1
        mSongSegmentsDF.append(mPandasObj.DataFrame(aTempList))
        mSongPitchesDF.append(mPandasObj.DataFrame(aPitchList))
        mSongTimbreDF.append(mPandasObj.DataFrame(aTimbreList))
    return mMissingAnalysisIDs, mSongTimeFrameDF, mSongSectionsDF, mSongSegmentsDF, mSongPitchesDF, mSongTimbreDF, pSegmentID

In [13]:
mMissingAnalysisIDs = []
mSongTimeFrameDF = []
mSongSectionsDF = []
mSongSegmentsDF = []
mSongPitchesDF = []
mSongTimbreDF = []

aSegmentID = 1

mSongIDs = mTopDJSongsDF['id']
mSongIDSplits = mNP.array_split(mSongIDs, 200)

aTotalSongs = 0

for aSongIDList in mSongIDSplits:
    aMissingIDs, aTimeFrameDF, aSectionsDF, aSegmentDF, aPitchDF, aTimbreDF, aSegmentID =\
    AddSongsToAnalysisDF(aSongIDList, aSegmentID)
    mMissingAnalysisIDs.extend(aMissingIDs)
    mSongTimeFrameDF.extend(aTimeFrameDF)
    mSongSectionsDF.extend(aSectionsDF)
    mSongSegmentsDF.extend(aSegmentDF)
    mSongPitchesDF.extend(aPitchDF)
    mSongTimbreDF.extend(aTimbreDF)
    aTotalSongs = aTotalSongs + len(aSongIDList)
    print("Songs Scraped = ",aTotalSongs, "Missing Values = ", len(mMissingAnalysisIDs))

#Retry once more for all missing values before discarding them completely
aMissingIDs, aTimeFrameDF, aSectionsDF, aSegmentDF, aPitchDF, aTimbreDF, aSegmentID =\
AddSongsToAnalysisDF(mMissingAnalysisIDs, aSegmentID)
mMissingAnalysisIDs = aMissingIDs
if len(aTimeFrameDF) > 0:
    mSongTimeFrameDF.extend(aTimeFrameDF)
if len(aSectionsDF) > 0:
    mSongSectionsDF.extend(aSectionsDF)
if len(aSegmentDF) > 0:
    mSongSegmentsDF.extend(aSegmentDF)
if len(aPitchDF) > 0:
    mSongPitchesDF.extend(aPitchDF)
if len(aTimbreDF) > 0:
    mSongTimbreDF.extend(aTimbreDF)

mSongTimeFrameDF = mPandasObj.concat(mSongTimeFrameDF)
mSongSectionsDF = mPandasObj.concat(mSongSectionsDF)
mSongSegmentsDF = mPandasObj.concat(mSongSegmentsDF)
mSongPitchesDF = mPandasObj.concat(mSongPitchesDF)
mSongTimbreDF = mPandasObj.concat(mSongTimbreDF)

Songs Scraped =  74 Missing Values =  0
Songs Scraped =  148 Missing Values =  0
Songs Scraped =  222 Missing Values =  0
Songs Scraped =  296 Missing Values =  0
Songs Scraped =  370 Missing Values =  0
Songs Scraped =  444 Missing Values =  0
Songs Scraped =  518 Missing Values =  0
Songs Scraped =  592 Missing Values =  0
Songs Scraped =  665 Missing Values =  0
Songs Scraped =  738 Missing Values =  0
Songs Scraped =  811 Missing Values =  0
Songs Scraped =  884 Missing Values =  0
Songs Scraped =  957 Missing Values =  0
Songs Scraped =  1030 Missing Values =  0
Can't find Song Analysis for Song 0NIC4unbe5KZOp1d9T7OaF
Songs Scraped =  1103 Missing Values =  1
Can't find Song Analysis for Song 6Uq65N2DJzKwBj8QBcJ5Xu
Songs Scraped =  1176 Missing Values =  2
Songs Scraped =  1249 Missing Values =  2
Can't find Song Analysis for Song 318BLBNwOt9H0RrQv0Eiwj
Songs Scraped =  1322 Missing Values =  3
Songs Scraped =  1395 Missing Values =  3
Can't find Song Analysis for Song 1BeTdfec8Ko

Max Retries reached


Can't find Song Analysis for Song 2R5eUSyhOKNNPw7zzLZwEL
Songs Scraped =  4096 Missing Values =  9
Songs Scraped =  4169 Missing Values =  9
Songs Scraped =  4242 Missing Values =  9
Can't find Song Analysis for Song 7IHOijI3bz8jHRvhXzPks9
Songs Scraped =  4315 Missing Values =  10
Songs Scraped =  4388 Missing Values =  10
Songs Scraped =  4461 Missing Values =  10
Songs Scraped =  4534 Missing Values =  10
Songs Scraped =  4607 Missing Values =  10
Songs Scraped =  4680 Missing Values =  10
Can't find Song Analysis for Song 1V6WjZGuJGH0Wg9HbaqVQ3
Songs Scraped =  4753 Missing Values =  11
Can't find Song Analysis for Song 5Pth046ihwuyXcetLXtQdn
Songs Scraped =  4826 Missing Values =  12
Songs Scraped =  4899 Missing Values =  12
Songs Scraped =  4972 Missing Values =  12
Songs Scraped =  5045 Missing Values =  12


Max Retries reached


Can't find Song Analysis for Song 1j5BvC67aRkHaqX93G0Et4
Songs Scraped =  5118 Missing Values =  13


Max Retries reached


Can't find Song Analysis for Song 5CRUYbExDj1jaqhhY0oxrQ
Songs Scraped =  5191 Missing Values =  14


Max Retries reached


Can't find Song Analysis for Song 4joSJSEcddADmMgXSl9cvj
Songs Scraped =  5264 Missing Values =  15
Songs Scraped =  5337 Missing Values =  15
Songs Scraped =  5410 Missing Values =  15
Can't find Song Analysis for Song 1G0wfuX5JUYIh7UfJj1vy9
Songs Scraped =  5483 Missing Values =  16
Can't find Song Analysis for Song 1BfLBkvFu8LELJswBOJuOJ
Can't find Song Analysis for Song 643zcsE7vA53LOJ9oiX8ry
Songs Scraped =  5556 Missing Values =  18
Can't find Song Analysis for Song 5S72wG4ALVc120dvJAXLN2
Songs Scraped =  5629 Missing Values =  19
Songs Scraped =  5702 Missing Values =  19
Can't find Song Analysis for Song 0CefCZvrNuTyPn9SA695Na
Songs Scraped =  5775 Missing Values =  20
Songs Scraped =  5848 Missing Values =  20
Songs Scraped =  5921 Missing Values =  20
Songs Scraped =  5994 Missing Values =  20
Can't find Song Analysis for Song 1iJHitxTcTARB8P387PcZL
Songs Scraped =  6067 Missing Values =  21
Songs Scraped =  6140 Missing Values =  21
Can't find Song Analysis for Song 6ldG9ZHz

Max Retries reached


Can't find Song Analysis for Song 5Eovehbodc2RnxbX5jNziD
Songs Scraped =  11761 Missing Values =  42
Songs Scraped =  11834 Missing Values =  42
Songs Scraped =  11907 Missing Values =  42
Songs Scraped =  11980 Missing Values =  42
Songs Scraped =  12053 Missing Values =  42
Songs Scraped =  12126 Missing Values =  42
Can't find Song Analysis for Song 63jDDzy3lKkkN2DneZne2v
Can't find Song Analysis for Song 2aVXRvnIvlrnBYM2p0FA21
Can't find Song Analysis for Song 4Br5TcVQMAl0y3jIGaMzop
Songs Scraped =  12199 Missing Values =  45
Songs Scraped =  12272 Missing Values =  45
Songs Scraped =  12345 Missing Values =  45
Songs Scraped =  12418 Missing Values =  45


Max Retries reached


Can't find Song Analysis for Song 7GmduX2jLvglcqwy3AoKd2
Songs Scraped =  12491 Missing Values =  46


Max Retries reached


Can't find Song Analysis for Song 0x54YhUlMCBANwNVpZwVLT
Can't find Song Analysis for Song 5lyAkZPAC2ydlJDKSKYODo
Songs Scraped =  12564 Missing Values =  48
Songs Scraped =  12637 Missing Values =  48
Songs Scraped =  12710 Missing Values =  48


HTTP Error for GET to https://api.spotify.com/v1/audio-analysis/5OgeotnaQT50D8A6DzeASa returned 404 due to analysis not found


Can't find Song Analysis for Song 5OgeotnaQT50D8A6DzeASa
Can't find Song Analysis for Song 7zCFk8BECFUdp8VU39Daaa
Songs Scraped =  12783 Missing Values =  50
Songs Scraped =  12856 Missing Values =  50
Songs Scraped =  12929 Missing Values =  50
Songs Scraped =  13002 Missing Values =  50
Songs Scraped =  13075 Missing Values =  50
Songs Scraped =  13148 Missing Values =  50
Songs Scraped =  13221 Missing Values =  50
Songs Scraped =  13294 Missing Values =  50
Songs Scraped =  13367 Missing Values =  50


Max Retries reached


Can't find Song Analysis for Song 6jOKMYPz4v8BQpViYAMuhM
Songs Scraped =  13440 Missing Values =  51
Songs Scraped =  13513 Missing Values =  51
Songs Scraped =  13586 Missing Values =  51
Songs Scraped =  13659 Missing Values =  51
Can't find Song Analysis for Song 6b5q26NZnb52jVJlOUlWpx
Songs Scraped =  13732 Missing Values =  52
Songs Scraped =  13805 Missing Values =  52
Songs Scraped =  13878 Missing Values =  52
Songs Scraped =  13951 Missing Values =  52
Songs Scraped =  14024 Missing Values =  52
Songs Scraped =  14097 Missing Values =  52
Songs Scraped =  14170 Missing Values =  52
Songs Scraped =  14243 Missing Values =  52
Songs Scraped =  14316 Missing Values =  52
Songs Scraped =  14389 Missing Values =  52
Songs Scraped =  14462 Missing Values =  52
Songs Scraped =  14535 Missing Values =  52


Max Retries reached


Can't find Song Analysis for Song 2gAna3NvvtxLOw4d06YbaU
Songs Scraped =  14608 Missing Values =  53


Max Retries reached


Can't find Song Analysis for Song 2R5eUSyhOKNNPw7zzLZwEL


Max Retries reached


Can't find Song Analysis for Song 1j5BvC67aRkHaqX93G0Et4


Max Retries reached


Can't find Song Analysis for Song 5CRUYbExDj1jaqhhY0oxrQ


Max Retries reached


Can't find Song Analysis for Song 4joSJSEcddADmMgXSl9cvj


Max Retries reached


Can't find Song Analysis for Song 5Eovehbodc2RnxbX5jNziD


Max Retries reached


Can't find Song Analysis for Song 7GmduX2jLvglcqwy3AoKd2


Max Retries reached


Can't find Song Analysis for Song 0x54YhUlMCBANwNVpZwVLT


HTTP Error for GET to https://api.spotify.com/v1/audio-analysis/5OgeotnaQT50D8A6DzeASa returned 404 due to analysis not found


Can't find Song Analysis for Song 5OgeotnaQT50D8A6DzeASa


Max Retries reached


Can't find Song Analysis for Song 6jOKMYPz4v8BQpViYAMuhM


Max Retries reached


Can't find Song Analysis for Song 2gAna3NvvtxLOw4d06YbaU


## Remove Songs without Audio Analysis
***
1. Songs that have missing audio analysis can't be used in Dataset so will be removed
***

In [14]:
mTopDJSongsDF = mTopDJSongsDF[~mTopDJSongsDF['id'].isin(mMissingAnalysisIDs)]
mSongFeaturesDF = mSongFeaturesDF[~mSongFeaturesDF['id'].isin(mMissingAnalysisIDs)]

In [15]:
mTopDJSongsDF.drop_duplicates(inplace=True)
mTopDJSongsDF.shape

(14598, 4)

In [16]:
mSongFeaturesDF.drop_duplicates(inplace=True)
mSongFeaturesDF.shape

(13949, 14)

In [17]:
mArtistDataset.drop_duplicates(inplace=True)
mArtistDataset.shape

(100, 3)

In [18]:
mSongTimeFrameDF.drop_duplicates(inplace=True)
mSongTimeFrameDF.shape

(23857633, 5)

In [19]:
mSongSectionsDF.drop_duplicates(inplace=True)
mSongSectionsDF.shape

(147062, 13)

In [20]:
mSongSegmentsDF.drop_duplicates(inplace=True)
mSongSegmentsDF.shape

(15761732, 9)

In [21]:
mSongPitchesDF.drop_duplicates(inplace=True)
mSongPitchesDF.shape

(15761732, 13)

In [22]:
mSongTimbreDF.drop_duplicates(inplace=True)
mSongTimbreDF.shape

(15761732, 13)

## Store all DataFrames as CSV Files
***
1. Store all eight dataframes as CSV Files by using to_csv
2. These datasets will then be used in another file for EDA
3. After EDA and Model selection, the datasets will be broken down in Train, Test, Validate for Model
***

In [23]:
mArtistDataset.to_csv('Top100Artists.csv', index=False)
mTopDJSongsDF.to_csv('TopSongsByArtists.csv', index=False)
mSongFeaturesDF.to_csv('TopSongAudioFeatures.csv', index=False)
mSongTimeFrameDF.to_csv('SongTimeFrameData.csv', index=False)
mSongSectionsDF.to_csv('SongSectionData.csv', index=False)
mSongSegmentsDF.to_csv('SongSegmentData.csv', index=False)
mSongPitchesDF.to_csv('SegmentPitchesData.csv', index=False)
mSongTimbreDF.to_csv('SegmentTimbreData.csv', index=False)