///-------------------------------------------------------------------------------------------------<br>
// File: Dataset Preparation.ipynb<br>
//<br>
// Author: Dakshvir Singh Rehill<br>
// Date: 14/10/2020<br>
//<br>
// Summary:	This notebook is used to generate the dataset from DJMag Top 100 DJs and Spotify API<br>
///-------------------------------------------------------------------------------------------------
***

## Get Top 100 DJs from DJMag
***
1. Import requests package
2. Use requests to fetch Top 100 page from DJMag
3. Import beautifulsoup package
4. Use beautifulsoup to get DJ name
***

In [1]:
import requests
from bs4 import BeautifulSoup
mScrapeURL = 'https://djmag.com/top100dj?year=2019'
mPageHTML = requests.get(mScrapeURL)
mParsedObject = BeautifulSoup(mPageHTML.content, 'html.parser')
mDJNameElemList = mParsedObject.find_all('div',class_='top100dj-name')
mDJNamesList = []
for aDJNameElem in mDJNameElemList:
    mDJNamesList.append(aDJNameElem.find('a').text)

## Get DJ Popularity, ID from Spotify API
***
1. Import spotipy package
2. Use spotipy to set up App Credentials
3. Search for each artist
5. Import pandas package
4. Create DataFrame with Artist Details
***

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as mNP
import pandas as mPandasObj
from IPython.display import display
mSpotifyClient = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="7f839484fcf4450db86dcc9126cec5e1",\
                                client_secret="80e9b5a936a44b2880354a719ef54019", requests_timeout = None))

In [3]:
mArtistList = []
for aArtistName in mDJNamesList:
    aSelectedArtist = {'id' : '','name' : '','popularity' : 0}
    aSearchResult = mSpotifyClient.search(q=aArtistName,type='artist',market='CA')
    for aArtist in aSearchResult['artists']['items']:
        if aSelectedArtist['popularity'] <= aArtist['popularity']:
            aSelectedArtist['id'] = aArtist['id']
            aSelectedArtist['name'] = aArtist['name']
            aSelectedArtist['popularity'] = aArtist['popularity']
    mArtistList.append(aSelectedArtist)
mArtistDataset = mPandasObj.DataFrame(mArtistList)
mArtistDataset.shape

(100, 3)

## Get Top 150 songs of each Artist
***
1. Search for type track using artist name
2. Get ID, Name, Popularity of Track and concatenate into Artist DataFrame
3. Store each dataframe in list
4. Concatenate all dataframes into one dataset
***

In [4]:
mAllArtistDFList = []
for aArtist in mArtistList:
    aArtistDFList = []
    for aCurOffset in range(0,150,50):
        aTopSongSearchResult = mSpotifyClient.search(q=aArtist['name'],type='track',market='CA',offset=aCurOffset,limit=50)
        aTopSongDF =mPandasObj.DataFrame(aTopSongSearchResult['tracks']['items'],columns=['id','name','popularity'])
        aArtistDFList.append(aTopSongDF)
    aArtistTopSongDF = mPandasObj.concat(aArtistDFList)
    aArtistTopSongDF['artist_id'] = aArtist['id']
    mAllArtistDFList.append(aArtistTopSongDF)
mTopDJSongsDF = mPandasObj.concat(mAllArtistDFList)
mTopDJSongsDF.shape

(14617, 4)

## Get Audio Features of All songs in TopSongs DF
***
1. Use Audio Features function to get all audio features
2. Get 'id', 'key', 'mode', 'time_signature', 'acousticness', 'danceability', 'energy', 'instrumentalness' ,'liveness' ,'loudness' ,'speechiness' ,'valence' ,'tempo' ,'duration_ms' and concatenate into one DF
3. Store each dataframe in list
4. Concatenate all dataframes into one dataset
***

In [6]:
mAllArtistFeatureDFList = []
for aArtistDF in mAllArtistDFList:
    aIDList = aArtistDF[:100]['id'].tolist()
    aAudioFeatures = mSpotifyClient.audio_features(tracks = aIDList)
    aFeatureDictList = []
    for aAF in aAudioFeatures:
        if aAF is not None:
            aReqValF = {'id':aAF['id'],'key':aAF['key'],'mode':aAF['mode'],'time_signature':aAF['time_signature'],\
                        'acousticness':aAF['acousticness'], 'danceability':aAF['danceability'],'energy':aAF['energy'],\
                       'instrumentalness':aAF['instrumentalness'],'liveness':aAF['liveness'],'loudness':aAF['loudness'],\
                       'speechiness':aAF['speechiness'],'valence':aAF['valence'],'tempo':aAF['tempo'],'duration_ms':aAF['duration_ms']}
            aFeatureDictList.append(aReqValF)
    aIDList = aArtistDF[-50:]['id'].tolist()
    aAudioFeatures = mSpotifyClient.audio_features(tracks = aIDList)
    for aAF in aAudioFeatures:
        if aAF is not None:
            aReqValF = {'id':aAF['id'],'key':aAF['key'],'mode':aAF['mode'],'time_signature':aAF['time_signature'],\
                        'acousticness':aAF['acousticness'], 'danceability':aAF['danceability'],'energy':aAF['energy'],\
                       'instrumentalness':aAF['instrumentalness'],'liveness':aAF['liveness'],'loudness':aAF['loudness'],\
                       'speechiness':aAF['speechiness'],'valence':aAF['valence'],'tempo':aAF['tempo'],'duration_ms':aAF['duration_ms']}
            aFeatureDictList.append(aReqValF)
    aFeatureDF = mPandasObj.DataFrame(aFeatureDictList)
    mAllArtistFeatureDFList.append(aFeatureDF)
mSongFeaturesDF = mPandasObj.concat(mAllArtistFeatureDFList)
mSongFeaturesDF.shape

(14917, 14)

## Remove Songs without Audio Features
***
1. Songs that have missing audio features can't be used in Dataset so will be removed
***

In [7]:
mSongFeaturesDF.drop_duplicates(inplace=True)
mTopDJSongsDF = mTopDJSongsDF[mTopDJSongsDF['id'].isin(mSongFeaturesDF['id'])]
mTopDJSongsDF.drop_duplicates(inplace=True)

In [8]:
mSongFeaturesDF.shape

(13959, 14)

In [9]:
mTopDJSongsDF.shape

(14608, 4)

## Create Datasets for Audio Analysis of Songs
***
1. Use track ID from TopSongs to fetch Audio Analysis of Songs
2. Create various tables for all Audio Analysis values
3. Store and save as Datasets
***

In [10]:
def AddSongsToAnalysisDF(pSongIDs,pSegmentID):
    mMissingAnalysisIDs = []
    mSongTimeFrameDF = []
    mSongSectionsDF = []
    mSongSegmentsDF = []
    mSongPitchesDF = []
    mSongTimbreDF = []
    for aSongID in pSongIDs:
        aSongAnalysisObj = None
        try:
            aSongAnalysisObj = mSpotifyClient.audio_analysis(aSongID)
        except:
            aSongAnalysisObj = None
            print("Can't find Song Analysis for Song", aSongID)
        if aSongAnalysisObj is None:
            mMissingAnalysisIDs.append(aSongID)
            continue
        aTempList = []
        for aBars in aSongAnalysisObj['bars']:
            aReqBarVal = {'song_id':aSongID,'start':aBars['start'],'duration':aBars['duration'],'confidence':aBars['confidence'],\
                         'type':'Bar'}
            aTempList.append(aReqBarVal)
        for aBeats in aSongAnalysisObj['beats']:
            aReqBeatVal = {'song_id':aSongID,'start':aBeats['start'],'duration':aBeats['duration'],'confidence':aBeats['confidence'],\
                         'type':'Beat'}
            aTempList.append(aReqBeatVal)
        for aTatums in aSongAnalysisObj['tatums']:
            aReqTatumVal = {'song_id':aSongID,'start':aTatums['start'],'duration':aTatums['duration'],'confidence':aTatums['confidence'],\
                         'type':'Tatum'}
            aTempList.append(aReqTatumVal)
        mSongTimeFrameDF.append(mPandasObj.DataFrame(aTempList))
        aTempList = []
        for aSections in aSongAnalysisObj['sections']:
            aReqSectVal = {'song_id':aSongID, 'start':aSections['start'], 'duration':aSections['duration'],\
                           'confidence':aSections['confidence'], 'loudness':aSections['loudness'], 'tempo':aSections['tempo'],\
                           'tempo_confidence':aSections['tempo_confidence'], 'key':aSections['key'],\
                           'key_confidence':aSections['key_confidence'], 'mode':aSections['mode'],\
                           'mode_confidence':aSections['mode_confidence'], 'time_signature':aSections['time_signature'],\
                           'time_signature_confidence':aSections['time_signature_confidence']}
            aTempList.append(aReqSectVal)
        mSongSectionsDF.append(mPandasObj.DataFrame(aTempList))
        aTempList = []
        aTimbreList = []
        aPitchList = []
        for aSegments in aSongAnalysisObj['segments']:
            aReqSegVal = {'song_id':aSongID,'id':pSegmentID, 'start':aSegments['start'], 'duration':aSegments['duration'],\
                           'confidence':aSegments['confidence'], 'loudness_start':aSegments['loudness_start'],\
                         'loudness_max':aSegments['loudness_max'], 'loudness_max_time':aSegments['loudness_max_time'],\
                         'loudness_end':aSegments['loudness_end']}
            aTempList.append(aReqSegVal)
            aReqPitchVal = {'segment_id':pSegmentID, 'I':aSegments['pitches'][0] ,'II':aSegments['pitches'][1] ,\
                            'III':aSegments['pitches'][2] ,'IV':aSegments['pitches'][3] ,'V':aSegments['pitches'][4] ,\
                            'VI':aSegments['pitches'][5] ,'VII':aSegments['pitches'][6] ,'VIII':aSegments['pitches'][7] ,\
                            'IX':aSegments['pitches'][8] ,'X':aSegments['pitches'][9] ,'XI':aSegments['pitches'][10] ,\
                            'XII':aSegments['pitches'][11] }
            aPitchList.append(aReqPitchVal)
            aReqTimbreVal = {'segment_id':pSegmentID, 'I':aSegments['timbre'][0] ,'II':aSegments['timbre'][1] ,\
                            'III':aSegments['timbre'][2] ,'IV':aSegments['timbre'][3] ,'V':aSegments['timbre'][4] ,\
                            'VI':aSegments['timbre'][5] ,'VII':aSegments['timbre'][6] ,'VIII':aSegments['timbre'][7] ,\
                            'IX':aSegments['timbre'][8] ,'X':aSegments['timbre'][9] ,'XI':aSegments['timbre'][10] ,\
                            'XII':aSegments['timbre'][11] }
            aTimbreList.append(aReqTimbreVal)
            pSegmentID = pSegmentID + 1
        mSongSegmentsDF.append(mPandasObj.DataFrame(aTempList))
        mSongPitchesDF.append(mPandasObj.DataFrame(aPitchList))
        mSongTimbreDF.append(mPandasObj.DataFrame(aTimbreList))
    return mMissingAnalysisIDs, mSongTimeFrameDF, mSongSectionsDF, mSongSegmentsDF, mSongPitchesDF, mSongTimbreDF, pSegmentID

In [None]:
mMissingAnalysisIDs = []
mSongTimeFrameDF = []
mSongSectionsDF = []
mSongSegmentsDF = []
mSongPitchesDF = []
mSongTimbreDF = []
aSegmentID = 1
mSongIDs = mTopDJSongsDF['id']
mSongIDSplits = mNP.array_split(mSongIDs, 200)
for aSongIDList in mSongIDSplits:
    aMissingIDs, aTimeFrameDF, aSectionsDF, aSegmentDF, aPitchDF, aTimbreDF, aSegmentID =\
    AddSongsToAnalysisDF(aSongIDList, aSegmentID)
    mMissingAnalysisIDs.extend(aMissingIDs)
    mSongTimeFrameDF.append(aTimeFrameDF)
    mSongSectionsDF.append(aSectionsDF)
    mSongSegmentsDF.append(aSegmentDF)
    mSongPitchesDF.append(aPitchDF)
    mSongTimbreDF.append(aTimbreDF)
    print("Songs Scraped = ",len(aSongIDList), "Missing Values = ", len(aMissingIDs),"Shape Test =", aTimeFrameDF[0].shape)
mSongTimeFrameDF = mPandasObj.concat(mSongTimeFrameDF)
mSongSectionsDF = mPandasObj.concat(mSongSectionsDF)
mSongSegmentsDF = mPandasObj.concat(mSongSegmentsDF)
mSongPitchesDF = mPandasObj.concat(mSongPitchesDF)
mSongTimbreDF = mPandasObj.concat(mSongTimbreDF)

Songs Scraped =  74 Missing Values =  0 Shape Test = (996, 5)
Songs Scraped =  74 Missing Values =  0 Shape Test = (1428, 5)
Songs Scraped =  74 Missing Values =  0 Shape Test = (1378, 5)
Songs Scraped =  74 Missing Values =  0 Shape Test = (1349, 5)
Songs Scraped =  74 Missing Values =  0 Shape Test = (2295, 5)
Can't find Song Analysis for Song 6JH56gZC7EJDcoxabVcWVL
Songs Scraped =  74 Missing Values =  1 Shape Test = (1553, 5)
Songs Scraped =  74 Missing Values =  0 Shape Test = (2412, 5)
Songs Scraped =  74 Missing Values =  0 Shape Test = (1170, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1315, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1605, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (2257, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1303, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1285, 5)
Can't find Song Analysis for Song 4WiBtKNGtgjYaxb546pnau
Songs Scraped =  73 Missing Values =  1 Shape Test = (1501, 5)
Songs

Max Retries reached


Can't find Song Analysis for Song 2R5eUSyhOKNNPw7zzLZwEL
Songs Scraped =  73 Missing Values =  1 Shape Test = (1297, 5)
Can't find Song Analysis for Song 0ZvfgQqegIyGLCl22uTssf
Songs Scraped =  73 Missing Values =  1 Shape Test = (792, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1651, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1741, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1700, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (997, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (2600, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1631, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1592, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1437, 5)
Can't find Song Analysis for Song 5G1FutZrc8F7VuFnd4r6GG
Songs Scraped =  73 Missing Values =  1 Shape Test = (3005, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (3304, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1001, 5)
Songs Scrape

Max Retries reached


Can't find Song Analysis for Song 1j5BvC67aRkHaqX93G0Et4
Songs Scraped =  73 Missing Values =  1 Shape Test = (1244, 5)


Max Retries reached


Can't find Song Analysis for Song 5CRUYbExDj1jaqhhY0oxrQ
Songs Scraped =  73 Missing Values =  1 Shape Test = (2868, 5)


Max Retries reached


Can't find Song Analysis for Song 4joSJSEcddADmMgXSl9cvj
Songs Scraped =  73 Missing Values =  1 Shape Test = (1920, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1315, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1637, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1426, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1560, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1229, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (2188, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1607, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (1496, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (419, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (610, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (743, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (2233, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (2583, 5)
Songs Scraped =  73 Missing Values =  0 Shape Test = (2336, 5)
C

## Remove Songs without Audio Analysis
***
1. Songs that have missing audio analysis can't be used in Dataset so will be removed
***

In [None]:
mTopDJSongsDF = mTopDJSongsDF[~mTopDJSongsDF['id'].isin(mMissingAnalysisIDs)]
mSongFeaturesDF = mSongFeaturesDF[~mSongFeaturesDF['id'].isin(mMissingAnalysisIDs)]

In [None]:
mTopDJSongsDF.drop_duplicates(inplace=True)
mTopDJSongsDF.shape

In [None]:
mSongFeaturesDF.drop_duplicates(inplace=True)
mSongFeaturesDF.shape

In [None]:
mArtistDataset.drop_duplicates(inplace=True)
mArtistDataset.shape

In [None]:
mSongTimeFrameDF.drop_duplicates(inplace=True)
mSongTimeFrameDF.shape

In [None]:
mSongSectionsDF.drop_duplicates(inplace=True)
mSongSectionsDF.shape

In [None]:
mSongSegmentsDF.drop_duplicates(inplace=True)
mSongSegmentsDF.shape

In [None]:
mSongPitchesDF.drop_duplicates(inplace=True)
mSongPitchesDF.shape

In [None]:
mSongTimbreDF.drop_duplicates(inplace=True)
mSongTimbreDF.shape

## Store all DataFrames as CSV Files
***
1. Store all eight dataframes as CSV Files by using to_csv
2. These datasets will then be used in another file for EDA
3. After EDA and Model selection, the datasets will be broken down in Train, Test, Validate for Model
***

In [None]:
mArtistDataset.to_csv('Top100Artists.csv', index=False)
mTopDJSongsDF.to_csv('TopSongsByArtists.csv', index=False)
mSongFeaturesDF.to_csv('TopSongAudioFeatures.csv', index=False)
mSongTimeFrameDF.to_csv('SongTimeFrameData.csv', index=False)
mSongSectionsDF.to_csv('SongSectionData.csv', index=False)
mSongSegmentsDF.to_csv('SongSegmentData.csv', index=False)
mSongPitchesDF.to_csv('SegmentPitchesData.csv', index=False)
mSongTimbreDF.to_csv('SegmentTimbreData.csv', index=False)