///-------------------------------------------------------------------------------------------------<br>
// File: Dataset Preparation.ipynb<br>
//<br>
// Author: Dakshvir Singh Rehill<br>
// Date: 14/10/2020<br>
//<br>
// Summary:	This notebook is used to generate the dataset from DJMag Top 100 DJs and Spotify API<br>
///-------------------------------------------------------------------------------------------------
***

## Get Top 100 DJs from DJMag
***
1. Import requests package
2. Use requests to fetch Top 100 page from DJMag
3. Import beautifulsoup package
4. Use beautifulsoup to get DJ name
***

In [None]:
import requests
from bs4 import BeautifulSoup
# Scrape the DJ names from the DJMag Top 100 page for 2019.
mScrapeURL = 'https://djmag.com/top100dj?year=2019'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error
# page and silently producing an empty name list.
mPageHTML = requests.get(mScrapeURL, timeout=30)
mPageHTML.raise_for_status()
mParsedObject = BeautifulSoup(mPageHTML.content, 'html.parser')
# Each DJ name is the text of the <a> inside a <div class="top100dj-name">.
mDJNameElemList = mParsedObject.find_all('div', class_='top100dj-name')
mDJNamesList = []
for aDJNameElem in mDJNameElemList:
    aDJNameLink = aDJNameElem.find('a')
    # find() returns None when the div has no anchor; skip it rather than crash.
    if aDJNameLink is not None:
        mDJNamesList.append(aDJNameLink.text)

## Get DJ Popularity, ID from Spotify API
***
1. Import spotipy package
2. Use spotipy to set up App Credentials
3. Search for each artist
4. Import pandas package
5. Create DataFrame with Artist Details
***

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as mNP
import pandas as mPandasObj
from IPython.display import display
# Spotify Web API client authenticated with the client-credentials flow.
# The client id/secret are intentionally not written in the notebook: when they
# are omitted, SpotifyClientCredentials reads them from the SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET environment variables, so set both before running this cell.
# NOTE(review): the key pair previously committed on these lines should be
# treated as leaked and rotated in the Spotify developer dashboard.
# requests_timeout=None is passed through unchanged from the original call.
mSpotifyClient = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(requests_timeout=None)
)

In [None]:
# For every scraped DJ name, query Spotify for artists and keep the id, name and
# popularity of the most popular hit. On equal popularity the later hit wins.
# A name with no hits keeps the empty placeholder record.
mArtistList = []
for aArtistName in mDJNamesList:
    aSearchResult = mSpotifyClient.search(q=aArtistName, type='artist', market='CA')
    aSelectedArtist = {'id': '', 'name': '', 'popularity': 0}
    for aArtist in aSearchResult['artists']['items']:
        if aArtist['popularity'] < aSelectedArtist['popularity']:
            # Strictly less popular than the current pick: ignore it.
            continue
        aSelectedArtist = {
            aField: aArtist[aField] for aField in ('id', 'name', 'popularity')
        }
    mArtistList.append(aSelectedArtist)

# One row per DJ; the trailing expression shows (rows, columns) as the cell output.
mArtistDataset = mPandasObj.DataFrame(mArtistList)
mArtistDataset.shape

## Get Top 150 songs of each Artist
***
1. Search for type track using artist name
2. Get ID, Name, Popularity of Track and concatenate into Artist DataFrame
3. Store each dataframe in list
4. Concatenate all dataframes into one dataset
***

In [None]:
# For each artist, run a track search on the artist name and collect three
# pages of 50 results (offsets 0, 50, 100). Only id, name and popularity are
# kept from each track, and the artist's own details are added to every row.
mAllArtistDFList = []
for aArtist in mArtistList:
    aArtistDFList = [
        mPandasObj.DataFrame(
            mSpotifyClient.search(
                q=aArtist['name'],
                type='track',
                market='CA',
                offset=aCurOffset,
                limit=50,
            )['tracks']['items'],
            columns=['id', 'name', 'popularity'],
        )
        for aCurOffset in range(0, 150, 50)
    ]
    aArtistTopSongDF = mPandasObj.concat(aArtistDFList).assign(
        artist_id=aArtist['id'],
        artist_name=aArtist['name'],
        artist_popularity=aArtist['popularity'],
    )
    mAllArtistDFList.append(aArtistTopSongDF)

# Stack every artist's tracks into one table and show its (rows, columns).
mTopDJSongsDF = mPandasObj.concat(mAllArtistDFList)
mTopDJSongsDF.shape

## Get Audio Features of All songs in TopSongs DF
***
1. Use Audio Features function to get all audio features
2. Get 'id', 'key', 'mode', 'time_signature', 'acousticness', 'danceability', 'energy', 'instrumentalness' ,'liveness' ,'loudness' ,'speechiness' ,'valence' ,'tempo' ,'duration_ms' and concatenate into one DF
3. Store each dataframe in list
4. Concatenate all dataframes into one dataset
***

In [None]:
# Fetch the Spotify audio features for every track collected per artist.
# Audio-feature fields kept for the dataset, in output-column order.
mFeatureKeys = ('id', 'key', 'mode', 'time_signature', 'acousticness', 'danceability', 'energy',
                'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence', 'tempo', 'duration_ms')
# Track ids sent per audio_features call (same cap of 100 the original slices used).
mFeatureBatchSize = 100

mAllArtistFeatureDFList = []
for aArtistDF in mAllArtistDFList:
    aAllIDList = aArtistDF['id'].tolist()
    aFeatureDictList = []
    # Walk the ids in consecutive batches so each track is requested exactly once,
    # however many rows the artist has; an artist with no tracks makes no request.
    for aBatchStart in range(0, len(aAllIDList), mFeatureBatchSize):
        aIDList = aAllIDList[aBatchStart:aBatchStart + mFeatureBatchSize]
        aAudioFeatures = mSpotifyClient.audio_features(tracks=aIDList)
        for aAF in aAudioFeatures:
            # Entries can be None (tracks without audio features); skip them.
            if aAF is not None:
                aFeatureDictList.append({aKey: aAF[aKey] for aKey in mFeatureKeys})
    aFeatureDF = mPandasObj.DataFrame(aFeatureDictList)
    mAllArtistFeatureDFList.append(aFeatureDF)

# Stack the per-artist feature tables and show the resulting (rows, columns).
mSongFeaturesDF = mPandasObj.concat(mAllArtistFeatureDFList)
mSongFeaturesDF.shape

## Remove Songs without Audio Features
***
1. Songs that have missing audio features can't be used in Dataset so will be removed
***

In [None]:
# Keep one row per track id in both tables (the first occurrence is retained).
mSongFeaturesDF = mSongFeaturesDF.drop_duplicates(subset='id')
mTopDJSongsDF = mTopDJSongsDF.drop_duplicates(subset='id')
# Inner join on id: tracks with no audio-feature row drop out of the result.
mTopDJSongsDF = mTopDJSongsDF.merge(mSongFeaturesDF, how='inner', on='id')

In [None]:
# (rows, columns) of the audio-feature table; as the cell's last expression it is shown as the cell output.
mSongFeaturesDF.shape

In [None]:
# (rows, columns) of the song table; as the cell's last expression it is shown as the cell output.
mTopDJSongsDF.shape

## Store all DataFrames as CSV Files
***
1. Store the merged master dataframe as a CSV file (`TopSongsMaster.csv`) by using to_csv
2. This dataset will then be used in another file for EDA
3. After EDA and model selection, the dataset will be broken down into Train, Test and Validate sets for the model
***

In [None]:

# Write the song table to TopSongsMaster.csv (path relative to the working directory); index=False leaves the DataFrame index out of the file.
mTopDJSongsDF.to_csv('TopSongsMaster.csv', index=False)