///-------------------------------------------------------------------------------------------------<br>
// File: Dataset Preparation.ipynb<br>
//<br>
// Author: Dakshvir Singh Rehill<br>
// Date: 14/10/2020<br>
//<br>
// Summary:	This notebook generates the song dataset from the Spotify API and adds lyrics from the Genius API<br>
///-------------------------------------------------------------------------------------------------
***

## Get Songs from Spotify API
***
1. Import spotipy package
2. Use spotipy to set up App Credentials
3. Get recommended tracks for each genre seed
4. Import pandas package
5. Create DataFrame with Track Details
***

In [1]:
import os

import lyricsgenius as lg
import nltk
import numpy as np
import pandas as pd
import spotipy
from IPython.display import display
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# English vocabulary used later to filter scraped lyrics down to known words.
# NOTE(review): presumably requires the NLTK "words" corpus to be installed
# locally (nltk.download('words')) - confirm on a fresh environment.
words = set(nltk.corpus.words.words())

# Credentials are read from the environment so that no secret is stored in the
# notebook. A missing variable raises KeyError naming the variable.
# NOTE(review): the keys that used to be embedded in this cell have been
# published with the notebook and should be revoked and regenerated.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.environ["SPOTIPY_CLIENT_ID"],
        client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
        requests_timeout=None,
    )
)
genius_connection = lg.Genius(os.environ["GENIUS_ACCESS_TOKEN"])

In [None]:
# Ask Spotify for every available genre seed, then request up to 100
# recommended tracks per genre and flatten them into one record per
# (track, genre) pair.
genres = sp.recommendation_genre_seeds()['genres']
tracks_by_genre = []
for genre in genres:
    recommended = sp.recommendations(seed_genres=[genre], country='CA', limit=100)
    tracks_by_genre.extend(
        {
            'id': item['id'],
            'name': item['name'],
            'popularity': item['popularity'],
            'genre': genre,
            'album': item['album']['name'],
            # Several artists on one track are stored as a ';'-separated string.
            'artists': ';'.join(performer['name'] for performer in item['artists']),
        }
        for item in recommended['tracks']
    )
tracks_by_genre_df = pd.DataFrame(tracks_by_genre)
tracks_by_genre_df.head()

In [None]:
# (rows, columns) of the per-genre track table built above.
tracks_by_genre_df.shape

In [None]:
# Number of ids sent per audio-features request (spotipy documents a maximum
# of 100 ids per call).
AUDIO_FEATURES_BATCH_SIZE = 100
# Fields copied from each audio-features object into the feature table.
AUDIO_FEATURE_KEYS = [
    'id', 'key', 'mode', 'time_signature', 'acousticness', 'danceability',
    'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
    'valence', 'tempo', 'duration_ms',
]

song_features = []
# The same track id can be listed under several genres; request each id only
# once (first-seen order kept) so the feature table has one row per track and
# the later merge on 'id' does not multiply rows.
track_ids = list(dict.fromkeys(tracks_by_genre_df.id.tolist()))
# Step through *all* ids. The previous upper bound of len(track_ids) - 100
# stopped one batch early, so the last (up to 100) tracks never got features.
for start in range(0, len(track_ids), AUDIO_FEATURES_BATCH_SIZE):
    # Slicing past the end is safe, so the final batch is simply shorter.
    batch_ids = track_ids[start:start + AUDIO_FEATURES_BATCH_SIZE]
    audio_features_obj = sp.audio_features(tracks=batch_ids)
    for audio_feature in audio_features_obj:
        # Entries are None for tracks without audio features; skip those.
        if audio_feature is not None:
            song_features.append({key: audio_feature[key] for key in AUDIO_FEATURE_KEYS})
song_features_df = pd.DataFrame(song_features)
song_features_df.shape

In [None]:
# Preview the first rows of the audio-feature table.
song_features_df.head()

## Remove Songs without Audio Features
***
1. Songs with missing audio features cannot be used in the dataset, so they are removed.
***

In [None]:
# Inner join on 'id': only tracks present in both tables survive, which drops
# every track that has no audio-feature row.
top_songs_df = tracks_by_genre_df.merge(song_features_df, how='inner', on='id')

In [None]:
# Preview the merged track + audio-feature table.
top_songs_df.head()

In [None]:
# (rows, columns) of the merged table.
top_songs_df.shape

In [None]:
def _join_unique_genres(genre_values):
    """Join the distinct genres of one track with ';', keeping first-seen order."""
    return ';'.join(dict.fromkeys(str(genre) for genre in genre_values))


# Give every row of a track the ';'-joined list of all genres that track was
# found under, so rows of the same track become identical and can be
# de-duplicated afterwards. A single groupby replaces the per-id scan of the
# whole frame, no longer shadows the builtin `id` or overwrites `genres`, and
# lists each genre once even when a track's rows are repeated.
top_songs_df['genre'] = top_songs_df.groupby('id')['genre'].transform(_join_unique_genres)

In [None]:
# Preview the table after the genre column was rewritten per track.
top_songs_df.head()

In [None]:
# Remove rows that are identical in every column (original index labels kept).
top_songs_df = top_songs_df.drop_duplicates()

In [3]:
# Load the song table from disk. This rebinds top_songs_df, discarding any
# frame built in the cells above.
# NOTE(review): no earlier cell writes SongsByGenre.csv - presumably it was
# saved in a previous session; confirm. The final cell writes to this same
# path, so on a re-run the file may already contain a 'lyrics' column.
top_songs_df = pd.read_csv('SongsByGenre.csv')

In [4]:
# (rows, columns) of the table loaded from the CSV.
top_songs_df.shape

(10782, 19)

In [5]:
# Configure the Genius client: verbose output off, non-song results skipped
# (see the lyricsgenius documentation for the exact effect of each flag).
genius_connection.verbose = False
genius_connection.skip_non_songs = True


def get_lyrics(song, album):
    """Look up a song on Genius and return its lyrics filtered to known tokens.

    The search query is the song title and the album name joined by a space.
    The lyrics of the match are split with ``nltk.wordpunct_tokenize``. A
    purely alphabetic token is kept only if its lower-cased form is in the
    module-level ``words`` set; every other token (punctuation, digits, mixed
    tokens) is kept as is. The surviving tokens are joined with single spaces.

    Parameters
    ----------
    song : str
        Track title.
    album : str
        Album name, appended to the title in the search query.

    Returns
    -------
    str or float
        The filtered lyrics, or ``np.nan`` when the search raises an error or
        returns no match.
    """
    try:
        song_data = genius_connection.search_song(song + " " + album)
    except Exception:
        # Best-effort lookup: any search failure counts as "no lyrics found".
        # Catching Exception instead of using a bare except lets
        # KeyboardInterrupt / SystemExit stop a long-running scrape.
        song_data = None
    if song_data is None:
        return np.nan
    tokens = nltk.wordpunct_tokenize(song_data.lyrics)
    return " ".join(w for w in tokens if w.lower() in words or not w.isalpha())

In [6]:
# Smoke test: fetch the lyrics for the first row of the table.
get_lyrics(top_songs_df['name'].iloc[0],top_songs_df['album'].iloc[0]) #test

"[ Hook ] I swear this house I it come , I used to bounce through it Now every time my dad he stupid to me Found you in [ Verse 1 ] , that ’ s my , Small as , don ’ t deserve to be a township Around 2 : 00 in the afternoon town ’ s booming But no area or downtown to sit Why two ’ s , ? Too many safe ways home Born after Y2K , turns out we ’ re ‘ for anxiety on society and low [ Hook ] I swear this house I it come , I used to bounce through it Now every time my dad he stupid to me Round 2 , bitch [ Verse 2 ] but no personality North to south probably only a mile , B No , signal when Look between the margin you see that are urgent The perfect suburban town you ’ ever about Need to get around , I never learn the route Quite quiet , you wake up when the ’ shout Streets are empty , no play like down Never had a snow day , perpetual drought Precipitation , when rain never go out No on the couch and a maid for the house Need a gardener to ensure that the will sprout [ Hook ] I swear this hous

In [7]:
# Start with an empty-string lyrics column, then fill it one row at a time so
# partial results stay in the frame if the run is interrupted.
top_songs_df['lyrics'] = ''
song_rows = zip(top_songs_df.index, top_songs_df['name'], top_songs_df['album'])
for row_label, title, album_name in song_rows:
    top_songs_df.loc[row_label, 'lyrics'] = get_lyrics(title, album_name)
# Null count per column; the 'lyrics' entry is the number of songs still
# without lyrics.
top_songs_df.isnull().sum()

Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5

id                     0
name                   0
popularity             0
genre                  0
album                  0
artists                0
key                    0
mode                   0
time_signature         0
acousticness           0
danceability           0
energy                 0
instrumentalness       0
liveness               0
loudness               0
speechiness            0
valence                0
tempo                  0
duration_ms            0
lyrics              3700
dtype: int64

In [9]:
#retry for missing values
# Only the rows whose lyrics are still null are looked up again. The row must
# come from this loop: the earlier version bound the loop variable to `rows`
# but read `row`, a leftover from the previous cell, so every retry queried
# the same song instead of the one actually missing.
missing_lyrics = top_songs_df[top_songs_df.lyrics.isnull()]
for iX, row in missing_lyrics.iterrows():
    top_songs_df.loc[iX,'lyrics'] = get_lyrics(row['name'],row['album'])

In [10]:
# Null count per column after the retry pass.
top_songs_df.isnull().sum()

id                     0
name                   0
popularity             0
genre                  0
album                  0
artists                0
key                    0
mode                   0
time_signature         0
acousticness           0
danceability           0
energy                 0
instrumentalness       0
liveness               0
loudness               0
speechiness            0
valence                0
tempo                  0
duration_ms            0
lyrics              3700
dtype: int64

In [11]:
# Write the table, including the 'lyrics' column, to CSV without the index.
# NOTE(review): this overwrites the same file the notebook loads earlier.
top_songs_df.to_csv('SongsByGenre.csv', index=False)