///-------------------------------------------------------------------------------------------------<br>
// File: Dataset Preparation.ipynb<br>
//<br>
// Author: Dakshvir Singh Rehill<br>
// Date: 14/10/2020<br>
//<br>
// Summary:	This notebook is used to generate the dataset from Spotify API<br>
///-------------------------------------------------------------------------------------------------
***

## Get Songs from Spotify API
***
1. Import spotipy package
2. Use spotipy to set up App Credentials
3. Search for each artist
5. Import pandas package
4. Create DataFrame with Artist Details
***

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as np
import pandas as pd
from IPython.display import display
import lyricsgenius as lg
import nltk

In [2]:
nltk.download('words')
words = set(nltk.corpus.words.words())
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="7f839484fcf4450db86dcc9126cec5e1",\
                                client_secret="80e9b5a936a44b2880354a719ef54019", requests_timeout = None))
genius_connection = lg.Genius('TsHjV_cvvYM8jJNUee7VMSF20TF2WqmJTK8NbxKZELXBRHKtR3oIB0Bc1QTsehGQ')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\daksh\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
#get all possible genres and get songs of those genres
genres = sp.recommendation_genre_seeds()
genres = genres['genres']
tracks_by_genre = []
for genre in genres:
    tracks = sp.recommendations(seed_genres = [genre],country='CA',limit = 100)
    for track in tracks['tracks']:
        artist_list = []
        for artist_obj in track['artists']:
            artist_list.append(artist_obj['name'])
        track_dict = {'id' : track['id'] , 'name' : track['name'] , 'popularity' : track['popularity'], 'genre' : genre\
                     ,'album' : track['album']['name'], 'artists' : ';'.join(artist_list)}
        tracks_by_genre.append(track_dict)
tracks_by_genre_df = pd.DataFrame(tracks_by_genre)
tracks_by_genre_df.head()

Unnamed: 0,id,name,popularity,genre,album,artists
0,7wj1Lb5ggzMWJ9TqZJ2dwo,Up,54,acoustic,My World 2.0,Justin Bieber
1,5SdG78xwNRsjXmFGhm9Z8D,Om du letar efter nån,36,acoustic,Innan jag kände dig,Melissa Horn
2,49T9pCYlpvtGd9ugZt7DSZ,If I Could,53,acoustic,In Between Dreams,Jack Johnson
3,0TJWjCuRZLRrBSZLBIF7EW,Up,45,acoustic,The Awakening,James Morrison;Jessie J
4,1EAgPzRbK9YmdOESSMUm6P,Home,55,acoustic,The World From The Side Of The Moon,Phillip Phillips


In [4]:
tracks_by_genre_df.shape

(12363, 6)

In [5]:
song_features = []
track_ids = tracks_by_genre_df.id.tolist()
for i in range(0,len(track_ids) - 100,100):
    end_val = i + 100
    if end_val > len(track_ids):
        end_val = len(track_ids)
    audio_features_obj = sp.audio_features(tracks = track_ids[i:end_val])
    for audio_feature in audio_features_obj:
        if audio_feature is not None:
            features = {'id':audio_feature['id'],'key':audio_feature['key'],'mode':audio_feature['mode'],\
                        'time_signature':audio_feature['time_signature'],'acousticness':audio_feature['acousticness'],\
                        'danceability':audio_feature['danceability'],'energy':audio_feature['energy'],'instrumentalness':audio_feature['instrumentalness'],\
                        'liveness':audio_feature['liveness'],'loudness':audio_feature['loudness'],'speechiness':audio_feature['speechiness'],\
                        'valence':audio_feature['valence'],'tempo':audio_feature['tempo'],'duration_ms':audio_feature['duration_ms']}
            song_features.append(features)
song_features_df = pd.DataFrame(song_features)
song_features_df.shape

(12300, 14)

In [6]:
song_features_df.head()

Unnamed: 0,id,key,mode,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,duration_ms
0,7wj1Lb5ggzMWJ9TqZJ2dwo,0,1,4,0.084,0.711,0.717,0.0,0.09,-4.317,0.0396,0.589,125.949,234933
1,5SdG78xwNRsjXmFGhm9Z8D,10,1,4,0.81,0.556,0.284,9.5e-05,0.187,-15.177,0.0302,0.658,130.009,245000
2,49T9pCYlpvtGd9ugZt7DSZ,9,1,4,0.762,0.714,0.321,0.385,0.0993,-14.517,0.0428,0.477,114.228,144987
3,0TJWjCuRZLRrBSZLBIF7EW,11,0,4,0.425,0.437,0.632,0.0,0.445,-6.781,0.0339,0.332,72.845,218411
4,1EAgPzRbK9YmdOESSMUm6P,0,1,4,0.0256,0.606,0.826,1.6e-05,0.117,-6.04,0.0307,0.322,121.04,210173


## Remove Songs without Audio Features
***
1. Songs that have missing audio features can't be used in Dataset so will be removed
***

In [7]:
top_songs_df = pd.merge(tracks_by_genre_df,song_features_df, on = 'id')

In [8]:
top_songs_df.head()

Unnamed: 0,id,name,popularity,genre,album,artists,key,mode,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,duration_ms
0,7wj1Lb5ggzMWJ9TqZJ2dwo,Up,54,acoustic,My World 2.0,Justin Bieber,0,1,4,0.084,0.711,0.717,0.0,0.09,-4.317,0.0396,0.589,125.949,234933
1,5SdG78xwNRsjXmFGhm9Z8D,Om du letar efter nån,36,acoustic,Innan jag kände dig,Melissa Horn,10,1,4,0.81,0.556,0.284,9.5e-05,0.187,-15.177,0.0302,0.658,130.009,245000
2,5SdG78xwNRsjXmFGhm9Z8D,Om du letar efter nån,36,acoustic,Innan jag kände dig,Melissa Horn,10,1,4,0.81,0.556,0.284,9.5e-05,0.187,-15.177,0.0302,0.658,130.009,245000
3,5SdG78xwNRsjXmFGhm9Z8D,Om du letar efter nån,36,swedish,Innan jag kände dig,Melissa Horn,10,1,4,0.81,0.556,0.284,9.5e-05,0.187,-15.177,0.0302,0.658,130.009,245000
4,5SdG78xwNRsjXmFGhm9Z8D,Om du letar efter nån,36,swedish,Innan jag kände dig,Melissa Horn,10,1,4,0.81,0.556,0.284,9.5e-05,0.187,-15.177,0.0302,0.658,130.009,245000


In [9]:
top_songs_df.shape

(18124, 19)

In [10]:
unique_ids = top_songs_df.id.unique().tolist()
for id in unique_ids:
    index = top_songs_df.id == id
    if sum(index) == 1:
        continue
    genres = top_songs_df[index].genre.tolist()
    genres = [str(genre) for genre in genres]
    genre_val = ';'.join(genres)
    top_songs_df.loc[index,'genre'] = genre_val

In [11]:
top_songs_df.head()

Unnamed: 0,id,name,popularity,genre,album,artists,key,mode,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,duration_ms
0,7wj1Lb5ggzMWJ9TqZJ2dwo,Up,54,acoustic,My World 2.0,Justin Bieber,0,1,4,0.084,0.711,0.717,0.0,0.09,-4.317,0.0396,0.589,125.949,234933
1,5SdG78xwNRsjXmFGhm9Z8D,Om du letar efter nån,36,acoustic;acoustic;swedish;swedish,Innan jag kände dig,Melissa Horn,10,1,4,0.81,0.556,0.284,9.5e-05,0.187,-15.177,0.0302,0.658,130.009,245000
2,5SdG78xwNRsjXmFGhm9Z8D,Om du letar efter nån,36,acoustic;acoustic;swedish;swedish,Innan jag kände dig,Melissa Horn,10,1,4,0.81,0.556,0.284,9.5e-05,0.187,-15.177,0.0302,0.658,130.009,245000
3,5SdG78xwNRsjXmFGhm9Z8D,Om du letar efter nån,36,acoustic;acoustic;swedish;swedish,Innan jag kände dig,Melissa Horn,10,1,4,0.81,0.556,0.284,9.5e-05,0.187,-15.177,0.0302,0.658,130.009,245000
4,5SdG78xwNRsjXmFGhm9Z8D,Om du letar efter nån,36,acoustic;acoustic;swedish;swedish,Innan jag kände dig,Melissa Horn,10,1,4,0.81,0.556,0.284,9.5e-05,0.187,-15.177,0.0302,0.658,130.009,245000


In [12]:
top_songs_df.drop_duplicates(inplace=True)

In [13]:
top_songs_df.shape

(10782, 19)

In [27]:
genius_connection.verbose = False
genius_connection.skip_non_songs = True
def add_lyrics(row):
    try:
        song_data = genius_connection.search_song(row['name'] + " " + row['album'])
    except:
        song_data = None
    if song_data is None:
        row['lyrics'] = np.nan
    else:
        row['lyrics'] = song_data.lyrics
        row['lyrics'] = " ".join(w for w in nltk.wordpunct_tokenize(row['lyrics']) if w.lower() in words or not w.isalpha())

In [28]:
top_songs_df['lyrics'] = ''
top_songs_df.apply (lambda row: add_lyrics(row), axis=1)

Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (re

0        None
1        None
5        None
9        None
10       None
         ... 
18119    None
18120    None
18121    None
18122    None
18123    None
Length: 10782, dtype: object

In [None]:
top_songs_df.to_csv('SongsByGenre.csv', index=False)

In [None]:
top_songs_df.isnull().sum()