# Using Spotify and Genius APIs to Pull out Lyrics for Top Songs Based on Streaming Popularity

In [10]:
# Import dependencies
import requests
import numpy as np
import pandas as pd
from config import auth, genius_bearer_token
import json
from bs4 import BeautifulSoup
import re

## Working with Spotify API

In [2]:
# Set authorization header using encoded spotify authentication
headers = {'Authorization': 'Basic ' + auth}

# Set data for making call to spotify
data = [('grant_type', 'client_credentials')]

# Make post request to get token
response = requests.post('https://accounts.spotify.com/api/token', headers = headers, data = data).json()

token = response['access_token']

# Set authorization header using token
headers = {'Authorization': 'Bearer ' + token}

In [3]:
# Spotify endpoint to get all "categories" // moods
mood_endpoint = "https://api.spotify.com/v1/browse/categories"

# Make get request
mood_response = requests.get(mood_endpoint, headers = headers).json()

# I remember us mentioning possibly looking at moods, so while
# I'm not using this downstream, I figured I would leave it to
# build on later if we have the opportunity.

In [4]:
# Get all playlist names within spotify "Top List" category
toplist_endpoint = "https://api.spotify.com/v1/browse/categories/toplists/playlists"

# Make get request
toplist_response = requests.get(toplist_endpoint, headers = headers).json()

# Make variable to shorten typing downstream
playlists = toplist_response['playlists']['items']

# Collect track lists for each playlist
toplists_tracks = {}

# for each number in array of digits 0 - number of playlists
for i in np.arange(len(playlists)):
    # URL to fetch spotify track ids for songs in each playlist
    tracklist_endpoint = playlists[i]['tracks']['href']

    # Make get request
    tracklist_response = requests.get(tracklist_endpoint, headers = headers).json()
    
    # Collect track ids
    track_id = []
    
    # for each number in array of digits 0 - number of tracks in playlist
    for q in np.arange(len(tracklist_response['items'])):
        # Append track id to list
        track_id.append(tracklist_response['items'][q]['track']['id'])
    
    # Add dictionary entry of playlist with value of list of track ids
    toplists_tracks[playlists[i]['name']] = track_id

In [5]:
# Get all song data for songs within toplists_tracks
playlists_data = []
for key, value in toplists_tracks.items():
    
    # Initialize lists
    playlist = key
    track_name = []
    artist_name = []
    duration = []
    popularity = []
    instrumentalness = []
    
    # For each number in array of digits 0 - number of tracks in track list 
    for i in np.arange(len(value)):
        # Paste together track data endpoint for each track
        track_endpoint = f"https://api.spotify.com/v1/tracks/{value[i]}"
        
        # Make get request
        track_response = requests.get(track_endpoint, headers = headers).json()
        
        try:
            # Extract data to lists
            track_name.append(track_response['name'])
            artist_name.append(track_response['artists'][0]['name'])  # Only considering primary artist
            duration.append(track_response['duration_ms'])
            popularity.append(track_response['popularity'])
        except:
            track_name.append("NaN")
            artist_name.append("NaN")
            duration.append("NaN")
            popularity.append("NaN")
        
        # Paste together audio features endpoint for each track
        audio_endpoint = f"https://api.spotify.com/v1/audio-features/{value[i]}"
        
        try:
            # Make get request
            audio_response = requests.get(audio_endpoint, headers = headers).json()
            instrumentalness.append(audio_response['instrumentalness'])
            
        except KeyError:
            #print(f"No audio features page for track {track_response['name']} by {track_response['artists'][0]['name']}")
            instrumentalness.append("NaN")
        
    # Gather information into dictionary and append that dictionary to playlists_data
    track_dict = {"Playlist": playlist,
                  "Track Name": track_name,
                  "Artist Name": artist_name,
                  "Duration (ms)": duration,
                  "Popularity": popularity,
                  "Instrumentalness": instrumentalness}
    playlists_data.append(track_dict)

In [6]:
# Dataframes by playlist
today_top_hits = pd.DataFrame(playlists_data[0])
rap_caviar = pd.DataFrame(playlists_data[1])
mint = pd.DataFrame(playlists_data[2])
rock_this = pd.DataFrame(playlists_data[3])
are_and_be = pd.DataFrame(playlists_data[4])
viva_latino = pd.DataFrame(playlists_data[5])
hot_country = pd.DataFrame(playlists_data[6])
new_music_friday_malaysia = pd.DataFrame(playlists_data[7])
viral_hits = pd.DataFrame(playlists_data[8])
fresh_finds = pd.DataFrame(playlists_data[9])
global_top_50 = pd.DataFrame(playlists_data[10])
united_states_top_50 = pd.DataFrame(playlists_data[11])
global_viral_50 = pd.DataFrame(playlists_data[12])
united_states_viral_50 = pd.DataFrame(playlists_data[13])

In [7]:
today_top_hits.head()

Unnamed: 0,Playlist,Track Name,Artist Name,Duration (ms),Popularity,Instrumentalness
0,Today's Top Hits,Happier,Marshmello,214289,89,0.0
1,Today's Top Hits,Eastside (with Halsey & Khalid),benny blanco,173799,95,0.0
2,Today's Top Hits,Be Alright,Dean Lewis,196373,90,0.0
3,Today's Top Hits,Whenever (feat. Conor Maynard),Kris Kross Amsterdam,163636,69,0.0
4,Today's Top Hits,Promises (with Sam Smith),Calvin Harris,213309,90,5e-06


## Working with Genius API

In [20]:
# define the url parameters for the API request
genius_base_url = "http://api.genius.com"
genius_headers = {'Authorization': 'Bearer ' + genius_bearer_token}
genius_search_url = genius_base_url + "/search"

# define path to store lyrics at
lyric_base_path = "../Data_Files/Song_Lyrics"

# create list to store all dataframes of playlists
playlists = [today_top_hits, rap_caviar, mint, rock_this, are_and_be, 
             viva_latino, hot_country, new_music_friday_malaysia,
             viral_hits, fresh_finds, global_top_50, united_states_top_50,
             global_viral_50, united_states_viral_50]
playlists_names = ["today_top_hits", "rap_caviar", "mint", "rock_this", "are_and_be", 
             "viva_latino", "hot_country", "new_music_friday_malaysia",
             "viral_hits", "fresh_finds", "global_top_50", "united_states_top_50",
             "global_viral_50", "united_states_viral_50"]
playlist_counter = 0

In [21]:
# loop through all dataframes of playlists to store song lyrics
for playlist in playlists:

    # create new column in play list data frame for link to song lyric .txt file
    playlist["Lyrics Path"] = ""
    # define storage path to lyric .txt files
    playlist_lyrics_path = lyric_base_path + "/" + str(playlists_names[playlist_counter]) + "/"

    # loop through each row in the data frame
    for index, row in playlist.iterrows():

        # search for a test song
        song_title = row["Track Name"]
        artist_name = row["Artist Name"]
        genius_data = {'q': song_title + ' ' + artist_name}
        genius_response = requests.get(genius_search_url, params=genius_data, headers=genius_headers)
        genius_json = genius_response.json()

        # loop through hit in json request to pull out song url
        for hit in genius_json["response"]["hits"]:
            if hit["result"]["primary_artist"]["name"] == artist_name:
                song_url = hit["result"]["url"]
                break

        # use beautiful soup to crawl found song url and store the lyrics
        song_page = requests.get(song_url)
        song_html = BeautifulSoup(song_page.text, 'html.parser')
        song_lyrics = song_html.find('div', class_='lyrics').get_text()

        # store the lyrics in a .txt file and save to the project folder
        text_file_name = (song_title + "_" + artist_name)
        text_file_name = re.sub(r"[^a-zA-Z0-9_]+", '', text_file_name)
        txt_file = open(playlist_lyrics_path + text_file_name + ".txt", "w") 
        txt_file.write(song_lyrics) 
        txt_file.close()

        # store the path to the file in the dataframe
        playlist["Lyrics Path"][index] = (playlist_lyrics_path + text_file_name + ".txt")

    # save the dataframe as a csv file
    file_name = str(playlists_names[playlist_counter])
    file_name = re.sub(r"[^a-zA-Z0-9_]+", '', file_name)
    playlist.to_csv("../Data_Files/Play_Lists/" + file_name + ".csv")
    
    playlist_counter += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
