# scrape lyrics data from genius

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import lyricsgenius
from time import sleep
import re
import sys
import numpy as np
from requests.exceptions import Timeout
from random import randint
%matplotlib inline

## genius references

1. [Get Genius client access token from here](https://genius.com/api-clients)
2. [Handling Genius Connection Timeouts](https://github.com/johnwmillr/LyricsGenius/issues/121)

In [None]:
#genius client access token
#the token is read from the environment so the credential is never stored in the notebook;
#any token that was previously committed in this cell should be revoked and regenerated
import os

client_access_token = os.environ.get("GENIUS_ACCESS_TOKEN")
if not client_access_token:
    raise RuntimeError(
        "Set the GENIUS_ACCESS_TOKEN environment variable to your Genius client access token"
    )
#timeout and sleep can be adjusted
genius = lyricsgenius.Genius(client_access_token, timeout=2, sleep_time=5)

In [None]:
#directory the per-track lyrics files are downloaded into (keep the trailing slash)
path = "/Users/gautham/Documents/Documents - gBookPro/Berkeley MIMS/Semester 1/256 - ANLP/anlp21-project/test_dwn/"
#csv holding the spotify data dump used as input
filename = "/Users/gautham/Downloads/playlist_tracks.csv"

In [None]:
#load the spotify dump and discard its 'Unnamed: 0' column
#(presumably an index column left over from an earlier csv export)
lyrics_df = pd.read_csv(filename).drop(columns='Unnamed: 0')
#every track starts with an empty lyrics string that is filled in later
lyrics_df['lyrics'] = ""

In [None]:
#function to clean title using regexes to improve Genius API search result accuracy
def clean_title(title):
    """Strip version/credit suffixes from a track title.

    Everything from the first " - " separator or the first "(" onwards is
    dropped and trailing whitespace is removed, e.g.
    "Hey Jude - Remastered 2015" -> "Hey Jude" and
    "Stay (with Justin Bieber)" -> "Stay".

    Args:
        title: raw track name.

    Returns:
        The cleaned title. The input is returned unchanged when it is not a
        string, contains no suffix marker, or cleaning would leave it empty.
    """
    #non-string values (e.g. NaN from a missing csv cell) cannot be searched
    if not isinstance(title, str):
        return title

    #a hyphen only counts as a separator when surrounded by whitespace, so
    #hyphenated titles such as "Ob-La-Di" are left intact; the search starts at
    #index 1 so a title that opens with "(" keeps its leading parenthetical
    marker = re.compile(r'\s+-\s+|\s*\(').search(title, 1)
    if marker is None:
        return title

    title_cln = title[:marker.start()].rstrip()
    #never hand an empty search term to the API
    return title_cln or title

In [None]:
#preview the first rows of the dataframe to check the import
lyrics_df.head()

## approach 1 - using `df.iterrows`

`df.iterrows` yields a copy of each row, so assigning to `row` does not [mutate the df](https://stackoverflow.com/questions/31458794/python-using-iterrows-to-create-columns). Instead, the lyrics for each song are saved to a local file named after Spotify's unique `track_id` field, to be read in later.

In [None]:
#iterate through all rows using iterrows()
#requests' ConnectionError does not derive from the builtin ConnectionError,
#so it has to be imported and caught by name for the retry logic to see it
from requests.exceptions import ConnectionError as RequestsConnectionError

for idx, row in lyrics_df.iterrows():
    print("Handling row {}".format(str(idx)))

    #get track_id for indexing each txt file created
    track_id = str(row['track_id'])
    #grab title and clean using helper function
    title_   = str(clean_title(row['track_name']))
    #grab artist name
    artist_  = str(row['track_artist_name'])

    #up to 5 attempts in case of timeout/connection exceptions
    song = None
    for attempt in range(1, 6):
        try:
            #search Genius API for song and create a song object
            song = genius.search_song(title_, artist_)
        #handle timeout and connection errors
        except (Timeout, ConnectionError, RequestsConnectionError):
            print("Retry {} for song {}".format(str(attempt), str(title_)))
            continue
        break
    else:
        #every attempt failed: report it and move on to the next row
        print("Giving up on song {} after 5 failed attempts".format(str(title_)))
        continue

    #if a song was retrieved, get lyrics text for it
    if song is not None:
        text_lyrics = song.to_text(sanitize=True)

        #assigning to row['lyrics'] would not update lyrics_df (see the
        #StackOverflow article cited above), so the lyrics are written to a
        #file indexed by track_id; utf-8 keeps non-ASCII lyrics writable on
        #every platform, and the with-block closes the file
        with open(path + track_id + '.txt', "w", encoding="utf-8") as file:
            file.write(text_lyrics)

## approach 2 - using `df[column_name][idx]` indexing

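`df[column_name][idx]` indexing allowed us to mutate the df but raised this [warning message](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy) (`SettingWithCopyWarning`), because chained indexing may assign to a temporary copy instead of the df itself.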

In [None]:
#iterate through all rows by index label so lyrics can be written back into the df
#requests' ConnectionError does not derive from the builtin ConnectionError,
#so it has to be imported and caught by name for the retry logic to see it
from requests.exceptions import ConnectionError as RequestsConnectionError

for row_num, i in enumerate(lyrics_df.index, start=1):
    print("Handling row {}".format(str(row_num)))

    #get track_id for indexing each txt file created
    track_id = str(lyrics_df.at[i, 'track_id'])
    #grab title and clean using helper function
    title_   = clean_title(lyrics_df.at[i, 'track_name'])
    #grab artist name
    artist_  = lyrics_df.at[i, 'track_artist_name']

    #up to 5 attempts in case of timeout/connection exceptions
    song = None
    for attempt in range(1, 6):
        try:
            #search Genius API for song and create a song object
            song = genius.search_song(title_, artist_)
        #handle timeout and connection errors
        except (Timeout, ConnectionError, RequestsConnectionError):
            print("Retry {} for song {}".format(str(attempt), str(title_)))
            continue
        break
    else:
        #every attempt failed: leave the lyrics cell untouched and move on
        print("Giving up on song {} after 5 failed attempts".format(str(title_)))
        continue

    #if a song was retrieved, get lyrics text for it
    if song is not None:
        text_lyrics = song.to_text(sanitize=True)
        #mutate df, add lyrics; .at assigns directly into lyrics_df, whereas
        #chained lyrics_df['lyrics'][i] may write to a temporary copy
        lyrics_df.at[i, 'lyrics'] = text_lyrics
        #write lyrics to file indexed by track_id; utf-8 keeps non-ASCII
        #lyrics writable on every platform, and the with-block closes the file
        with open(path + track_id + '.txt', "w", encoding="utf-8") as file:
            file.write(text_lyrics)
    #if no lyrics retrieved, add None to corresponding row
    else:
        lyrics_df.at[i, 'lyrics'] = None

In [None]:
#lyrics_df.head()