### extract_lyrics.ipynb

In [1]:
import os 
import sys
import numpy as np 
import pandas as pd
import requests
import base64
import time
import json
import csv
from requests.exceptions import ConnectionError

from glob import glob

#### First, set up the lyrics API

In [2]:
# Import the genius package, and set up the lyrics extractor
import lyricsgenius

# The Genius API token is read from the environment so the secret never lives
# in the notebook or in version control. Before starting Jupyter, run e.g.:
#     export GENIUS_ACCESS_TOKEN="<your token>"
# NOTE(review): a token was previously hard-coded in this cell; it should be
# revoked/rotated on the Genius side since it has been exposed.
genius_token = os.environ.get("GENIUS_ACCESS_TOKEN")
if not genius_token:
    raise RuntimeError(
        "Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API token."
    )

genius = lyricsgenius.Genius(
    genius_token,
    remove_section_headers=True,
    skip_non_songs=True,
    excluded_terms=["(Remix)", "(Live)"],
    sleep_time=3,  # space out calls to avoid rate-limit
    timeout=10,
    retries=3,
)

In [3]:
# Resolve ../data/playlist_subset relative to the current working directory,
# then list the files there that end in 'gz' (skipping hidden dot-files),
# in sorted order.
currPath = os.getcwd()
dataPath = os.path.abspath(os.path.join(currPath, os.pardir, 'data/playlist_subset'))
playlists = sorted(
    entry
    for entry in os.listdir(dataPath)
    if not entry.startswith('.') and entry.endswith('gz')
)

In [4]:
# Directory (inside dataPath) where the lyrics CSV is written; create it if it
# does not exist yet. exist_ok=True makes re-running this cell harmless.
lyrics_dir = os.path.join(dataPath, 'lyrics')
os.makedirs(lyrics_dir, exist_ok=True)

#### Determine which songs we will extract
We keep 1) the songs that appear in 20 or more playlists, minus 2) the songs we have already tried to extract lyrics for.

In [14]:
# Locate the frequency CSV (exactly one file is expected under freq/) and load it
freq_matches = glob(os.path.join(dataPath, 'freq/*frequency*'))
assert len(freq_matches) == 1
song_freq = pd.read_csv(freq_matches[0])

# Keep only the songs whose 'total' count exceeds 19, i.e. 20 or more playlists
is_frequent = song_freq['total'] > 19
song_freq_more = song_freq[is_frequent]


In [21]:
# Load the CSV of songs we have already attempted (exactly one file expected).
# The file has no header row; column 0 holds the track URI.
attempted_matches = glob(os.path.join(dataPath, 'lyrics/playlist*'))
assert len(attempted_matches) == 1
lyrics_already = pd.read_csv(attempted_matches[0], header=None)
lyrics_already_uri = lyrics_already[0].values


In [36]:
# Then, remove the songs we already have from our song_freq_more list.
# One vectorised membership test replaces a full-frame filter per attempted URI
# (the previous loop scaled with len(lyrics_already_uri) * len(song_freq_more)).
already_attempted = song_freq_more['track_uri'].isin(lyrics_already_uri)
song_freq_more = song_freq_more[~already_attempted]


#### Then, find the remaining songs we need...

In [79]:
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _truncate_title_at(title, marker):
    """Cut `title` just before the first whitespace-separated word containing `marker`.

    Returns the title unchanged when no word contains `marker`, or when the
    marker sits in the very first word (cutting there would leave an empty
    title to search for).
    """
    words = title.split()
    for pos, word in enumerate(words):
        if marker in word:
            return ' '.join(words[:pos]) if pos > 0 else title
    return title


def _clean_song_title(title):
    """Drop everything from the first dash / 'featuring' / 'duet' word onwards."""
    for marker in ('-', 'featuring', 'duet'):
        title = _truncate_title_at(title, marker)
    return title


def _clean_genius_lyrics(raw_lyrics):
    """Strip the text that precedes the lyrics and flatten newlines to spaces.

    The text is split on single spaces and the first token starting with
    'Lyrics' is treated as the start marker:
      * marker token contains a newline -> lyrics start right after that newline;
      * otherwise, if any token contains a non-breaking space ('\\xa0'), the
        lyrics are taken from that token on and 'More\\xa0\\n' is removed
        (annotation preamble);
      * otherwise the leading 'Lyrics' is simply removed from the marker token.
    If no token starts with 'Lyrics', the whole text is returned with newlines
    flattened instead of raising.
    """
    tokens = raw_lyrics.split(' ')

    marker_pos = next(
        (pos for pos, tok in enumerate(tokens) if tok.startswith('Lyrics')), None
    )
    if marker_pos is None:
        # NOTE(review): fallback for text without the expected 'Lyrics' marker;
        # confirm this is the desired handling.
        return raw_lyrics.replace('\n', ' ')

    marker_tok = tokens[marker_pos]

    # No song annotation: the lyrics follow the newline inside the marker token
    if '\n' in marker_tok:
        body = tokens[marker_pos:]
        # maxsplit=1 keeps *everything* after the first newline; taking only the
        # second piece would drop words when the token holds several newlines.
        body[0] = marker_tok.split('\n', 1)[1]
        return ' '.join(body).replace('\n', ' ')

    # Song annotation present: start at the first token with a non-breaking space
    annotation_pos = next(
        (pos for pos, tok in enumerate(tokens) if '\xa0' in tok), None
    )
    if annotation_pos is not None:
        body = ' '.join(tokens[annotation_pos:])
        return body.replace('More\xa0\n', '').replace('\n', ' ')

    # No newline next to "Lyrics": just remove the word itself (6 characters)
    body = tokens[marker_pos:]
    body[0] = marker_tok[6:]
    return ' '.join(body).replace('\n', ' ')


# ---------------------------------------------------------------------------
# Extraction loop
# ---------------------------------------------------------------------------

# Every processed track is appended as one row to this CSV (no header row)
output_file = os.path.join(lyrics_dir, 'playlist_subset_1_lyrics.csv')
csv_fields = ["track_uri", "track_name", "artist_name", "lyrics", "found_lyrics"]

max_attempts = 3
n_tracks_total = len(song_freq_more)

# Only the first remaining track is processed here; widen the slice for a full run
for track_no, row in enumerate(song_freq_more.iloc[:1].itertuples(index=False)):
    t = row.track_uri
    artist = row.artist_name

    # Clean song title if necessary (dash suffixes, "featuring", "duet")
    song_title = _clean_song_title(row.track_name)

    # Then, get the song lyrics. Reset the result first so that a failed search
    # can never reuse the previous track's lyrics (or hit an undefined name).
    found_lyrics = None
    for attempt in range(max_attempts):
        try:
            found_lyrics = genius.search_song(song_title, artist)
            break  # Success
        except ConnectionError as e:
            print(f"Connection error on attempt {attempt + 1}: {e}")
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)  # exponential backoff
        except Exception as e:
            # Best effort: report and record the track as not found
            print(f"Other error: {e}")
            break

    # If we were able to find the song lyrics, clean them; otherwise flag as missing
    if found_lyrics:
        song_lyric = _clean_genius_lyrics(found_lyrics.lyrics)
        found_flag = 1
    else:
        song_lyric = None
        found_flag = 0

    # Write this row to the output CSV (append, so progress survives interruptions)
    with open(output_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=csv_fields)
        writer.writerow({
            "track_uri": t,
            "track_name": song_title,
            "artist_name": artist,
            "lyrics": song_lyric,
            "found_lyrics": found_flag,
        })

    print(f"{track_no} / {n_tracks_total} saved to {os.path.basename(output_file)}")


Searching for "HUMBLE." by Kendrick Lamar...
Done.
Got to this new second place
0 / 8515 saved to playlist_subset_1_lyrics.csv


  song = Song(self, song_info, lyrics)


In [None]:
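############ OLD: previous per-playlist extraction loop, kept for reference (superseded by the frequency-based cell above)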
# # For each set of playlists...
# for p in playlists[:1]: 
#     p_data = pd.read_csv(os.path.join(dataPath, p), compression='gzip')

#     # Then, get the unique tracks that are in this dataset
#     tracks = np.unique(p_data['track_uri'].values)

#     # Then, for each unique track, get the lyrics...
#     for t in tracks[6200:]: 
#         curr_track = p_data[p_data['track_uri'] == t] # current track 
#         curr_track_idx = curr_track.index.values # get current index

#         # Grab song title and artist
#         song_title = curr_track['track_name'].values[0]
#         artist = curr_track['artist_name'].values[0]   
        
#         # Output file for this playlist
#         output_file = os.path.join(lyrics_dir, p.replace('.csv.gz', '_lyrics.csv'))

#         # Clean song title if necessary...     
#         # Clear out anything after a dash
#         dash_idx = [song_title.split().index(x) for x in song_title.split() if '-' in x]
#         if dash_idx: 
#             song_title = song_title.split()[:dash_idx[0]]
#             song_title = (' ').join(song_title)

#         # Clear out anything after featuring
#         feature_idx = [song_title.split().index(x) for x in song_title.split() if 'featuring' in x]
#         if feature_idx: 
#             song_title = song_title.split()[:feature_idx[0]]
#             song_title = (' ').join(song_title)

#         # Clear out anything after duet
#         duet_idx = [song_title.split().index(x) for x in song_title.split() if 'duet' in x]
#         if duet_idx: 
#             song_title = song_title.split()[:duet_idx[0]]
#             song_title = (' ').join(song_title)

#         # Then, get the song lyrics 
#         for attempt in range(3):
#             try:
#                 found_lyrics = genius.search_song(song_title, artist)
#                 break  # Success
#             except ConnectionError as e:
#                 print(f"Connection error on attempt {attempt + 1}: {e}")
#                 time.sleep(2 ** attempt)  # exponential backoff
#             except Exception as e:
#                 print(f"Other error: {e}")
#                 break
        
#         # If we were able to find the song lyrics 
#         if found_lyrics: 
#             # First, clean the string
#             lyrics_split = found_lyrics.lyrics.split(' ')

#             # Find the index that contains the word "lyrics"
#             lyrics_idx = [lyrics_split.index(i) for i in lyrics_split if i.startswith('Lyrics')]

#             # Check whether it is an annotation or lyrics 
#             check_lyric_str = lyrics_split[lyrics_idx[0]]

#             # Create a list that contains a cleaned string of song lyrics
#             song_lyric_final = []

#             # If there is no song annotation...
#             if '\n' in check_lyric_str: 
#                 song_lyric = lyrics_split[lyrics_idx[0]:] # get the actual lyrics 
#                 song_lyric[0] = song_lyric[0].split('\n')[1:][0] # get rid of unnecessary characters for first word 
#                 song_lyric = " ".join(song_lyric) # place lyrics in one string 
#                 song_lyric = song_lyric.replace('\n', ' ') # get rid of unnecessary characters for our entire lyrics string 
                
#             elif not '\n' in check_lyric_str: 
#                 song_lyric = lyrics_split[lyrics_idx[0]:] # get the actual lyrics 
#                 song_lyric[0] = check_lyric_str[6:]
#                 song_lyric = " ".join(song_lyric) # place lyrics in one string 
#                 song_lyric = song_lyric.replace('\n', ' ') # get rid of unnecessary characters for our entire lyrics string 
                
#             # If there is a song annotation...
#             else: 
#                 # Get the actual lyrics 
#                 lyrics_idx = [lyrics_split.index(i) for i in lyrics_split if '\xa0' in i]
#                 song_lyric = lyrics_split[lyrics_idx[0]:]

#                 # Place all the words together in one string 
#                 song_lyric = " ".join(song_lyric)

#                 # Then, get rid of certain strings 
#                 song_lyric = song_lyric.replace('More\xa0\n', '')
#                 song_lyric = song_lyric.replace('\n', ' ')

#             # Place the lyrics into our dataframe accordingly
#             for song_idx in curr_track_idx: 
#                 found_flag = 1

#         # If we couldn't find the lyrics....
#         else: 
#             found_flag = 0
#             song_lyric = None

#         print(list(tracks).index(t), '/', len(tracks))
        
#         # Write this row to the output CSV
#         with open(output_file, 'a', newline='', encoding='utf-8') as f:
#             writer = csv.DictWriter(f, fieldnames=["track_uri", "track_name", "artist_name", "lyrics", "found_lyrics"])
#             writer.writerow({
#                 "track_uri": t,
#                 "track_name": song_title,
#                 "artist_name": artist,
#                 "lyrics": song_lyric,
#                 "found_lyrics": found_flag, 
                
#             })

#         print(f"{list(tracks).index(t)} / {len(tracks)} saved to {os.path.basename(output_file)}")
#         # time.sleep(5)

#     # # Then, save this dataframe accordingly
#     # os.makedirs(os.path.join(dataPath, 'lyrics'), exist_ok=True)
#     # p_data.to_csv(os.path.join(dataPath, 'lyrics', p.replace('.csv.gz', '_lyrics.csv.gz')), index=False, compression='gzip')

    