In [2]:
# install/import libraries and packages
# !pip install lyricsgenius
import lyricsgenius as lg
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import os
import glob
import pymongo
import pprint
from pymongo import MongoClient

In [19]:
# set up client
# Credentials come from the environment so they are never saved with the notebook.
# NOTE(review): the password that used to be hardcoded in this cell has been
# exposed and should be rotated.
MONGO_USER = os.environ["MONGO_USER"]
MONGO_PASSWORD = os.environ["MONGO_PASSWORD"]
MONGO_HOST = os.environ.get("MONGO_HOST", "mongodb.fsb.miamioh.edu:27017")

# passing username/password as keyword arguments avoids having to URL-escape
# special characters inside the connection string
client = MongoClient(
    f"mongodb://{MONGO_HOST}",
    username=MONGO_USER,
    password=MONGO_PASSWORD,
    authSource="admin",
)

# the working database is named after the username
db = client[MONGO_USER]

# specify collection
LYRICS = db.LYRICS

In [4]:
# Genius API token
# Read from the environment so the secret is never saved with the notebook.
# NOTE(review): the token that used to be hardcoded in this cell has been
# exposed and should be regenerated.
GENIUS_TOKEN = os.environ["GENIUS_ACCESS_TOKEN"]

# initialize Genius: skip non-song results, exclude remix/live versions and
# strip section headers from the returned lyrics
genius = lg.Genius(
    GENIUS_TOKEN,
    skip_non_songs = True,
    excluded_terms = ["(Remix)", "(Live)"],
    remove_section_headers = True
)
# silence the library's own per-request console output
genius.verbose = False

In [5]:
# clean artist name for data joining with lyrics
def clean_for_genius(text):
    """Reduce an artist credit to the lead artist's name.

    Everything from the standalone word "Featuring" or "With" (any case) to the
    end of the string is removed, then every character other than ASCII
    letters, digits, whitespace, hyphens, parentheses and apostrophes is
    stripped.

    Parameters
    ----------
    text : str
        Raw artist credit, e.g. ``"Drake Featuring Rihanna"``.

    Returns
    -------
    str
        The cleaned name with surrounding whitespace removed.
    """
    # \b restricts the match to whole words so names that merely contain the
    # letters (e.g. "Bill Withers") are not truncated
    text = re.sub(r'\bFeaturing\b.*', '', text, flags = re.I)
    text = re.sub(r'\bWith\b.*', '', text, flags = re.I)
    # NOTE(review): this also drops accented/non-ASCII letters — confirm that
    # is acceptable for artist matching
    text = re.sub(r"[^A-Za-z0-9\s\-\(\)']", '', text)
    return text.strip()

In [6]:
# get song lyrics by matching title and artist and save to csv
def add_lyrics_to_csv(input_path, output_path = None, sleep_time = 0.5):
    """Look up lyrics for every song in a CSV and write the result back to disk.

    The CSV is expected to have ``title`` and ``artist`` columns. An
    ``artist_clean`` column (built with ``clean_for_genius``) and a ``lyrics``
    column are added. A song that is not found, or whose lookup raises, gets a
    missing value in ``lyrics``.

    Parameters
    ----------
    input_path : str
        CSV file to read.
    output_path : str, optional
        Where to write the result; defaults to overwriting ``input_path``.
    sleep_time : float
        Seconds to pause after each lookup.

    Returns
    -------
    int
        Number of songs left without lyrics, or 0 when the file was skipped
        because it already contains lyrics.
    """
    # read in song titles and artists
    df = pd.read_csv(input_path)

    # ignore csvs that already have lyrics (computational efficiency)
    if "lyrics" in df.columns and df["lyrics"].notna().any():
        print(f"Skipping {input_path} — lyrics already exist.")
        return 0
    print(f"\n=== Processing {input_path} ===")

    # clean artist
    df["artist_clean"] = df["artist"].apply(clean_for_genius)

    # get song lyrics for every song title and artist in the dataframe;
    # enumerate gives a true 1-based position regardless of the index labels
    lyrics_list = []
    total = len(df)
    songs = zip(df["title"], df["artist_clean"])
    for position, (title, artist) in enumerate(songs, start=1):
        print(f"[{position}/{total}] Fetching lyrics for '{title}' by '{artist}'")

        try:
            song = genius.search_song(title, artist)
            lyrics_list.append(song.lyrics if song else None)
        except Exception as err:
            # best effort: record the song as missing and keep going, but say
            # why; Ctrl-C (KeyboardInterrupt) is deliberately not caught so a
            # long run can still be stopped
            print(f"    lookup failed ({type(err).__name__}: {err})")
            lyrics_list.append(None)

        # delay requests
        time.sleep(sleep_time)

    # create lyrics column in dataframe
    df["lyrics"] = lyrics_list

    # overwrite csv with csv with lyrics
    if output_path is None:
        output_path = input_path

    df.to_csv(output_path, index=False)
    print(f"Saved → {output_path}")

    # return number of missing lyrics as a plain int, matching the skip path
    missing = int(df["lyrics"].isna().sum())
    print(f"Missing lyrics: {missing}/{total}")
    return missing

In [None]:
# glob pattern matching every CSV in the data folder
folder = "billboard_data/*.csv"

# matching CSV paths, in sorted order
years = sorted(glob.glob(folder))

# fetch lyrics file by file and add up the per-file counts of missing lyrics
total_missing = sum(add_lyrics_to_csv(year, sleep_time=0.5) for year in years)

print(f"\nTotal missing lyrics across ALL years: {total_missing}")

In [7]:
# get a unique list of all songs from the saved csvs
def combine_all_csvs(save_dir = "billboard_data"):
    """Concatenate the ``billboard_hot100_*.csv`` files into one DataFrame.

    Rows that repeat the same ``title``, ``artist`` and ``chart_year`` are
    dropped, keeping the first occurrence.

    Parameters
    ----------
    save_dir : str
        Folder that holds the CSV files.

    Returns
    -------
    pandas.DataFrame or None
        The combined frame, or ``None`` (after printing a message) when no
        matching file exists.
    """
    # glob returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the row order (and so which duplicate is kept) reproducible
    all_files = sorted(glob.glob(os.path.join(save_dir, "billboard_hot100_*.csv")))
    if not all_files:
        print("No CSV files found.")
        return None
    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    return df.drop_duplicates(subset=["title", "artist", "chart_year"])

In [22]:
# get dataframe of all unique songs
# one folder variable drives both the read and the write so they cannot diverge
save_dir = "billboard_data"
songs_df = combine_all_csvs(save_dir)

# combine_all_csvs returns None when it finds no files; stop here with a clear
# message instead of failing later with an AttributeError on None
if songs_df is None:
    raise FileNotFoundError(f"No billboard_hot100_*.csv files found in '{save_dir}'")

csv_path = os.path.join(save_dir, "all_songs2.csv")
songs_df.to_csv(csv_path, index = False)

In [23]:
# (rows, columns) of the combined frame
songs_df.shape

(18959, 5)

In [24]:
# count rows that have a missing value in ANY column
# NOTE(review): this is treated as "rows with missing lyrics" — confirm that
# lyrics is the only column expected to contain gaps
songs_df.isna().any(axis=1).sum()

np.int64(1562)

In [25]:
# keep only complete rows: a row is removed when any of its columns is missing
# (1562 out of 18959)
songs_df = songs_df.dropna(axis=0, how="any")

In [26]:
# (rows, columns) after removing incomplete rows
songs_df.shape

(17397, 5)

In [27]:
# preview the first rows
songs_df.head()

Unnamed: 0,title,artist,chart_year,artist_clean,lyrics
1,Nobody's Supposed To Be Here,Deborah Cox,1999,Deborah Cox,How did you get here?\nNobody's supposed to be...
2,Lately,Divine,1999,Divine,Lately (I've been watchin' you)\nBeen thinkin'...
3,...Baby One More Time,Britney Spears,1999,Britney Spears,"Oh, baby, baby\nOh, baby, baby\n\nOh, baby, ba..."
4,Because Of You,98 Degrees,1999,98 Degrees,You're my sunshine after the rain\nYou're the ...
5,From This Moment On,Shania Twain,1999,Shania Twain,I do swear that I'll always be there\nI'd give...


In [28]:
# overwrite the raw artist credit with its cleaned form (column position is
# kept), then discard the helper column
songs_df = songs_df.assign(artist=songs_df["artist_clean"]).drop(columns="artist_clean")

In [29]:
# preview the frame now that artist holds the cleaned name
songs_df.head()

Unnamed: 0,title,artist,chart_year,lyrics
1,Nobody's Supposed To Be Here,Deborah Cox,1999,How did you get here?\nNobody's supposed to be...
2,Lately,Divine,1999,Lately (I've been watchin' you)\nBeen thinkin'...
3,...Baby One More Time,Britney Spears,1999,"Oh, baby, baby\nOh, baby, baby\n\nOh, baby, ba..."
4,Because Of You,98 Degrees,1999,You're my sunshine after the rain\nYou're the ...
5,From This Moment On,Shania Twain,1999,I do swear that I'll always be there\nI'd give...


In [20]:
# rows sharing identical lyrics are collapsed to their first occurrence
songs_df = songs_df.drop_duplicates(subset="lyrics", keep="first")

In [22]:
# write the de-duplicated songs (cleaned artist name + lyrics) to disk without
# the index column
songs_df.to_csv(path_or_buf="all_songs_clean_lyrics.csv", index=False)

In [23]:
# read in all songs with lyrics csv data
csv_file = "all_songs_clean_lyrics.csv"
df = pd.read_csv(csv_file)

# convert to list of dictionaries; any missing value is stored as None (BSON
# null) rather than float NaN so a field does not change type between documents
data_dict = [
    {key: (None if pd.isna(value) else value) for key, value in record.items()}
    for record in df.to_dict(orient='records')
]

# insert list of dictionaries into MongoDB
# insert_many always appends, so re-running this cell would duplicate every
# song; only load when the collection is still empty (drop the collection
# first to force a reload)
if not data_dict:
    print("No data to insert")
else:
    existing = LYRICS.count_documents({})
    if existing:
        print(f"LYRICS collection already holds {existing} records; skipping insert")
    else:
        result = LYRICS.insert_many(data_dict)
        print(f"Inserted {len(result.inserted_ids)} records into LYRICS collection")

Inserted 13873 records into LYRICS collection


In [3]:
# re-read the saved CSV and show its (rows, columns)
test = pd.read_csv("all_songs_clean_lyrics.csv")
test.shape

(13873, 4)