# 🎧 NB02: Data Processing

In [1]:
# Importing the necessary libraries
from dotenv import load_dotenv
from functions import *
from bs4 import BeautifulSoup
from pprint import pprint
from auth import *
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from pandas import json_normalize
import pandas as pd
import json
import sqlite3
import matplotlib.pyplot as plt

In [2]:
# Defining the function as "access_token" for later use
access_token = get_token()

### Step 1: Cleaning and Manipulating Playlist Data 
- Reading the top hits playlist and "girly" pop playlist JSON files
- Normalizing the data and removing duplicate rows
- Creating a final combined dataframe of both playlist rows.

In [None]:
# Using the clean_playlist function to clean the data from the raw data files
top_hits = clean_playlist('../data/raw/combined_top_hits.json')
women_pop = clean_playlist('../data/raw/combined_women_pop.json')

### Step 2: Retreiving top 50 artists
I will be retrieving the top 50 artists whose songs feature the most number of times in the combined playlist dataframe. 


In [None]:
# Combining clean JSON playlists into a single dataframe, dropping duplicates, and saving it as CSV
playlists_clean = pd.concat([top_hits, women_pop]).drop_duplicates()

playlists_clean.to_csv('../data/processed/playlists_clean.csv', index=False)

playlists_clean = pd.read_csv('../data/processed/playlists_clean.csv')

In [None]:
# Retrieving the 50 artists with the most songs in the playlists
top_50_artists = (
    playlists_clean['Artists']
    .value_counts()
    .head(50)
    .index.tolist()
)

Looking at the dataframe, these are the artists that I have chosen to include in my analysis:

| Male Artists | Female Arists |
| :--: | :--: |
| Justin Bieber | Rihanna |
| Bruno Mars | Ariana Grande |
| Ed Sheeran | Taylor Swift |
| Flo Rida | Nicki Minaj |
| Pharrell Williams | Britney Spears |

I chose these artists based on:
1) Gender - I wanted 5 male and 5 female artists
2) Pop artist status - To avoid overcomplicating and interfering the data analysis process, I decided to only choose pop artists (including Flo Rida and Nicki Minaj, who are also rap artists but constantly contribute to/feature on pop tracks)
3) Relevancy - I chose artists that I think are relevant, popular, and make or have made great contributions to defining 2000s/modern pop music

### Step 3: Retrieving top tracks and their lyrics

In [None]:
# Defining artist names and IDs
artists_info = {
    "Justin Bieber": "1uNFoZAHBGtllmzznpCI3s",
    "Bruno Mars": "0du5cEVh5yTK9QJze8zA0C",
    "Ed Sheeran": "6eUKZXaKkcviH0Ku9w2n3V",
    "Flo Rida": "0jnsk9HBra6NMjO2oANoPY",
    "Pharrell Williams": "2RdwBSPQiwcmiDo9kixcl8",
    "Rihanna": "5pKCCKE2ajJHZ9KAiaK11H",
    "Ariana Grande": "66CXWjxzNUsdJxJ2JdwvnR",
    "Taylor Swift": "06HL4z0CvFAxyc27GXpf02",
    "Nicki Minaj": "0hCNtLu0JehylgoiP8L4Gh",
    "Britney Spears": "26dSoYclwsYLMAKD3tpOr4"
}

In [None]:
# Retrieving the top tracks for each artist using the get_top_tracks function and Spotify API endpoint
top_tracks_df = pd.concat(
    [pd.DataFrame(get_top_tracks(artist_id, access_token)['tracks']).assign(artist=artist_name)
     for artist_name, artist_id in artists_info.items()],
    ignore_index=True
)

In [None]:
# Only keeping the necessary columns Track Name and Artist
combined_tracks = top_tracks_df[['name', 'artist']].copy()

# Adding gender mapping, 0 for male and 1 for female
gender_mapping = {
    'Justin Bieber': 0,  
    'Bruno Mars': 0,
    'Ed Sheeran': 0,
    'Flo Rida': 0,
    'Pharrell Williams': 0,
    'Rihanna': 1,       
    'Ariana Grande': 1,
    'Taylor Swift': 1,
    'Nicki Minaj': 1,
    'Britney Spears': 1
}

combined_tracks.loc[:, 'gender'] = combined_tracks['artist'].map(gender_mapping)

combined_tracks.to_csv('../data/processed/top_tracks.csv', index=False)

# Saving the final top tracks as CSV
top_tracks = pd.read_csv('../data/processed/top_tracks.csv')

In [9]:
# Using function fetch_lyrics to get the lyrics for each song
top_tracks['lyrics'] = top_tracks.apply(fetch_lyrics, axis=1)

# Saving the raw lyrics data
top_tracks.to_csv('../data/raw/lyrics_df.csv', index=False)

Searching for "STAY (with Justin Bieber)" by Justin Bieber...
Done.
Searching for "Ghost" by Justin Bieber...
Done.
Searching for "Love Yourself" by Justin Bieber...
Searching for "Sorry" by Justin Bieber...
Done.
Searching for "Mistletoe" by Justin Bieber...
Done.
Searching for "Beauty And A Beat" by Justin Bieber...
Done.
Searching for "Baby" by Justin Bieber...
Done.
Searching for "bad guy" by Justin Bieber...
Done.
Searching for "I Don't Care (with Justin Bieber)" by Justin Bieber...
Done.
Searching for "What Do You Mean?" by Justin Bieber...
Done.
Searching for "Die With A Smile" by Bruno Mars...
Done.
Searching for "APT." by Bruno Mars...
Done.
Searching for "Locked out of Heaven" by Bruno Mars...
Done.
Searching for "Just the Way You Are" by Bruno Mars...
Done.
Searching for "That's What I Like" by Bruno Mars...
Done.
Searching for "When I Was Your Man" by Bruno Mars...
Done.
Searching for "Grenade" by Bruno Mars...
Done.
Searching for "It Will Rain" by Bruno Mars...
Done.
Searc

In [31]:
lyrics = pd.read_csv('../data/raw/lyrics_df.csv')

# Step 2: Use regex to remove text between "embed" and "lyrics", including the words "embed" and "lyrics"
lyrics['lyrics'] = lyrics['lyrics'].str.replace(r'embed.*?lyrics', '', regex=True)

In [32]:
lyrics['lyrics'] = lyrics['lyrics'].str.replace(
    r'ContributorsTranslations.*?Lyrics', 'Lyrics', regex=True
)

In [33]:
error_rows = lyrics[lyrics['lyrics'].str.contains('Error fetching lyrics|not found', na=False)]
error_rows

Unnamed: 0,name,artist,gender,lyrics
2,Love Yourself,Justin Bieber,0,Error fetching lyrics for 'Love Yourself' by J...
20,Shape of You,Ed Sheeran,0,Error fetching lyrics for 'Shape of You' by Ed...
40,"Happy - From ""Despicable Me 2""",Pharrell Williams,0,"Lyrics for 'Happy - From ""Despicable Me 2""' by..."
53,Umbrella,Rihanna,1,Error fetching lyrics for 'Umbrella' by Rihann...


In [34]:
# Apply the fetch_lyrics function to the error rows
new_lyrics = error_rows.apply(fetch_lyrics, axis=1)

# Update the 'lyrics' column in the original DataFrame
lyrics.loc[error_rows.index, 'lyrics'] = new_lyrics

# Check if any errors remain
error_lyrics = lyrics[lyrics['lyrics'].str.contains('Error fetching lyrics|not found', na=False)]

Searching for "Love Yourself" by Justin Bieber...
Done.
Searching for "Shape of You" by Ed Sheeran...
Done.
Searching for "Happy - From "Despicable Me 2"" by Pharrell Williams...
No results found for: 'Happy - From "Despicable Me 2" Pharrell Williams'
Searching for "Umbrella" by Rihanna...
Done.


In [37]:
lyrics.to_csv('../data/raw/lyrics_df.csv', index=False)

lyrics = pd.read_csv('../data/raw/lyrics_df.csv')

In [41]:
happy_lyrics = get_song_lyrics_with_variations("Happy", "Pharrell Williams")

# Check if the song exists in the DataFrame
row_index = lyrics[
    (lyrics['name'] == "Happy") & 
    (lyrics['artist'] == "Pharrell Williams")
].index

# If the song is found, update its lyrics
if not row_index.empty:
    lyrics.at[row_index[0], 'lyrics'] = lyrics
    print(f"Lyrics updated for: {"Happy"} by {"Pharrell Williams"}")
else:
    print(f"Song '{"Happy"}' by {"Pharrell Williams"} not found in the DataFrame.")

lyrics.to_csv('../data/raw/lyrics_df.csv', index=False)

lyrics = pd.read_csv('../data/raw/lyrics_df.csv')

Searching for: Happy
Searching for "Happy" by Pharrell Williams...
Done.
Found lyrics for: Happy
Song 'Happy' by Pharrell Williams not found in the DataFrame.


Here, I am manually looking at the rows where the lyrics were not available (not an error but an issue of availability). In the next chunk, I'll be removing these

In [44]:
# Read the CSV and refetch lyrics in one step
indices = [85, 86, 72, 68, 65, 53, 49, 45, 41, 33, 31, 26, 8, 0]

lyrics = refetch_lyrics_for_top_tracks(lyrics, indices)

Searching for "Swalla Nicki Minaj" by Nicki Minaj...
Done.
Searching for "Super Nicki Minaj" by Nicki Minaj...
Done.
Searching for "I Taylor Swift" by Taylor Swift...
Done.
Searching for "Die Ariana Grande" by Ariana Grande...
Done.
Searching for "Save Ariana Grande" by Ariana Grande...
Done.
Searching for "Umbrella Rihanna" by Rihanna...
Done.
Searching for "Cash Pharrell Williams" by Pharrell Williams...
Done.
Searching for "Get Pharrell Williams" by Pharrell Williams...
Done.
Searching for "Committed Pharrell Williams" by Pharrell Williams...
Done.
Searching for "My Flo Rida" by Flo Rida...
Done.
Searching for "Right Flo Rida" by Flo Rida...
Done.
Searching for "Merry Ed Sheeran" by Ed Sheeran...
Done.
Searching for "I Justin Bieber" by Justin Bieber...
Done.
Searching for "STAY Justin Bieber" by Justin Bieber...
Done.


In [45]:
lyrics_to_update = lyrics.loc[indices]
lyrics_to_update = lyrics_to_update[lyrics_to_update['lyrics'].notnull()]

# Update the lyrics in the original dataframe for those rows
lyrics.update(lyrics_to_update)

# Save the updated dataframe back to the raw lyrics CSV
lyrics.to_csv('../data/raw/lyrics_df.csv', index=False)

lyrics = pd.read_csv('../data/raw/lyrics_df.csv')

There are still songs with missing lyrics. So, I will once again manually look at those lyrics and refetch them using a function. As none of the incorrect lyrics share any common characteristics (they are all "top tracks" lists from when the song was released, I believe), I had to do this part manually.

In [None]:
# Creating a song artist list to refecth lyrics
song_artist_list = [("AGATS2 (Insecure)", "Nicki Minaj"),
                    ("Fortnight feat. Post Malone", "Taylor Swift"),
                    ("Umbrella", "Rihanna"),
                    ("Hit Different", "Pharrell Williams"),
                    ("Feels", "Pharrell Williams"),
                    ("My House", "Flo Rida"),
                    ("Wild Ones", "Flo Rida"),
                    ("Right Round", "Flo Rida"),
                    ("Low (feat. T-Pain)", "Flo Rida"),
                    ("I Don't Care", "Ed Sheeran")]

lyrics = refetch_lyrics_for_top_tracks_by_name(lyrics, song_artist_list)

Song 'AGATS2 (Insecure)' by Nicki Minaj not found in the dataframe.
Song 'Fortnight feat. Post Malone' by Taylor Swift not found in the dataframe.
Searching for "Umbrella Rihanna" by Rihanna...
Done.
Lyrics updated for: Umbrella by Rihanna
Searching for "Hit Pharrell Williams" by Pharrell Williams...
Done.
Lyrics updated for: Hit Different by Pharrell Williams
Song 'Feels' by Pharrell Williams not found in the dataframe.
Searching for "My Flo Rida" by Flo Rida...
Done.
Lyrics updated for: My House by Flo Rida
Song 'Wild Ones' by Flo Rida not found in the dataframe.
Searching for "Right Flo Rida" by Flo Rida...
Done.
Lyrics updated for: Right Round by Flo Rida
Searching for "Low Flo Rida" by Flo Rida...
Done.
Lyrics updated for: Low (feat. T-Pain) by Flo Rida
Song 'I Don't Care' by Ed Sheeran not found in the dataframe.


In [54]:
lyrics.to_csv('../data/raw/lyrics_df.csv', index=False)
lyrics = pd.read_csv("../data/raw/lyrics_df.csv")

In [59]:
# Preprocess lyrics and save directly to a CSV file
lyrics['Lyrics'] = lyrics['Lyrics'].apply(preprocess_lyrics)

In [60]:
# Renaming the columns in my lyrics dataframe
# Matching those in the playlists dataframe, as they are more readable/explanatory
lyrics.rename(columns={
    'name': 'Track Name',
    'artist': 'Artists',
    'gender': 'Gender',
    'lyrics': 'Lyrics'
}, inplace=True)

# Saving the updated dataframe back to the CSV
lyrics.to_csv('../data/processed/final_lyrics.csv', index=False)

In [61]:
# Loading the dataframes from the CSV files
clean_playlists_df = pd.read_csv('../data/processed/playlists_clean.csv')
final_lyrics_final_df = pd.read_csv('../data/processed/final_lyrics.csv')

# Cleaning both 'Track Name' and 'Artists' columns in both dataframes
clean_playlists_df['Track Name'] = clean_playlists_df['Track Name'].str.strip().str.lower()
clean_playlists_df['Artists'] = clean_playlists_df['Artists'].str.strip().str.lower()

final_lyrics_final_df['Track Name'] = final_lyrics_final_df['Track Name'].str.strip().str.lower()
final_lyrics_final_df['Artists'] = final_lyrics_final_df['Artists'].str.strip().str.lower()

# Connecting to and creating the SQLite database
conn = sqlite3.connect('../data/spotify.db')
cursor = conn.cursor()

# Enabling foreign key support, to later define Artists as the foreign key between the two tables
cursor.execute("PRAGMA foreign_keys = ON;")

# Dropping tables if they exist to ensure clean setup
cursor.execute("DROP TABLE IF EXISTS playlists;")
cursor.execute("DROP TABLE IF EXISTS top_tracks;")

# Creating the 'playlists' table with 'Artists' as a primary key
cursor.execute("""
CREATE TABLE IF NOT EXISTS playlists (
    "Track Name" TEXT NOT NULL,
    "Track ID" TEXT,
    "Artists" TEXT NOT NULL,
    "Artist Count" INTEGER,
    PRIMARY KEY ("Artists")
);
""")

# Inserting data from 'clean_playlists_df' into the 'playlists' table
cursor.executemany("""
INSERT OR REPLACE INTO playlists ("Track Name", "Track ID", "Artists", "Artist Count")
VALUES (?, ?, ?, ?)
""", clean_playlists_df[['Track Name', 'Track ID', 'Artists', 'Artist Count']].values)

# Creating the 'top_tracks' table with foreign key reference to 'Artists'
cursor.execute("""
CREATE TABLE IF NOT EXISTS top_tracks (
    "Track Name" TEXT NOT NULL,
    "Artists" TEXT NOT NULL,
    "Gender" TEXT,
    "Lyrics" TEXT,
    FOREIGN KEY ("Artists") REFERENCES playlists("Artists")
);
""")

# Fetch artists from the playlists table
cursor.execute('SELECT "Artists" FROM playlists')
existing_artists = set(row[0] for row in cursor.fetchall())  # Create a set of existing artists

# Filter the final_lyrics_final_df DataFrame to include only matching artists
filtered_df = final_lyrics_final_df[final_lyrics_final_df['Artists'].isin(existing_artists)]

# Insert matching rows into the 'top_tracks' table using executemany
cursor.executemany("""
INSERT OR REPLACE INTO top_tracks ("Track Name", "Artists", "Gender", "Lyrics")
VALUES (?, ?, ?, ?)
""", filtered_df[['Track Name', 'Artists', 'Gender', 'Lyrics']].values)

# Check how many rows didn't match
no_match_count = len(final_lyrics_final_df) - len(filtered_df)
print(f"\nTotal number of 'No match found' entries: {no_match_count}")

# Committing the changes to the database
conn.commit()

# Checking how many matches were missing
print(f"\nTotal number of 'No match found' entries: {no_match_count}")

# Verifying the data in the tables
cursor.execute("SELECT * FROM playlists LIMIT 5;")
print("Playlists Table Data:")
print(cursor.fetchall())

cursor.execute("SELECT * FROM top_tracks LIMIT 5;")
print("Top Tracks Table Data:")
print(cursor.fetchall())

# Closing the connection
conn.close()


Total number of 'No match found' entries: 0

Total number of 'No match found' entries: 0
Playlists Table Data:
[('glad you came', '5yDL13y5giogKs2fSNf7sj', 'the wanted', 1), ('closer', '7BKLCZ1jbUBVqRi2FVlTVw', 'halsey', 2), ('peaches feat daniel caesar  giveon', '4iJyoBOLtHqaGxP12qzhQI', 'daniel caesar', 3), ('peaches feat daniel caesar  giveon', '4iJyoBOLtHqaGxP12qzhQI', 'giveon', 3), ('a bar song tipsy', '2FQrifJ1N335Ljm3TjTVVf', 'shaboozey', 1)]
Top Tracks Table Data:
[('stay (with justin bieber)', 'justin bieber', '0', " contributorsi do the same thing i told you that i never would\ni told you i'd change even when i knew i never could\ni know that i cant find nobody else as good as you\ni need you to stay need you to stay hey oh\n\n\ni get drunk wake up i'm wasted still\ni realize the time that i wasted here\ni feel like you can't feel the way i feel\noh ill be fucked up if you can't be right here\n\n\noh oohwoah oh oohwoah oohwoah\noh oohwoah oh oohwoah oohwoah\noh oohwoah oh oo

Click [here](https://github.com/lse-ds105/w10-summative-deyavuz/tree/main?tab=readme-ov-file#table-of-contents) to navigate back to the Table of Contents!