# NB02: Data Processing

In [79]:
# Importing the necessary libraries
from dotenv import load_dotenv
from functions import *
from bs4 import BeautifulSoup
from pprint import pprint
from auth import *
import pandas as pd
import json
import sqlite3
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
#! pip install nltk
from nltk.corpus import stopwords
import re

In [3]:
access_token = get_token()

## Step 1
- Reading the top hits playlist and "girly" pop playlist JSON's
- 

In [4]:
with open('../data/raw/combined_top_hits.json', 'r') as f:
    data = json.load(f)

In [5]:
tracks = []
for item in data['items']:
    track = item.get('track', {})
    track_name = track.get('name')
    track_id = track.get('id')
    artists = ", ".join(artist['name'] for artist in track.get('artists', []))  # Combine all artist names
    tracks.append({'Track Name': track_name, 'Track ID': track_id, 'Artists': artists})


In [6]:
top_hits = pd.DataFrame(tracks)
top_hits

Unnamed: 0,Track Name,Track ID,Artists
0,Into You,76FZM38RC8XaAjJ77CVTNe,Ariana Grande
1,Glad You Came,5yDL13y5giogKs2fSNf7sj,The Wanted
2,Dark Horse,5jrdCoLpJSvHHorevXBATy,"Katy Perry, Juicy J"
3,Who Knew - Edit,2hns6Dv29Yrg68AVTJiAyA,P!nk
4,Closer,7BKLCZ1jbUBVqRi2FVlTVw,"The Chainsmokers, Halsey"
...,...,...,...
268,Training Season,6Qb7YsAqH4wWFUMbGsCpap,Dua Lipa
269,What Makes You Beautiful,4cluDES4hQEUhmXj6TXkSo,One Direction
270,I'm Not The Only One,7795WJLVKJoAyVoOtCWqXN,Sam Smith
271,Stockholm,198asGCZWwoQVdLxYSlPTx,Jubël


In [7]:
# Step 1: Normalize the artist names
top_hits['Artists'] = top_hits['Artists'].str.lower().str.strip()

# Step 2: Handle featured artists and combine them into the same 'Artists' column
top_hits['Artists'] = top_hits['Artists'].apply(combine_artists)

# Step 3: Clean up the 'feat' section, making sure it looks clean
top_hits['Artists'] = top_hits['Artists'].str.replace('feat.', 'feat', case=False)

# Step 4: Remove any extra spaces around the artists' names and count the number of artists
top_hits['Artists'] = top_hits['Artists'].str.strip()
top_hits['Artist Count'] = top_hits['Artists'].apply(lambda x: len(set(x.split(','))))

# Display to verify changes
print(top_hits[['Track Name', 'Artists', 'Artist Count']].head())

        Track Name                   Artists  Artist Count
0         Into You             ariana grande             1
1    Glad You Came                the wanted             1
2       Dark Horse       katy perry, juicy j             2
3  Who Knew - Edit                      p!nk             1
4           Closer  the chainsmokers, halsey             2


In [8]:
duplicates = top_hits.duplicated()
print(duplicates.any()) 

True


In [9]:
duplicate_rows = top_hits[top_hits.duplicated()]
print(duplicate_rows)

            Track Name                Track ID       Artists  Artist Count
184  Sign of the Times  5Ohxk2dO5COHF1krpoPigN  harry styles             1


In [10]:
top_hits_clean = top_hits.drop_duplicates(keep='first')

top_hits_clean = top_hits_clean.copy()

top_hits_clean.loc[:, 'Track Name'] = top_hits_clean['Track Name'].str.lower()
top_hits_clean.loc[:, 'Artists'] = top_hits_clean['Artists'].str.lower()
top_hits_clean.loc[:, 'Track Name'] = top_hits_clean['Track Name'].str.strip()
top_hits_clean.loc[:, 'Artists'] = top_hits_clean['Artists'].str.strip()

top_hits_clean = top_hits_clean.assign(Artists=top_hits_clean['Artists'].str.split(', ')).explode('Artists')
top_hits_clean['Artists'] = top_hits_clean['Artists'].str.replace(', ', ' & ')

top_hits_clean['Track Name'] = top_hits_clean['Track Name'].str.replace(r'[^\w\s]', '', regex=True)

top_hits_clean['Artists'] = top_hits_clean['Artists'].astype('category')

top_hits_clean.to_csv('../data/top_hits.csv', index=False)

top_hits_data = pd.read_csv('../data/raw/top_hits.csv')
top_hits_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347 entries, 0 to 346
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Track Name    347 non-null    object
 1   Track ID      347 non-null    object
 2   Artists       347 non-null    object
 3   Artist Count  347 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 11.0+ KB


#### Here, I am moving onto the "girly" pop music playlist data

In [11]:
with open('../data/raw/combined_women_pop.json', 'r') as f:
    data = json.load(f)

In [12]:
tracks = []
for item in data['items']:
    track = item.get('track', {})
    track_name = track.get('name')
    track_id = track.get('id')
    artists = ", ".join(artist['name'] for artist in track.get('artists', []))  # Combine all artist names
    tracks.append({'Track Name': track_name, 'Track ID': track_id, 'Artists': artists})


In [13]:
women_pop = pd.DataFrame(tracks)
women_pop

Unnamed: 0,Track Name,Track ID,Artists
0,It's ok I'm ok,24XihnoVPWXlKJ4BgXqjVM,Tate McRae
1,Espresso,2qSkIjg1o9h3YT9RAgYN75,Sabrina Carpenter
2,That’s So True,7ne4VBA60CxGM75vw0EYad,Gracie Abrams
3,Teenage Dream,55qBw1900pZKfXJ6Q9A2Lc,Katy Perry
4,bye,1Rweq7vNjK4kZBbGrmxQsl,Ariana Grande
...,...,...,...
275,Pour It Up,5r67bGhYXZNqk2m2Wupfmu,Rihanna
276,Right Now,42Ow7PS3YtCWplolUUigDo,"Rihanna, David Guetta"
277,Pink Pony Club,1k2pQc5i348DCHwbn5KTdc,Chappell Roan
278,Si Antes Te Hubiera Conocido,6WatFBLVB0x077xWeoVc2k,KAROL G


In [14]:
# Step 1: Normalize the artist names
women_pop['Artists'] = women_pop['Artists'].str.lower().str.strip()

# Step 2: Handle featured artists and combine them into the same 'Artists' column
def combine_artists(artist_column):
    # Check if 'feat' exists, and if so, split and merge artists
    if 'feat' in artist_column:
        # Split the main artist and featured artist(s) and remove any extra spaces
        artists = artist_column.split('feat')
        main_artist = artists[0].strip()
        featured_artists = artists[1].strip()
        # Combine main artist with featured artists, avoiding duplicates
        combined_artists = main_artist + ' feat ' + ', '.join(sorted(set(featured_artists.split(','))))
    else:
        # If no featured artists, return the original
        combined_artists = artist_column
    return combined_artists

women_pop['Artists'] = women_pop['Artists'].apply(combine_artists)

# Step 3: Clean up the 'feat' section, making sure it looks clean
women_pop['Artists'] = women_pop['Artists'].str.replace('feat.', 'feat', case=False)

# Step 4: Remove any extra spaces around the artists' names and count the number of artists
women_pop['Artists'] = women_pop['Artists'].str.strip()
women_pop['Artist Count'] = women_pop['Artists'].apply(lambda x: len(set(x.split(','))))

# Display to verify changes
print(women_pop[['Track Name', 'Artists', 'Artist Count']].tail())

                                Track Name                Artists  \
275                             Pour It Up                rihanna   
276                              Right Now  rihanna, david guetta   
277                         Pink Pony Club          chappell roan   
278           Si Antes Te Hubiera Conocido                karol g   
279  You'll Always Find Your Way Back Home         hannah montana   

     Artist Count  
275             1  
276             2  
277             1  
278             1  
279             1  


In [15]:
duplicates = women_pop.duplicated()
print(duplicates.any()) 

True


In [16]:
duplicate_rows = women_pop[women_pop.duplicated()]
print(duplicate_rows)

      Track Name                Track ID    Artists  Artist Count
201  Bad Romance  0SiywuOBRcynK0uKGWdCnn  lady gaga             1


In [None]:
women_pop_clean = women_pop.drop_duplicates(keep='first')

women_pop_clean = women_pop_clean.copy()

women_pop_clean.loc[:, 'Track Name'] = women_pop_clean['Track Name'].str.lower()
women_pop_clean.loc[:, 'Artists'] = women_pop_clean['Artists'].str.lower()
women_pop_clean.loc[:, 'Track Name'] = women_pop_clean['Track Name'].str.strip()
women_pop_clean.loc[:, 'Artists'] = women_pop_clean['Artists'].str.strip()

women_pop_clean = women_pop_clean.assign(Artists=women_pop_clean['Artists'].str.split(', ')).explode('Artists')
women_pop_clean['Artists'] = women_pop_clean['Artists'].str.replace(', ', ' & ')

women_pop_clean['Track Name'] = women_pop_clean['Track Name'].str.replace(r'[^\w\s]', '', regex=True)

women_pop_clean['Artists'] = women_pop_clean['Artists'].astype('category')

women_pop_clean.to_csv('../data/raw/women_pop.csv', index=False)

women_pop_data = pd.read_csv('../data/raw/women_pop.csv')
women_pop_data.info()

In [None]:
playlist_df = pd.concat([top_hits_data, women_pop_data], ignore_index=True)

playlist_df.to_csv('../data/processed/playlists.csv', index=False)

print(playlist_df)

                               Track Name                Track ID  \
0                                into you  76FZM38RC8XaAjJ77CVTNe   
1                           glad you came  5yDL13y5giogKs2fSNf7sj   
2                              dark horse  5jrdCoLpJSvHHorevXBATy   
3                              dark horse  5jrdCoLpJSvHHorevXBATy   
4                          who knew  edit  2hns6Dv29Yrg68AVTJiAyA   
..                                    ...                     ...   
679                             right now  42Ow7PS3YtCWplolUUigDo   
680                             right now  42Ow7PS3YtCWplolUUigDo   
681                        pink pony club  1k2pQc5i348DCHwbn5KTdc   
682          si antes te hubiera conocido  6WatFBLVB0x077xWeoVc2k   
683  youll always find your way back home  12wSL3tGk3MtbDEhfG7xy3   

            Artists  Artist Count  
0     ariana grande             1  
1        the wanted             1  
2        katy perry             2  
3           juicy j        

In [None]:
playlist_df = pd.read_csv('../data/processed/playlists.csv')

duplicates = playlist_df[playlist_df.duplicated()]

print(duplicates)

                                       Track Name                Track ID  \
352                                good luck babe  0WbMK4wrZ1wFSty9F7FCgu   
353                                 call me maybe  20I6sIOMTCkB6w7ryavxtO   
356                                         apple  19RybK6XDbAVpcdxSbZL1o   
370                                        greedy  3rUGC1vUpkDG9CZFHMur1t   
371                               i kissed a girl  14iN3o8ptQ8cFVZTEmyQRV   
372                              party in the usa  3E7dfMvvCLUddWissuqMwr   
374                                       yes and  7gaA3wERFkFkgivjwbSvkG   
389                            oopsi did it again  6naxalmIoLFWR0siv8dnQQ   
393                                     telephone  6nCDnzErqalOaIY3EJM8NK   
394                                     telephone  6nCDnzErqalOaIY3EJM8NK   
399                                  we cant stop  2y4lAQpi5VTNLu2ldeTdUH   
417        dance the night  from barbie the album  1vYXt7VSjH9JIM5oRRo7vA   

In [67]:
# Remove duplicates (keep the first occurrence)
playlists_clean = playlist_df.drop_duplicates()

# Or, if you want to remove duplicates based on specific columns (e.g., 'Track Name' and 'Artists')
playlists_clean = playlist_df.drop_duplicates(subset=['Track Name', 'Artists'])

# Display the cleaned DataFrame
print(playlists_clean)

playlists_clean.to_csv('../data/processed/clean_playlists.csv', index=False)

                               Track Name                Track ID  \
0                                into you  76FZM38RC8XaAjJ77CVTNe   
1                           glad you came  5yDL13y5giogKs2fSNf7sj   
2                              dark horse  5jrdCoLpJSvHHorevXBATy   
3                              dark horse  5jrdCoLpJSvHHorevXBATy   
4                          who knew  edit  2hns6Dv29Yrg68AVTJiAyA   
..                                    ...                     ...   
679                             right now  42Ow7PS3YtCWplolUUigDo   
680                             right now  42Ow7PS3YtCWplolUUigDo   
681                        pink pony club  1k2pQc5i348DCHwbn5KTdc   
682          si antes te hubiera conocido  6WatFBLVB0x077xWeoVc2k   
683  youll always find your way back home  12wSL3tGk3MtbDEhfG7xy3   

            Artists  Artist Count  
0     ariana grande             1  
1        the wanted             1  
2        katy perry             2  
3           juicy j        

## Step 2


In [72]:
clean_df = '../data/processed/clean_playlists.csv'
clean_df = pd.read_csv(clean_df)

# Display the first few rows to understand the structure of the data
print(clean_df.head())

# Select the 'artists' column (similar to SQL SELECT artists FROM playlists)
artists_df = clean_df[['Artists']]

# Display the first few rows of the artists column to check the data
print(artists_df.head())

       Track Name                Track ID        Artists  Artist Count
0        into you  76FZM38RC8XaAjJ77CVTNe  ariana grande             1
1   glad you came  5yDL13y5giogKs2fSNf7sj     the wanted             1
2      dark horse  5jrdCoLpJSvHHorevXBATy     katy perry             2
3      dark horse  5jrdCoLpJSvHHorevXBATy        juicy j             2
4  who knew  edit  2hns6Dv29Yrg68AVTJiAyA           p!nk             1
         Artists
0  ariana grande
1     the wanted
2     katy perry
3        juicy j
4           p!nk


In [73]:
# Get the frequency of each artist
artist_counts = artists_df['Artists'].value_counts()
top_50_artists = artist_counts.head(50)

# Display the frequency of artists
print(top_50_artists)

Artists
rihanna                     25
ariana grande               19
taylor swift                18
nicki minaj                 17
britney spears              14
lady gaga                   14
katy perry                  13
beyoncé                     12
demi lovato                  9
dua lipa                     9
little mix                   8
selena gomez                 8
justin bieber                7
paramore                     7
bruno mars                   6
charli xcx                   6
doja cat                     6
ava max                      6
avril lavigne                6
selena gomez & the scene     6
sabrina carpenter            6
destiny's child              5
jacob tillberg               5
kesha                        5
miley cyrus                  5
christina aguilera           4
sia                          4
rita ora                     4
olivia rodrigo               4
ku$h drifter                 4
justina valentine            4
one direction                4


In [74]:
top_50_artists.to_csv('../data/processed/top_50_artists.csv', header=True)
pd.read_csv('../data/processed/top_50_artists.csv')

Unnamed: 0,Artists,count
0,rihanna,25
1,ariana grande,19
2,taylor swift,18
3,nicki minaj,17
4,britney spears,14
5,lady gaga,14
6,katy perry,13
7,beyoncé,12
8,demi lovato,9
9,dua lipa,9


Looking at the dataframe, these are the artists that will be included in my analysis:

| Male Artists | Female Arists |
| :--: | :--: |
| Justin Bieber | Rihanna |
| Bruno Mars | Ariana Grande |
| Ed Sheeran | Taylor Swift |
| Flo Rida | Nicki Minaj |
| Pharrell Williams | Britney Spears |

### Getting tracks

In [75]:
justin = get_top_tracks("1uNFoZAHBGtllmzznpCI3s", access_token)
bruno = get_top_tracks("0du5cEVh5yTK9QJze8zA0C", access_token)
ed = get_top_tracks("6eUKZXaKkcviH0Ku9w2n3V", access_token)
flo = get_top_tracks("0jnsk9HBra6NMjO2oANoPY", access_token)
pharrell = get_top_tracks("2RdwBSPQiwcmiDo9kixcl8", access_token)

In [76]:
rihanna = get_top_tracks("5pKCCKE2ajJHZ9KAiaK11H", access_token)
ariana = get_top_tracks("66CXWjxzNUsdJxJ2JdwvnR", access_token)
taylor = get_top_tracks("06HL4z0CvFAxyc27GXpf02", access_token)
nicki = get_top_tracks("0hCNtLu0JehylgoiP8L4Gh", access_token)
britney = get_top_tracks("26dSoYclwsYLMAKD3tpOr4", access_token)

In [34]:
artists = [justin, bruno, ed, flo, pharrell, rihanna, ariana, taylor, nicki, britney]
artist_names = ['justin', 'bruno', 'ed', 'flo', 'pharrell', 'rihanna', 'ariana', 'taylor', 'nicki', 'britney']

# Create a dictionary of DataFrames for each artist
artist_dfs = {name: pd.DataFrame(artist['tracks']) for name, artist in zip(artist_names, artists)}

In [35]:
artist_names = ['Justin Bieber', 'Bruno Mars', 'Ed Sheeran', 'Flo Rida', 'Pharrell Williams', 
                'Rihanna', 'Ariana Grande', 'Taylor Swift', 'Nicki Minaj', 'Britney Spears']

# Assign artist name to each DataFrame
for name, artist_df in zip(artist_names, artist_dfs.values()):
    artist_df['artist'] = name

In [36]:
combined_tracks = pd.concat(artist_dfs.values(), ignore_index=True)
# Print the combined DataFrame
print(combined_tracks)

                                                album  \
0   {'album_type': 'album', 'artists': [{'external...   
1   {'album_type': 'album', 'artists': [{'external...   
2   {'album_type': 'album', 'artists': [{'external...   
3   {'album_type': 'album', 'artists': [{'external...   
4   {'album_type': 'album', 'artists': [{'external...   
..                                                ...   
95  {'album_type': 'album', 'artists': [{'external...   
96  {'album_type': 'single', 'artists': [{'externa...   
97  {'album_type': 'album', 'artists': [{'external...   
98  {'album_type': 'album', 'artists': [{'external...   
99  {'album_type': 'album', 'artists': [{'external...   

                                              artists  disc_number  \
0   [{'external_urls': {'spotify': 'https://open.s...            1   
1   [{'external_urls': {'spotify': 'https://open.s...            1   
2   [{'external_urls': {'spotify': 'https://open.s...            1   
3   [{'external_urls': {'spotify': 

In [37]:
combined_tracks = combined_tracks[['name', 'artist']]

In [38]:
gender_mapping = {
    'Justin Bieber': 0,  # Male
    'Bruno Mars': 0,     # Male
    'Ed Sheeran': 0,     # Male
    'Flo Rida': 0,       # Male
    'Pharrell Williams': 0,  # Male
    'Rihanna': 1,        # Female
    'Ariana Grande': 1,  # Female
    'Taylor Swift': 1,   # Female
    'Nicki Minaj': 1,    # Female
    'Britney Spears': 1  # Female
}

combined_tracks['gender'] = combined_tracks['artist'].map(gender_mapping)

combined_tracks.to_csv('../data/processed/combined_top_tracks.csv', index=False)

combined_tracks = pd.DataFrame(combined_tracks)
combined_tracks = pd.read_csv('../data/processed/combined_top_tracks.csv')

In [39]:
# Step 3: Apply the function to the DataFrame to fetch lyrics for each song
combined_tracks['lyrics'] = combined_tracks.apply(fetch_lyrics, axis=1)

# Step 4: Save the updated DataFrame with lyrics back to a new CSV
combined_tracks.to_csv('../data/raw/lyrics.csv', index=False)

Searching for "STAY (with Justin Bieber)" by Justin Bieber...
Done.
Searching for "Ghost" by Justin Bieber...
Done.
Searching for "Love Yourself" by Justin Bieber...
Done.
Searching for "Sorry" by Justin Bieber...
Done.
Searching for "Beauty And A Beat" by Justin Bieber...
Done.
Searching for "Baby" by Justin Bieber...
Done.
Searching for "Mistletoe" by Justin Bieber...
Done.
Searching for "bad guy" by Justin Bieber...
Done.
Searching for "I Don't Care (with Justin Bieber)" by Justin Bieber...
Done.
Searching for "What Do You Mean?" by Justin Bieber...
Done.
Searching for "Die With A Smile" by Bruno Mars...
Done.
Searching for "APT." by Bruno Mars...
Done.
Searching for "Locked out of Heaven" by Bruno Mars...
Done.
Searching for "Just the Way You Are" by Bruno Mars...
Done.
Searching for "That's What I Like" by Bruno Mars...
Done.
Searching for "When I Was Your Man" by Bruno Mars...
Done.
Searching for "Grenade" by Bruno Mars...
Done.
Searching for "It Will Rain" by Bruno Mars...
Done.

In [40]:
lyrics = pd.read_csv('../data/raw/lyrics.csv')

# Step 2: Use regex to remove text between "embed" and "lyrics", including the words "embed" and "lyrics"
lyrics['lyrics'] = lyrics['lyrics'].str.replace(r'embed.*?lyrics', '', regex=True)

# Step 3: Save the updated DataFrame to a new CSV
lyrics.to_csv('../data/processed/cleaned_lyrics.csv', index=False)

In [41]:
cleaned_lyrics = pd.read_csv('../data/processed/cleaned_lyrics.csv')
cleaned_lyrics = pd.DataFrame(cleaned_lyrics)

cleaned_lyrics

Unnamed: 0,name,artist,gender,lyrics
0,STAY (with Justin Bieber),Justin Bieber,0,1 ContributorToday’s Top Hits 11/5/21 (feat. P...
1,Ghost,Justin Bieber,0,116 ContributorsTranslationsDeutschTürkçeEspañ...
2,Love Yourself,Justin Bieber,0,223 ContributorsTranslationsEspañolPortuguêsDe...
3,Sorry,Justin Bieber,0,202 ContributorsTranslationsEspañolPortuguêsFr...
4,Beauty And A Beat,Justin Bieber,0,134 ContributorsTranslationsУкраїнськаBeauty a...
...,...,...,...,...
95,Womanizer,Britney Spears,1,67 ContributorsTranslations한국어日本語Womanizer Lyr...
96,My Only Wish (This Year),Britney Spears,1,23 ContributorsMy Only Wish (This Year) Lyrics...
97,Hold Me Closer,Britney Spears,1,35 ContributorsTranslationsEspañolPortuguêsHol...
98,Criminal,Britney Spears,1,62 ContributorsTranslationsDeutschTürkçeCrimin...


In [42]:
cleaned_lyrics['lyrics'] = cleaned_lyrics['lyrics'].str.replace(
    r'ContributorsTranslations.*?Lyrics', 'Lyrics', regex=True
)

# Step 3: Save the updated DataFrame to a new CSV
cleaned_lyrics.to_csv('../data/processed/final_lyrics.csv', index=False)

# Reload the cleaned CSV for verification
final_lyrics = pd.read_csv('../data/processed/final_lyrics.csv')
print(final_lyrics.head())

                        name         artist  gender  \
0  STAY (with Justin Bieber)  Justin Bieber       0   
1                      Ghost  Justin Bieber       0   
2              Love Yourself  Justin Bieber       0   
3                      Sorry  Justin Bieber       0   
4          Beauty And A Beat  Justin Bieber       0   

                                              lyrics  
0  1 ContributorToday’s Top Hits 11/5/21 (feat. P...  
1  116 Lyrics[Verse 1]\nYoungblood thinks there's...  
2  223 Lyrics[Verse 1]\nFor all the times that yo...  
3  202 Lyrics[Written by Julia Michaels, Justin T...  
4  134 Lyrics[Intro: Nicki Minaj]\nYeah\nYoung Mo...  


In [43]:
error_rows = final_lyrics[final_lyrics['lyrics'].str.contains('Error fetching lyrics|not found', na=False)]

# Display the rows with errors
print(error_rows)

                              name             artist  gender  \
20                    Shape of You         Ed Sheeran       0   
40  Happy - From "Despicable Me 2"  Pharrell Williams       0   
53                        Umbrella            Rihanna       1   
90                           Toxic     Britney Spears       1   
91           ...Baby One More Time     Britney Spears       1   

                                               lyrics  
20  Error fetching lyrics for 'Shape of You' by Ed...  
40  Lyrics for 'Happy - From "Despicable Me 2"' by...  
53  Error fetching lyrics for 'Umbrella' by Rihann...  
90  Error fetching lyrics for 'Toxic' by Britney S...  
91  Error fetching lyrics for '...Baby One More Ti...  


In [44]:
for index, row in error_rows.iterrows():
    try:
        # Fetch lyrics using the function
        final_lyrics.at[index, 'lyrics'] = get_song_lyrics(row['name'], row['artist'])
    except Exception as e:
        print(f"Failed to fetch lyrics for {row['name']} by {row['artist']}: {e}")

# Save the updated DataFrame back to the CSV
final_lyrics.to_csv('../data/processed/final_lyrics_updated.csv', index=False)

Searching for "Shape of You" by Ed Sheeran...
Done.
Searching for "Happy - From "Despicable Me 2"" by Pharrell Williams...
No results found for: 'Happy - From "Despicable Me 2" Pharrell Williams'
Searching for "Umbrella" by Rihanna...
Done.
Searching for "Toxic" by Britney Spears...
Done.
Searching for "...Baby One More Time" by Britney Spears...
Done.


In [45]:
error_rows = final_lyrics[final_lyrics['lyrics'].str.contains('Error fetching lyrics|not found', na=False)]

# Display the rows with errors
print(error_rows)

                              name             artist  gender  \
40  Happy - From "Despicable Me 2"  Pharrell Williams       0   

                                               lyrics  
40  Lyrics for 'Happy - From "Despicable Me 2"' by...  


In [46]:
happy = get_song_lyrics_with_variations("Happy", "Pharrell Williams")

final_lyrics_updated = pd.read_csv('../data/processed/final_lyrics_updated.csv')

# Check if the song exists in the DataFrame
row_index = final_lyrics_updated[
    (final_lyrics_updated['name'] == "Happy") & 
    (final_lyrics_updated['artist'] == "Pharrell Williams")
].index

# If the song is found, update its lyrics
if not row_index.empty:
    final_lyrics_updated.at[row_index[0], 'lyrics'] = lyrics
    print(f"Lyrics updated for: {"Happy"} by {"Pharrell Williams"}")
else:
    print(f"Song '{"Happy"}' by {"Pharrell Williams"} not found in the DataFrame.")

# Save the updated DataFrame
final_lyrics_updated.to_csv('../data/processed/final_lyrics_updated.csv', index=False)

Searching for: Happy
Searching for "Happy" by Pharrell Williams...
Done.
Found lyrics for: Happy
Song 'Happy' by Pharrell Williams not found in the DataFrame.


Here, I am manually looking at the rows where the lyrics were not available (not an error but an issue of availability). In the next chunk, I'll be removing these

In [None]:
# Function to fetch song lyrics using the first word of the song title
def get_song_lyrics_with_first_word(song_title, artist_name):
    # Extract the first word from the song title
    first_word = song_title.split(" ")[0]
    
    # Use the first word and artist to search for the song
    song = genius.search_song(f"{first_word} {artist_name}", artist_name)
    
    if song:
        lyrics = song.lyrics
        
        # Clean the lyrics: remove unwanted parts (e.g., contributor info, translations)
        cleaned_lyrics = re.sub(r'(\d+ Contributors|Translations.*?)(\n|$)', '', lyrics)
        
        # Optionally, you can remove any repeated phrases or patterns here
        cleaned_lyrics = re.sub(r'\[.*?\]', '', cleaned_lyrics)  # Remove [Chorus], [Verse 1], etc.
        
        return cleaned_lyrics
    else:
        return f"No lyrics found for: {song_title} by {artist_name}"

# Load the updated lyrics DataFrame
updated_lyrics_df = pd.read_csv('../data/processed/final_lyrics_updated.csv')

# Display the first few rows to check the data structure
print(updated_lyrics_df.head())

# List of indices to refetch
indices_to_refetch = [85, 86, 72, 68, 65, 49, 45, 41, 33, 31, 26, 8, 0]

# Function to refetch lyrics and update the 'lyrics' column for specific rows
def refetch_lyrics_for_top_tracks(df, indices):
    for index in indices:
        # Ensure index is within the DataFrame range
        if index < len(df):
            song_title = df.loc[index, 'name']  # Assuming 'name' is the column for song titles
            artist_name = df.loc[index, 'artist']  # Assuming 'artist' is the column for artist names
            
            # Fetch new lyrics using your function
            new_lyrics = get_song_lyrics_with_first_word(song_title, artist_name)
            
            # Update the 'lyrics' column with the fetched lyrics
            df.loc[index, 'lyrics'] = new_lyrics
        else:
            print(f"Index {index} is out of range.")
    
    return df

# Refetch and update the lyrics for the specific rows
updated_lyrics_df = refetch_lyrics_for_top_tracks(updated_lyrics_df, indices_to_refetch)

# Optionally, check the updated rows
updated_rows = updated_lyrics_df.iloc[indices_to_refetch]  # Get rows by the indices to refetch
print("Updated Rows:")
print(updated_rows[['name', 'artist', 'lyrics']])

# Save the updated DataFrame back to a CSV
updated_lyrics_df.to_csv("../data/processed/final_lyrics_final.csv", index=False)

lyrics = pd.read_csv('../data/processed/final_lyrics_final.csv')


                        name         artist  gender  \
0  STAY (with Justin Bieber)  Justin Bieber       0   
1                      Ghost  Justin Bieber       0   
2              Love Yourself  Justin Bieber       0   
3                      Sorry  Justin Bieber       0   
4          Beauty And A Beat  Justin Bieber       0   

                                              lyrics  
0  1 ContributorToday’s Top Hits 11/5/21 (feat. P...  
1  116 Lyrics[Verse 1]\nYoungblood thinks there's...  
2  223 Lyrics[Verse 1]\nFor all the times that yo...  
3  202 Lyrics[Written by Julia Michaels, Justin T...  
4  134 Lyrics[Intro: Nicki Minaj]\nYeah\nYoung Mo...  
Searching for "Swalla Nicki Minaj" by Nicki Minaj...
Done.
Searching for "Super Nicki Minaj" by Nicki Minaj...
Done.
Searching for "I Taylor Swift" by Taylor Swift...
Done.
Searching for "Dangerous Ariana Grande" by Ariana Grande...
Done.
Searching for "7 Ariana Grande" by Ariana Grande...
Done.
Searching for "Cash Pharrell Williams" by 

In [83]:
lyrics_df_selected = lyrics[['lyrics', 'gender']]

# Check the first few rows of the selected columns
print(lyrics_df_selected.head())

                                              lyrics  gender
0  1 ContributorToday’s Top Hits 11/5/21 (feat. P...       0
1  116 ContributorsTranslationsDeutschTürkçeEspañ...       0
2  223 ContributorsTranslationsEspañolPortuguêsDe...       0
3  202 ContributorsTranslationsEspañolPortuguêsFr...       0
4  134 ContributorsTranslationsУкраїнськаBeauty a...       0


In [None]:
# Preprocessing lyrics to fix anomalies
lyrics_df_selected.loc[:, 'cleaned_lyrics'] = lyrics_df_selected['lyrics'].apply(preprocess_lyrics)

In [88]:
male_artists_df = lyrics_df_selected[lyrics_df_selected['gender'] == 0]
female_artists_df = lyrics_df_selected[lyrics_df_selected['gender'] == 1]

Click [here](https://github.com/lse-ds105/w10-summative-deyavuz/tree/main?tab=readme-ov-file#table-of-contents) to navigate back to the Table of Contents!

In [None]:
# Step 2: Create an SQLAlchemy engine to connect to the SQLite database (or create it if it doesn't exist)
engine = create_engine('sqlite:///../data/spotify_data.db', echo=True)

# Step 3: Save the DataFrame to the SQLite database (table name: 'top_hits')
playlists_clean.to_sql('playlists', engine, if_exists='replace', index=False)

Playlists table schema: [(0, 'Track Name', 'TEXT', 0, None, 0), (1, 'Track ID', 'TEXT', 0, None, 0), (2, 'Artists', 'TEXT', 0, None, 0), (3, 'Artist Count', 'BIGINT', 0, None, 0)]


OperationalError: foreign key mismatch - "top_tracks_new" referencing "playlists"

In [None]:
engine = create_engine('sqlite:///../data/spotify_data.db', echo=True)

# Step 3: Save the DataFrame to the SQLite database (table name: 'top_tracks')
final_lyrics_updated.to_sql('top_tracks', engine, if_exists='replace', index=False)

print("Table 'top_tracks' has been created successfully in the database.")