# Spotify Unwrapped 2024
Copy and paste Spotify and Genius API credentials below.

In [None]:
# API credentials.
# Either paste each value between the quotation marks below, or leave the
# quotes empty and set the matching environment variable before starting
# Jupyter so that no secret is saved inside the notebook file.
import os

# spotify api credentials
spotify_client_id = '' # client ID (make sure it's in quotation marks)
spotify_client_secret = '' # client secret (make sure it's in quotation marks)

# genius api credentials
genius_access_token = '' # access token (make sure it's in quotation marks)

# A value pasted above always wins; the environment is only read for values
# that were left empty.
spotify_client_id = spotify_client_id or os.environ.get('SPOTIPY_CLIENT_ID', '')
spotify_client_secret = spotify_client_secret or os.environ.get('SPOTIPY_CLIENT_SECRET', '')
genius_access_token = genius_access_token or os.environ.get('GENIUS_ACCESS_TOKEN', '')

Then run that cell (click into it and press Shift+Enter) and the one below it. A new browser tab should open and prompt you to log in to Spotify. Click "Authorize", then copy the URL of the page you are redirected to and paste it into the pop-up that appears below this cell. The redirected page will show an error such as "This page could not be opened"; that is expected.

If the cell runs without prompting you and without printing anything, that is fine too (most likely a saved login from a previous run was reused). Just continue.

In [None]:
import spotipy as spt
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials

# spotify client credentials
# The OAuth manager is built on its own first and then handed to the client.
spotify_auth = SpotifyOAuth(
    client_id=spotify_client_id,
    client_secret=spotify_client_secret,
    redirect_uri='http://localhost/callback',
    scope='user-top-read playlist-read-private user-library-read',
    cache_path='.cache-username',
)
spu = spt.Spotify(auth_manager=spotify_auth)

# First request made with the client: asks for a single playlist of the
# logged-in user. The response is read later to choose the playlist to analyse.
# NOTE(review): presumably this request is what triggers the browser login
# described above — confirm.
results = spu.current_user_playlists(limit=1)

Now click on the "1. Get data from Spotify" heading below, then go to the top menu bar and choose Run -> Run Selected Cell and All Below. This will take some time, especially fetching all the lyrics from Genius. Expect images to start appearing in the `output_unwrapped` folder within a few minutes.

### 1. Get data from Spotify

In [None]:
# import packages
import pandas as pd
import numpy as np
import lyricsgenius
from textblob import TextBlob
import re
import joblib
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from PIL import Image, ImageDraw, ImageFont
import requests
from io import BytesIO
import os
import time

In [None]:
# Choose the playlist to analyse from the playlists returned by the login cell.
# That cell requested only one playlist, so this is simply the first playlist
# the API returns for the user.
# NOTE(review): presumably that is expected to be the "top songs 2024"
# playlist — confirm it is the first one in your library.
playlist_uris = [item['uri'] for item in results['items'] if item is not None]

# Fail here with a clear message rather than with a NameError (or a stale
# value from an earlier run) in a later cell.
if not playlist_uris:
    raise ValueError("Spotify returned no playlists for this account; cannot pick a playlist to analyse.")

# Same choice as iterating and keeping the last non-empty entry.
playlist_id = playlist_uris[-1]

In [None]:
# getting information about these songs from spotify api

def get_playlist_songs(playlist):
    """Collect basic metadata for every track in a Spotify playlist.

    Pages through ``spu.playlist_items`` 100 items at a time until the
    ``total`` reported by the API has been covered. Entries whose ``track`` is
    ``None`` and entries of type ``'episode'`` are skipped. Only the first
    listed artist of each track is recorded.

    Parameters
    ----------
    playlist : str
        Playlist identifier, passed unchanged to ``spu.playlist_items``.

    Returns
    -------
    pandas.DataFrame
        One row per kept track with columns ``name``, ``id``, ``artist``,
        ``artist_id``, ``popularity``, ``release_date`` and ``length``
        (``length`` is the track's ``duration_ms``).
    """
    page_size = 100
    columns = ['name', 'id', 'artist', 'artist_id', 'popularity', 'release_date', 'length']

    rows = []
    offset = 0
    while True:
        data = spu.playlist_items(playlist, limit=page_size, offset=offset)

        for item in data['items']:
            track = item['track']
            if track is None or track['type'] == 'episode':
                continue
            lead_artist = track['artists'][0]
            rows.append({
                'name': track['name'],
                'id': track['id'],
                'artist': lead_artist['name'],
                'artist_id': lead_artist['id'],
                'popularity': track['popularity'],
                'release_date': track['album']['release_date'],
                'length': track['duration_ms'],
            })

        offset += page_size
        # Stop once the reported total is covered. The empty-page check keeps
        # the loop from spinning if `total` is ever larger than what is served.
        if offset >= data['total'] or not data['items']:
            break

    # Passing `columns` keeps the schema stable even when no track was kept.
    return pd.DataFrame(rows, columns=columns)


In [None]:
# Track metadata for the selected playlist, one row per kept playlist entry.
# Despite the name, no de-duplication is applied here.
unique_songs_df = get_playlist_songs(playlist_id)

In [None]:
# getting artist data like genre and popularity from these songs

# De-duplicated artist ids, so each artist is looked up only once no matter
# how many of their songs are in the playlist.
distinct_artists = unique_songs_df.artist_id.unique().tolist()

def artist_analysis(artist_id_list):
    """Look up each artist on Spotify and list their genres.

    Calls ``spu.artist`` once per id and emits one row per (artist, genre)
    pair. An artist whose ``genres`` list is empty therefore contributes no
    rows at all.

    All state is local to the call, so calling the function again (or
    re-running the cell) never carries rows over from an earlier call.

    Parameters
    ----------
    artist_id_list : iterable of str
        Spotify artist ids to look up.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist_id``, ``artist`` (name), ``artist_popularity`` and
        ``genre``.
    """
    columns = ['artist_id', 'artist', 'artist_popularity', 'genre']

    rows = []
    for artist_id in artist_id_list:
        artist = spu.artist(artist_id)
        for genre in artist['genres']:
            rows.append({
                'artist_id': artist_id,
                'artist': artist['name'],
                'artist_popularity': artist['popularity'],
                'genre': genre,
            })

    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=columns)

# One row per (artist, genre) pair; the image cells further down read this
# frame for the genre slides and for the least popular artist.
genres = artist_analysis(distinct_artists)

### 2. Get lyrics from Genius

In [None]:
# genius api credentials login
LyricsGenius = lyricsgenius.Genius(
    genius_access_token,
    verbose=True,
    skip_non_songs=True,
    timeout=15,
)

# Substrings used by genius_search: any lyric line containing one of these is
# left out of the returned text.
keywords = [
    'chorus',
    'instrumental',
    'bridge',
    'verse',
    'embed',
    'lyrics',
    'outro',
    'intro',
]

# Placeholder only; a later cell replaces it with the real lyrics DataFrame.
lyrics_df = []

# searching for lyrics for each song

def genius_search(song, artist, max_retries=3):
    """Fetch a song's lyrics from Genius as one lower-cased string.

    The lyrics are lower-cased and split on newlines. Every line containing
    one of the ``keywords`` substrings is dropped; the remaining lines are
    concatenated, each followed by a single space. The keyword test is a plain
    substring match, so a line containing e.g. "universe" is dropped because
    it contains "verse".

    Parameters
    ----------
    song : str
        Song title to search for.
    artist : str
        Artist name to search for.
    max_retries : int, optional
        Number of attempts made when the request itself fails. Defaults to 3.

    Returns
    -------
    str
        The filtered lyrics, or ``''`` when the song was not found, has no
        lyrics, or every attempt failed.
    """
    for _attempt in range(max_retries):
        try:
            track = LyricsGenius.search_song(song, artist)
        except (TimeoutError, requests.exceptions.RequestException):
            # Failures raised through `requests` (timeouts, dropped
            # connections, HTTP errors) are not subclasses of the built-in
            # TimeoutError, so they are caught explicitly; otherwise a single
            # failed request would abort the whole lyrics run.
            continue

        # A search that returned nothing is a result, not a failure: repeating
        # the identical search only spends more API calls.
        if track is None or not track.lyrics:
            return ''

        kept_lines = [
            line
            for line in track.lyrics.lower().split('\n')
            if not any(keyword in line for keyword in keywords)
        ]
        return ''.join(line + ' ' for line in kept_lines)

    # every attempt failed
    return ''

In [None]:
# creating dataframe with all song lyrics
# Rows are collected in a plain list and the DataFrame is built once at the
# end; concatenating onto a growing frame inside the loop copies every earlier
# row on each iteration.
lyric_rows = []

for song, artist in unique_songs_df[['name','artist']].itertuples(index=False):
    lyrics = genius_search(song, artist)
    # Pause between songs. NOTE(review): presumably to stay polite to the
    # Genius API — confirm the interval.
    time.sleep(1)

    lyric_rows.append({'song': song, 'artist': artist, 'lyrics': lyrics})

# Passing `columns` keeps the schema stable even for an empty playlist.
lyrics_df = pd.DataFrame(lyric_rows, columns=['song', 'artist', 'lyrics'])


### 3. Feature engineering and modeling on Genius lyrics

In [None]:
# sentiment analysis on lyrics using pre-trained model

def get_sentiment(text):
    """Return the TextBlob polarity of ``text``.

    The score lies between -1 (negative) and +1 (positive).
    """
    return TextBlob(text).sentiment.polarity

# One polarity score per row of lyrics.
lyrics_df['sentiment'] = lyrics_df['lyrics'].apply(get_sentiment)


In [None]:
# feature engineering on lyrics for clustering
# genius sometimes has random novels/lists instead of lyrics so we're using clustering to automatically remove them

# Keep only songs for which lyrics were found. Only the `lyrics` column is
# inspected, and `.copy()` yields an independent frame so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
has_lyrics = lyrics_df['lyrics'].notna() & (lyrics_df['lyrics'] != '')
lyrics_df = lyrics_df.loc[has_lyrics].copy()


def count_numbers(text):
    """Return how many runs of consecutive digits ``text`` contains."""
    return len(re.findall(r'\d+', text))

# Apply function to each row of the 'lyrics' column
lyrics_df['num_numbers'] = lyrics_df['lyrics'].apply(count_numbers)

def count_dashes(text):
    """Return how many times a dash surrounded by spaces (' - ') occurs in ``text``."""
    return len(re.findall(r' - ', text))

lyrics_df['num_dashes'] = lyrics_df['lyrics'].apply(count_dashes)

def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

lyrics_df['num_words'] = lyrics_df['lyrics'].apply(count_words)

def count_periods(text):
    """Return the number of '.' characters in ``text``."""
    return len(re.findall(r'\.', text))

lyrics_df['num_period'] = lyrics_df['lyrics'].apply(count_periods)

# Despite its name this is words per number: num_words / num_numbers, with the
# denominator forced to 1 when there are no numbers. The column name is left
# unchanged because the clustering cell selects the feature by this name.
lyrics_df['num_per_word'] = lyrics_df['num_words'] / np.where(lyrics_df['num_numbers'] == 0, 1, lyrics_df['num_numbers'])

# Drop entries whose text has no words at all (e.g. only whitespace), again as
# an independent copy so later cells can add columns safely.
lyrics_df = lyrics_df[lyrics_df['num_words'] != 0].copy()

In [None]:
# importing my saved clustering model to remove these entries

# NOTE: joblib.load unpickles arbitrary Python objects, so only load model
# files that come from a source you trust.
kmeans = joblib.load('kmeans_model.pkl')
scaler = joblib.load('scaler.pkl')

# Features handed to the saved scaler and k-means model, in this order.
feature_columns = ['num_numbers','num_dashes','num_words','num_period','num_per_word']

# `.copy()` makes this an independent frame, so adding the cluster column
# below does not raise SettingWithCopyWarning.
new_df = lyrics_df[feature_columns].copy()

scaled_new_features = scaler.transform(new_df)

new_clusters = kmeans.predict(scaled_new_features)

# add the cluster labels to the new dataset
new_df['cluster'] = new_clusters
# `assign` returns a new frame rather than writing into what may be a filtered
# slice of an earlier frame; labels are matched to rows by position.
lyrics_df = lyrics_df.assign(cluster=new_clusters)


In [None]:
# keeping only my 0-cluster (true lyrics)
# Rows labelled cluster 0 are kept and renumbered from 0.
final_lyrics = lyrics_df.loc[lyrics_df['cluster'].eq(0)].reset_index(drop=True)

In [32]:
# importing my saved nlp classification model

# Both the classification model and its tokenizer are loaded from the same
# Hugging Face Hub repository.
repo_id = "emmakrentz/bert-classification-model"  # Hub repository the model and tokenizer are loaded from

# from_pretrained downloads the files on first use (see the progress output
# below this cell).
model = BertForSequenceClassification.from_pretrained(repo_id)
tokenizer = BertTokenizer.from_pretrained(repo_id)

print("Model and tokenizer successfully loaded!")



config.json:   0%|          | 0.00/919 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Model and tokenizer successfully loaded!


In [None]:
# nlp classification of song lyrics to seasons

import torch

# Class index produced by the model -> season name.
label_mapping = {0: 'fall', 1: 'spring', 2: 'summer', 3: 'winter'}

def predict_lyrics(texts):
    """Classify a batch of lyrics into seasons.

    Parameters
    ----------
    texts : list of str
        Lyrics to classify. They are tokenized together, padded to a common
        length and truncated to at most 256 tokens.

    Returns
    -------
    tuple
        ``(labels, probabilities)``: ``labels`` holds one season name per
        input text (looked up in ``label_mapping``), and ``probabilities``
        holds one list of softmax probabilities per input text.
    """
    encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax over the class dimension; the predicted class is the most probable one.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    class_ids = torch.argmax(probabilities, dim=1).tolist()

    season_names = [label_mapping[class_id] for class_id in class_ids]
    return season_names, probabilities.tolist()

# run predictions on lyrics
texts = final_lyrics["lyrics"].tolist()
batch_size = 16
predicted_labels = []
all_probabilities = []

# Classify the lyrics in consecutive slices of `batch_size` songs and collect
# the results in the original song order.
for start in range(0, len(texts), batch_size):
    labels, probabilities = predict_lyrics(texts[start:start + batch_size])
    predicted_labels += labels
    all_probabilities += probabilities

# add predictions back to df
final_lyrics["predicted_season"] = predicted_labels
final_lyrics["prob"] = all_probabilities

In [None]:
# determine what most-listened to season was

# number of songs predicted for each season
group_sizes = final_lyrics.groupby('predicted_season').size()

# season label with the highest song count
largest_group = group_sizes.idxmax()

# every song predicted to belong to that season
largest_group_data = final_lyrics.loc[final_lyrics['predicted_season'].eq(largest_group)]


In [None]:
# Artists counted by the GAYOMETER cell at the end of the notebook: every
# playlist track whose artist name appears in this list counts toward the
# percentage shown on that slide.

gay_artists = ['Chappell Roan','girl in red','MUNA','Charli XCX','Whitney Houston','Mariah Carey',
               'Britney Spears','Sidney Gish','The Last Dinner Party','Tegan and Sara','Kylie Minogue']

### 4. Creating output images based on data results

In [None]:
# GENRES

# Template to draw on and where the finished slide is written
input_image_path = "templates_unwrapped/9.png"
output_image_path = "output_unwrapped/06.png"

# Open the image
image = Image.open(input_image_path)

# Create a drawing object
draw = ImageDraw.Draw(image)

# Define the text, font, and position
font_path = 'Recoleta-Bold.ttf'  # path to the font file used for the genre names
font_size = 90
font = ImageFont.truetype(font_path, font_size)

# Genres ordered by their number of rows in `genres`, computed once.
# NOTE(review): ascending=True draws the three genres with the FEWEST rows,
# while the TOP GENRE cell sorts descending — confirm this is intended.
genre_counts = genres.groupby('genre').size().sort_values(ascending=True)

height = [680,1050,1420]
max_line_chars = 13  # longer names are split over two lines

# zip stops early when fewer than three genres exist instead of raising.
for y, genre in zip(height, genre_counts.index[:3]):
    # first word / everything after it
    words = genre.split(maxsplit=1)
    if len(genre) <= max_line_chars or len(words) == 1:
        # Short name, or one long token with no space to break at (e.g. a
        # hyphenated genre): draw it on a single line.
        draw.text((420,y), genre, fill="white", font=font)
    else:
        # First word above the slot, the rest of the name below it, so no
        # words are dropped.
        draw.text((420,y-40), words[0], fill="white", font=font)
        draw.text((420,y+40), words[1], fill="white", font=font)

# Save the edited image
image.save(output_image_path)

print(f"Edited image saved as {output_image_path}")


In [None]:
# TOP GENRE

input_image_path = "templates_unwrapped/11.png"
output_image_path = "output_unwrapped/07.png"

# Open the image
image = Image.open(input_image_path)

# Create a drawing object
draw = ImageDraw.Draw(image)

# Define the text, font, and position
font_path = 'Recoleta-Bold.ttf'  # path to the font file used for the genre name
font_size = 130
font = ImageFont.truetype(font_path, font_size)

# Genre with the most rows in `genres`, computed once.
top_genre = genres.groupby('genre').size().sort_values(ascending=False).index[0]

# first word / everything after it
words = top_genre.split(maxsplit=1)
if len(top_genre) <= 13 or len(words) == 1:
    # Short name, or one long token with no space to break at (e.g. a
    # hyphenated genre): draw it on a single line.
    draw.text((250,1150), top_genre, fill="white", font=font)
else:
    # First word on the first line, the rest of the name on the second, so no
    # words are dropped.
    draw.text((250,1150), words[0], fill="white", font=font)
    draw.text((250,1250), words[1], fill="white", font=font)

# Save the edited image
image.save(output_image_path)

print(f"Edited image saved as {output_image_path}")


In [None]:
# SENTIMENT

input_image_path = "templates_unwrapped/13.png"
output_image_path = "output_unwrapped/08.png"

# Open the image
image = Image.open(input_image_path)

# Create a drawing object
draw = ImageDraw.Draw(image)

# Define the text, font, and position
font_path = 'Recoleta-Bold.ttf'  # path to the font file used for the percentage
font_size = 130
font = ImageFont.truetype(font_path, font_size)

# Net polarity as a share of total absolute polarity, in percent: +100 when
# every scored song is positive, -100 when every one is negative.
# NOTE(review): this reads `lyrics_df`, which still contains the entries that
# the clustering step excludes from `final_lyrics` — confirm that is intended.
sentiment = lyrics_df['sentiment']
total_magnitude = sentiment.abs().sum()

# When every polarity is exactly 0 (or there are no songs) the ratio would be
# 0/0 = NaN, which round() cannot convert; show 0% instead.
sentiment_pct = round(sentiment.sum() / total_magnitude * 100) if total_magnitude else 0

draw.text((185,620), str(sentiment_pct)+'%', fill="white", font=font)

# Save the edited image
image.save(output_image_path)

print(f"Edited image saved as {output_image_path}")


In [None]:
# HAPPY AND SAD SONGS
input_image_path = "templates_unwrapped/15.png"
output_image_path = "output_unwrapped/09.png"

# Open the image
image = Image.open(input_image_path)

# Create a drawing object
draw = ImageDraw.Draw(image)

# Define the text, font, and position
font_path = "GothamBold.ttf"  # path to the font file used for titles and artists
font_size = 55
font = ImageFont.truetype(font_path, font_size)

artist_font_size = 30
artist_font = ImageFont.truetype(font_path, artist_font_size)

# Sort once per direction instead of re-sorting the frame for every text call.
happiest = lyrics_df.sort_values(by='sentiment',ascending=False).reset_index(drop=True)
saddest = lyrics_df.sort_values(by='sentiment',ascending=True).reset_index(drop=True)

height = [370,520,670,820]
# Draw at most four songs per list; with fewer songs available the remaining
# slots stay empty instead of raising a KeyError.
for x in range(min(len(height), len(lyrics_df))):
    y = height[x]
    # most positive songs: title, then artist in the smaller font below it
    draw.text((130,y), happiest.loc[x,'song'], fill="white", font=font)
    draw.text((130,y+65), happiest.loc[x,'artist'], fill="white", font=artist_font)
    # most negative songs, 800 px further down the template
    draw.text((130,y+800), saddest.loc[x,'song'], fill="white", font=font)
    draw.text((130,y+865), saddest.loc[x,'artist'], fill="white", font=artist_font)

# Save the edited image
image.save(output_image_path)

print(f"Edited image saved as {output_image_path}")

In [None]:
# SEASON

# Template image for each predicted season. The predicted label 'fall' uses
# the template file named "autumn".
season_templates = {
    'winter': "templates_unwrapped/winter.png",
    'fall': "templates_unwrapped/autumn.png",
    'spring': "templates_unwrapped/spring.png",
    'summer': "templates_unwrapped/summer.png",
}

# Season with the most predicted songs, computed once.
top_season = final_lyrics.groupby('predicted_season').size().idxmax()

# An unexpected label raises a KeyError here instead of silently reusing the
# image paths left over from the previous cell.
input_image_path = season_templates[top_season]
output_image_path = "output_unwrapped/03.png"

# Open the image
image = Image.open(input_image_path)

# Create a drawing object
draw = ImageDraw.Draw(image)

# Define the text, font, and position
font_path = "GothamBold.ttf"  # path to the font file used for titles and artists
font_size = 55
font = ImageFont.truetype(font_path, font_size)

artist_font_size = 30
artist_font = ImageFont.truetype(font_path, artist_font_size)

# Songs of the dominant season, renumbered from 0 so they can be read by position.
season_songs = largest_group_data.reset_index(drop=True)

height = [850,1000,1150,1300]
# Draw at most four songs; with fewer songs in the season the remaining slots
# stay empty instead of raising a KeyError.
for x in range(min(len(height), len(season_songs))):
    draw.text((220,height[x]), season_songs.loc[x,'song'], fill="white", font=font)
    draw.text((220,height[x]+65), season_songs.loc[x,'artist'], fill="white", font=artist_font)

# Save the edited image
image.save(output_image_path)

print(f"Edited image saved as {output_image_path}")

In [None]:
# LEAST POPULAR ARTIST

# Input and output paths
input_image_path = "templates_unwrapped/7.png"
output_image_path = "output_unwrapped/05.png"
font_path = 'Recoleta-Bold.ttf'  # Ensure the font file is in the correct location

# Row of `genres` with the lowest artist_popularity
artist_data = genres.sort_values(by='artist_popularity').iloc[0]
least_popular_name = artist_data['artist']

# Some artists have no images at all; in that case the slide is produced
# without a picture instead of failing on an empty list.
artist_images = spu.artist(artist_data['artist_id'])['images']
url_image_url = artist_images[0]['url'] if artist_images else None

try:
    # Check the font first so a missing file is reported before any download.
    if not os.path.exists(font_path):
        raise FileNotFoundError(f"Font file not found: {font_path}")

    # Open the main image
    image = Image.open(input_image_path)
    draw = ImageDraw.Draw(image)

    if url_image_url is not None:
        # Fetch the overlay image from the URL; the timeout keeps the notebook
        # from hanging indefinitely on a stalled connection.
        response = requests.get(url_image_url, timeout=10)
        response.raise_for_status()  # Check for HTTP errors
        overlay_image = Image.open(BytesIO(response.content))

        # Resize the overlay to the 500x500 area used on the template
        overlay_image = overlay_image.resize((500, 500))

        # Define overlay position and paste
        overlay_position = (300, 650)
        image.paste(overlay_image, overlay_position)

    # Load font
    font_size = 80
    font = ImageFont.truetype(font_path, font_size)

    # Draw artist name
    text_position = (300, 1180)
    draw.text(text_position, least_popular_name, fill="white", font=font)

    # Save the edited image
    image.save(output_image_path)
    print(f"Edited image saved as {output_image_path}")

except Exception as e:
    # Best effort: report the problem and let the remaining cells run.
    print(f"An error occurred: {e}")


In [None]:
# GAYOMETER

input_image_path = "templates_unwrapped/18.png"
output_image_path = "output_unwrapped/11.png"

# Open the image
image = Image.open(input_image_path)

# Create a drawing object
draw = ImageDraw.Draw(image)

# Define the text, font, and position
font_path = 'Recoleta-Bold.ttf'  # path to the font file used for the percentage
font_size = 140
font = ImageFont.truetype(font_path, font_size)

# Share of playlist tracks whose artist is in `gay_artists`, in percent.
# Names are compared case-insensitively so that a difference in
# capitalisation between the list and Spotify does not hide a match.
playlist_artists = unique_songs_df.artist.tolist()
listed_names = {name.casefold() for name in gay_artists}
match_count = sum(str(name).casefold() in listed_names for name in playlist_artists)

# An empty playlist would otherwise divide by zero; show 0% instead.
gay_pct = round(match_count / len(playlist_artists) * 100) if playlist_artists else 0

draw.text((700,380), str(gay_pct)+'%', fill="white", font=font)

# Save the edited image
image.save(output_image_path)

print(f"Edited image saved as {output_image_path}")
