<h1>Environment Setup</h1>

<strong>Install Required Packages:</strong> Install necessary packages using pip.

<strong>Load Environment Variables:</strong> Load environment variables for Spotify API.

In [None]:
# Install the project's dependencies into the running kernel's environment.
# NOTE(review): the requirements path is an absolute path on one specific
# machine; presumably it should point at the repository's own
# requirements.txt -- confirm before running elsewhere.
%pip install -r C:\Users\ezrag\OneDrive\Documents\GitHub\spotify-listening-data\requirements.txt

In [2]:
import os
from dotenv import load_dotenv # type: ignore

# Pull any variables defined in a local .env file into the process environment
load_dotenv()

# Spotify API credentials; each is None when the variable is not set
SPOTIFY_CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID')
SPOTIFY_CLIENT_SECRET = os.environ.get('SPOTIFY_CLIENT_SECRET')

In [52]:
# Import necessary libraries
import ast
import json
import os
import queue
import random
import threading
import time
from datetime import datetime, timedelta
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from dotenv import load_dotenv
from itertools import combinations 
from collections import Counter

<h1>Utility Functions and Initialization</h1>

<h3>User Input Functions</h3>
<strong>Get User ID:</strong> Function to get user ID from input.

<strong>Get Number of Data Chunks:</strong> Function to get the number of data chunks from input.

In [4]:
def get_user_id():
    """
    Ask for a user ID on standard input and normalise its case.

    The text typed by the user is returned with every character lowercased;
    no other cleaning (such as whitespace stripping) is performed.

    Returns:
    str: The entered user ID, lowercased.
    """
    typed_id = input("Enter the user's ID: ")
    return typed_id.lower()

def get_num_chunks():
    """
    Ask for the number of data chunks on standard input.

    The typed text is converted with int(), so non-numeric input raises
    ValueError.

    Returns:
    int: The number of data chunks entered by the user.
    """
    typed_count = input("Enter the number of chunks: ")
    return int(typed_count)

<h3>Data Reading and Processing</h3>
<strong>Read and Process Data:</strong> Function to read and process data from multiple JSON files.

<strong>Export Data to CSV:</strong> Function to export processed data to a CSV file.

<strong>Track Unique Songs:</strong> Function to track unique songs and update unique songs list.

<strong>Safe Literal Eval:</strong> Function to safely evaluate literals from strings.

<strong>Expand Artists Involved:</strong> Function to expand artists involved in each track.

In [5]:
def read_and_process_data(user_id, num_chunks, base_path='wrapped_files/'):
    """
    Load a user's listening history from numbered JSON chunk files.

    Chunk ``i`` is expected at ``<base_path>/<user_id>_music_<i>.json`` for
    ``i`` in ``range(num_chunks)``. Missing chunks are reported and skipped.
    The records of all chunks that exist are concatenated into one DataFrame,
    a 'user_id' column is added, and 'endTime' is parsed to datetimes.

    Parameters:
    user_id (str): The user's ID (used in the file names and the new column).
    num_chunks (int): How many chunk indices to look for, starting at 0.
    base_path (str, optional): Directory containing the JSON files.
                               Defaults to 'wrapped_files/'.

    Returns:
    pandas.DataFrame: The combined listening records.

    Raises:
    ValueError: If no records were loaded (no files found, or all empty).
    """
    records = []
    chunk_paths = (
        os.path.join(base_path, f'{user_id}_music_{chunk_no}.json')
        for chunk_no in range(num_chunks)
    )

    for chunk_path in chunk_paths:
        if os.path.exists(chunk_path):
            with open(chunk_path, 'r', encoding='utf-8') as handle:
                records.extend(json.load(handle))
        else:
            print(f"File not found: {chunk_path}")

    if not records:
        raise ValueError("No data files were found or all were empty.")

    frame = pd.DataFrame(records).assign(
        user_id=user_id,
        endTime=lambda loaded: pd.to_datetime(loaded['endTime']),
    )

    print(f"Data read successfully for {len(frame)} records.")
    return frame

def export_to_csv(df, user_id):
    """
    Write a DataFrame to '<user_id>_listening_data.csv'.

    The file is created relative to the current working directory and the
    DataFrame index is not written.

    Parameters:
    df (pandas.DataFrame): The data to export.
    user_id (str): The user's ID, used as the file name prefix.

    Returns:
    None
    """
    destination = f'{user_id}_listening_data.csv'
    df.to_csv(path_or_buf=destination, index=False)
    print(f"Data exported to {destination}")

def track_unique_songs(df, unique_songs_file):
    """
    Merge the distinct songs of a listening DataFrame into the unique-songs CSV.

    Any of 'trackName', 'artistName' and 'external_urls' that is missing from
    ``df`` is added to it (filled with None) -- note this modifies the caller's
    DataFrame. The distinct combinations of those three columns are then
    appended to whatever is already stored in ``unique_songs_file`` (if the
    file exists), exact duplicate rows are removed, and the result is written
    back to the same file.

    Parameters:
    df (pandas.DataFrame): The listening data to take songs from.
    unique_songs_file (str): Path of the CSV file holding the unique songs.

    Returns:
    None
    """
    key_columns = ['trackName', 'artistName', 'external_urls']

    # Add whichever key columns the incoming frame lacks
    absent = [name for name in key_columns if name not in df.columns]
    for name in absent:
        df[name] = None

    # Distinct songs within this frame only
    fresh_songs = df[key_columns].drop_duplicates()
    print(f"Tracking {len(fresh_songs)} unique songs.")

    try:
        known_songs = pd.read_csv(unique_songs_file)
    except FileNotFoundError:
        # First run: nothing stored yet
        known_songs = pd.DataFrame(columns=key_columns)
        print("No existing unique songs file found. Starting fresh.")
    else:
        print(f"Loaded {len(known_songs)} existing unique songs.")

    # NOTE(review): duplicates are detected across ALL columns of the combined
    # frame, so a stored row that carries extra columns will not match a fresh
    # row for the same song -- confirm this is intended.
    merged_songs = pd.concat([known_songs, fresh_songs]).drop_duplicates()

    merged_songs.to_csv(unique_songs_file, index=False)
    print(f"Updated unique songs saved to {unique_songs_file}.")


def safe_literal_eval(val):
    """
    Parse a string holding a Python literal, falling back to the input itself.

    The value is handed to ast.literal_eval. When that raises ValueError or
    SyntaxError (for example because the text is not a literal), the value is
    returned unchanged instead.

    Parameters:
    val (str): The text to parse.

    Returns:
    object: The parsed Python object, or ``val`` itself if parsing failed.
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return val
    return parsed

def expand_artists_involved(df):
    """
    Expand and standardize the list of artists involved in each track.

    For every row the 'artists_involved' value is turned into a list of
    artist names: a real list/tuple is used as is, a string (e.g. a list that
    was round-tripped through CSV) is parsed with ast.literal_eval, and
    anything else (NaN, None, unparseable text) becomes an empty list. The
    row's 'artistName' is appended when it is not already in the list.

    Three columns are then derived from the result:
    'standardized_artists' (lowercased names), 'standardized_artists_str'
    (those names joined with ', ') and 'original_artists' (the names with
    their original casing).

    The input DataFrame and the list objects stored in it are left untouched.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'artists_involved' and 'artistName' columns.

    Returns:
    pandas.DataFrame: Updated copy with expanded and standardized artist names.
    """
    df_copy = df.copy()  # Copy the DataFrame to avoid modifying the original

    def parse_artists(val):
        # Strings are expected to hold a list literal such as "['A', 'B']"
        if isinstance(val, str):
            try:
                val = ast.literal_eval(val)
            except (ValueError, SyntaxError):
                return []
        if isinstance(val, (list, tuple)):
            # Keep only real names so the lowercasing below cannot fail
            return [artist for artist in val if isinstance(artist, str)]
        return []

    def with_main_artist(artists, main_artist):
        # Build a new list: DataFrame.copy() does not copy the list objects
        # held in cells, so appending in place would alter the caller's data.
        combined = list(artists)
        if isinstance(main_artist, str) and main_artist not in combined:
            combined.append(main_artist)
        return combined

    expanded = [
        with_main_artist(parse_artists(raw), main_artist)
        for raw, main_artist in zip(df_copy['artists_involved'], df_copy['artistName'])
    ]
    df_copy['artists_involved'] = pd.Series(expanded, index=df_copy.index, dtype=object)

    df_copy['standardized_artists'] = df_copy['artists_involved'].apply(lambda x: [artist.lower() for artist in x])
    df_copy['standardized_artists_str'] = df_copy['standardized_artists'].apply(lambda x: ', '.join(x))  # Convert lists to strings
    df_copy['original_artists'] = df_copy['artists_involved']  # Preserve original names

    return df_copy



def check_album_art_exists(artist_name, folder='albums'):
    """
    Report whether '<folder>/<artist_name>.jpg' is an existing regular file.

    Parameters:
    artist_name (str): The name of the artist.
    folder (str): The folder to look in. Defaults to 'albums'.

    Returns:
    bool: True if the image file exists, False otherwise.
    """
    art_path = "{}/{}.jpg".format(folder, artist_name)
    return os.path.isfile(art_path)

def fetch_album_art(artist_name, token, folder='albums'):
    """
    Fetch an artist image from the Spotify API and save it as '<folder>/<artist_name>.jpg'.

    The artist is looked up through the search endpoint and the first image
    of the first match is downloaded. The query is passed via ``params`` so
    that names containing spaces, '&', '#' and similar characters are
    URL-encoded correctly. Failed requests and empty results are reported
    with a printed message and nothing is written. The target folder is
    created if it does not exist yet.

    Parameters:
    artist_name (str): The name of the artist.
    token (str): Spotify API access token.
    folder (str): The folder to save the album art. Defaults to 'albums'.

    Returns:
    None

    Raises:
    requests.exceptions.RequestException: If a request cannot be completed
        (connection error or timeout).
    """
    request_timeout = 10  # seconds; avoids hanging forever on a stalled connection
    search_url = 'https://api.spotify.com/v1/search'
    headers = {'Authorization': f'Bearer {token}'}
    params = {'q': artist_name, 'type': 'artist', 'limit': 1}

    response = requests.get(search_url, headers=headers, params=params, timeout=request_timeout)
    if response.status_code != 200:
        # Error payloads have no 'artists' key, so stop before indexing into them
        print(f"Spotify search for {artist_name} failed with status code {response.status_code}.")
        return

    items = response.json().get('artists', {}).get('items', [])
    if not items:
        print(f"No artist found for {artist_name}.")
        return

    images = items[0].get('images')
    if not images:
        print(f"No album art found for {artist_name}.")
        return

    image_response = requests.get(images[0]['url'], timeout=request_timeout)
    if image_response.status_code != 200:
        # Do not save an error page as if it were an image
        print(f"Downloading album art for {artist_name} failed with status code {image_response.status_code}.")
        return

    os.makedirs(folder, exist_ok=True)
    with open(f"{folder}/{artist_name}.jpg", 'wb') as f:
        f.write(image_response.content)
    print(f"Album art for {artist_name} saved.")

def ensure_album_art(artist_name, folder='albums'):
    """
    Make sure an image for the artist is present in ``folder``.

    Nothing happens when check_album_art_exists already finds the file.
    Otherwise a Spotify access token is requested with the module-level
    client credentials and fetch_album_art is called to download the image.

    Parameters:
    artist_name (str): The name of the artist.
    folder (str): The folder to check and save album art. Defaults to 'albums'.
    """
    if check_album_art_exists(artist_name, folder):
        return

    access_token = get_spotify_access_token(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)
    fetch_album_art(artist_name, access_token, folder)


<h3>Spotify API Integration</h3>
<strong>Get Spotify Access Token:</strong> Function to obtain Spotify access token using client credentials.

<strong>Get Song Details:</strong> Function to retrieve song details from Spotify API.

In [6]:
def get_spotify_access_token(client_id, client_secret):
    """
    Request a Spotify access token with the client-credentials grant.

    The client ID and secret are posted as form data to the Spotify accounts
    token endpoint and the 'access_token' field of the JSON reply is returned.
    A reply without that field results in a KeyError.

    Parameters:
    client_id (str): The Spotify client ID.
    client_secret (str): The Spotify client secret.

    Returns:
    str: The access token used for further API requests.
    """
    credentials = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    }
    token_response = requests.post('https://accounts.spotify.com/api/token', data=credentials)

    # The token is one field of the JSON body
    token_payload = token_response.json()
    return token_payload['access_token']

def _fetch_artist_genres(artist_ids, headers, batch_size=50, max_retries=5):
    """Return the genres of the given Spotify artist IDs, fetched in batches with retries."""
    genres = []

    for start in range(0, len(artist_ids), batch_size):
        batch_ids = artist_ids[start:start + batch_size]
        artist_url = f"https://api.spotify.com/v1/artists?ids={','.join(batch_ids)}"
        delay = 1

        for attempt in range(1, max_retries + 1):
            artist_response = requests.get(artist_url, headers=headers)

            if artist_response.status_code == 200:
                try:
                    artist_data = artist_response.json()['artists']
                except ValueError as e:
                    print(f"Error decoding JSON for batch {batch_ids}: {e}")
                else:
                    for artist in artist_data:
                        # Entries may be null for IDs the API does not know
                        if artist and 'genres' in artist:
                            genres.extend(artist['genres'])
                    break  # Batch done, no further attempts needed
            elif artist_response.status_code == 429:
                retry_after = int(artist_response.headers.get('Retry-After', delay))
                print(f"Rate limited. Retrying after {retry_after} seconds.")
                time.sleep(retry_after)
                continue  # The server told us how long to wait; skip the extra backoff
            else:
                print(f"Request failed with status code {artist_response.status_code}")

            if attempt < max_retries:
                time.sleep(delay)
                delay *= 2  # Exponential backoff
        else:
            # Only reached when no attempt ended with `break`
            print(f"Failed to fetch genres for batch {batch_ids} after {max_retries} attempts.")

    return genres


def get_song_details(artist_name, track_name, access_token):
    """
    Get song details from Spotify API using search query.

    This function sends a search request to the Spotify API using the given
    artist name and track name and takes the first matching track. It
    collects album, release date, popularity, duration, track number, album
    artwork, external URL, artists involved, and the sorted, de-duplicated
    genres of those artists.

    The artist and track names are placed in the query as plain text:
    ``requests`` URL-encodes everything passed through ``params``, so
    encoding them beforehand would send doubly-encoded text to the API.

    Parameters:
    artist_name (str): The name of the artist.
    track_name (str): The name of the track.
    access_token (str): The Spotify API access token.

    Returns:
    dict: A dictionary containing detailed information about the song.
          None if no track is found or the search reply cannot be parsed.
    """
    search_url = 'https://api.spotify.com/v1/search'
    headers = {
        'Authorization': f'Bearer {access_token}'
    }
    params = {
        'q': f'artist:{artist_name} track:{track_name}',
        'type': 'track',
        'limit': 1
    }

    # Send request to Spotify API to search for the track
    response = requests.get(search_url, headers=headers, params=params)
    try:
        response_data = response.json()
    except ValueError:
        print(f"Search request failed with status code {response.status_code}")
        return None

    if not ('tracks' in response_data and response_data['tracks']['items']):
        print("No tracks found for the given query.")
        return None

    track_info = response_data['tracks']['items'][0]
    album_info = track_info['album']

    # Genres live on the artist objects, not on the track
    artist_ids = [artist['id'] for artist in track_info['artists']]
    artist_genres = sorted(set(_fetch_artist_genres(artist_ids, headers)))

    # Album images and the external URL are optional in the reply
    album_images = album_info.get('images')
    album_artwork = album_images[0]['url'] if album_images else None
    external_urls = track_info.get('external_urls', {}).get('spotify')

    song_details = {
        'spotify_id': track_info['id'],
        'album': album_info['name'],
        'release_date': album_info['release_date'],
        'popularity': track_info['popularity'],
        'duration_ms': track_info['duration_ms'],
        'track_number': track_info['track_number'],
        'album_artwork': album_artwork,
        'external_urls': external_urls,
        'artists_involved': [artist['name'] for artist in track_info['artists']],
        'genres': artist_genres
    }

    print("Song details:", song_details)
    return song_details

<h3>Multi-Threading for Data Processing</h3>
<strong>Worker Thread:</strong> Function for worker threads to process each song in the queue.

<strong>Update Unique Songs:</strong> Function to update the unique songs table with Spotify information using threading.

In [7]:
def worker_thread(queue, unique_songs, unique_songs_file, access_token, export_interval, lock, start_time):
    """
    Worker function to process each song in the queue.

    Items are (index, row) pairs. Rows that already have a 'spotify_id' are
    skipped; for the others the song details are fetched from the Spotify
    API and written into ``unique_songs`` at that index. Whenever
    ``index + 1`` is a multiple of ``export_interval`` the DataFrame is
    exported to the CSV file.

    Items are taken with a non-blocking get: checking ``empty()`` and then
    calling a blocking ``get()`` lets another worker take the last item in
    between, which would leave this worker waiting forever. ``task_done`` is
    called for every item taken, even if processing it raises.

    Parameters:
    queue (Queue): The queue containing songs to be processed.
    unique_songs (DataFrame): The DataFrame of unique songs.
    unique_songs_file (str): The file path for the unique songs CSV.
    access_token (str): The Spotify API access token.
    export_interval (int): The interval at which the DataFrame is exported to the CSV file.
    lock (Lock): The lock to ensure thread-safe operations.
    start_time (float): The start time of the processing.

    Returns:
    None
    """
    # Imported locally because the `queue` parameter hides the module name here
    from queue import Empty

    detail_fields = (
        'spotify_id', 'album', 'release_date', 'popularity', 'duration_ms',
        'track_number', 'album_artwork', 'external_urls', 'artists_involved', 'genres',
    )

    while True:
        try:
            index, row = queue.get_nowait()
        except Empty:
            return  # Nothing left to do

        try:
            if pd.notna(row['spotify_id']):
                continue  # Already enriched on an earlier run

            song_details = get_song_details(row['artistName'], row['trackName'], access_token)

            if song_details:
                with lock:
                    for field in detail_fields:
                        unique_songs.at[index, field] = song_details[field]

            if (index + 1) % export_interval == 0:
                with lock:
                    print(f"Exporting data at index {index}. Elapsed time: {time.time() - start_time:.2f} seconds.")
                    unique_songs.to_csv(unique_songs_file, index=False)
        finally:
            queue.task_done()

def update_unique_songs(unique_songs_file='unique_songs.csv', export_interval=50, num_threads=10):
    """
    Main function to update unique songs table with Spotify info using threading.

    This function loads the unique songs data, makes sure every column the
    workers write to exists, and fills in Spotify information for rows
    without a 'spotify_id' using several worker threads. The table is
    exported periodically while running; at the end rows with a duplicate
    'external_urls' value are dropped and the table is written once more.

    Parameters:
    unique_songs_file (str, optional): The file path for the unique songs CSV. Defaults to 'unique_songs.csv'.
    export_interval (int, optional): The interval at which the DataFrame is exported to the CSV file. Defaults to 50.
    num_threads (int, optional): Number of worker threads to start (at least one is always started). Defaults to 10.

    Returns:
    None
    """
    # Load unique songs data from CSV file
    print(f"Loading unique songs from {unique_songs_file}")
    unique_songs = pd.read_csv(unique_songs_file)
    print(f"Loaded {len(unique_songs)} unique songs")

    # Columns filled in by the workers; the names must match the keys returned
    # by get_song_details (hence 'genres', not 'genre').
    columns = ['spotify_id', 'album', 'release_date', 'popularity', 'duration_ms', 'track_number', 'album_artwork', 'external_urls', 'artists_involved', 'genres']
    for column in columns:
        if column not in unique_songs.columns:
            unique_songs[column] = None

    # A column that is entirely empty in the CSV is read back as float64;
    # store the text/list-valued ones as objects so strings and lists can be
    # assigned cell by cell.
    object_columns = ['spotify_id', 'album', 'release_date', 'album_artwork', 'external_urls', 'artists_involved', 'genres']
    for column in object_columns:
        unique_songs[column] = unique_songs[column].astype(object)

    # Get Spotify access token
    access_token = get_spotify_access_token(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)

    # Create a queue and add songs to be processed
    q = queue.Queue()
    for index, row in unique_songs.iterrows():
        q.put((index, row))

    # Create a lock for thread-safe operations
    lock = threading.Lock()
    start_time = time.time()
    threads = []
    for _ in range(max(1, num_threads)):
        thread = threading.Thread(target=worker_thread, args=(q, unique_songs, unique_songs_file, access_token, export_interval, lock, start_time))
        thread.start()
        threads.append(thread)

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    # Remove duplicates based on 'external_urls'
    unique_songs = drop_duplicates_by_external_urls(unique_songs)

    # Final export
    print(f"Final export. Total time taken: {time.time() - start_time:.2f} seconds.")
    unique_songs.to_csv(unique_songs_file, index=False)
    print(f"Unique songs table updated with Spotify info and saved to {unique_songs_file}.")




<h3>Data Deduplication and Filling</h3>
<strong>Drop Duplicates:</strong> Function to drop duplicate songs based on external URLs.

<strong>Fill Song Info:</strong> Function to fill in song information from the unique songs database.

<strong>Read Processed Data:</strong> Function to read processed listening data.

<strong>Export Filled Data:</strong> Function to export filled listening data to a CSV file.

In [8]:
def drop_duplicates_by_external_urls(data):
    """
    Remove rows that repeat an 'external_urls' value, keeping rows without one.

    Among rows whose 'external_urls' is present, only the first row for each
    value is kept. Rows where 'external_urls' is missing are never treated
    as duplicates; they are placed after the de-duplicated rows and the
    result gets a fresh 0..n-1 index.

    Parameters:
        data (pd.DataFrame): DataFrame containing the song data with 'external_urls' column.

    Returns:
        pd.DataFrame: DataFrame with duplicates removed based on 'external_urls'.
    """
    has_url = data['external_urls'].notna()

    with_url = data.loc[has_url].drop_duplicates(subset=['external_urls'])
    without_url = data.loc[~has_url]

    return pd.concat([with_url, without_url], ignore_index=True)

def fill_song_info(listening_data, unique_songs):
    """
    Attach unique-songs information to each listening record.

    Records whose 'artistName' is 'unknown' or 'unknown artist' (compared
    case-insensitively) are dropped. The remaining records are left-joined
    with ``unique_songs`` on 'artistName' and 'trackName', so records without
    a match are kept with empty song information.

    Parameters:
    listening_data (pandas.DataFrame): DataFrame containing the user's listening data.
    unique_songs (pandas.DataFrame): DataFrame containing the unique songs database.

    Returns:
    pandas.DataFrame: The listening records with the song information columns added.
    """
    placeholder_names = ['unknown', 'unknown artist']
    is_placeholder = listening_data['artistName'].str.lower().isin(placeholder_names)
    known_artists_only = listening_data[~is_placeholder]

    return known_artists_only.merge(unique_songs, on=['artistName', 'trackName'], how='left')

def read_processed_data(user_id):
    """
    Load '<user_id>_listening_data.csv' into a DataFrame.

    The file is looked up relative to the current working directory.

    Parameters:
    user_id (str): The user's ID.

    Returns:
    pandas.DataFrame: DataFrame containing the user's processed listening data.
    """
    source_csv = f'{user_id}_listening_data.csv'  # Example file path, adjust as needed
    return pd.read_csv(source_csv)

def export_filled_data(filled_data, user_id):
    """
    Write the filled listening data to '<user_id>_listening_data.csv'.

    The file is created relative to the current working directory and the
    DataFrame index is not written.

    Parameters:
    filled_data (pandas.DataFrame): The DataFrame containing the filled listening data.
    user_id (str): The user's ID.

    Returns:
    None
    """
    destination = f'{user_id}_listening_data.csv'
    filled_data.to_csv(path_or_buf=destination, index=False)
    print(f"Filled listening data exported to {destination}")

<h1>Data Analysis &amp; Visualization</h1>
<h3>Listening Time Analysis</h3>

In [9]:
def total_listening_time_per_user(df):
    """
    Sum the listening time of every user, expressed in hours.

    'msPlayed' is added up per 'user_id' and the totals are divided by the
    number of milliseconds in an hour.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'user_id' and 'msPlayed' columns.

    Returns:
    pandas.Series: Total listening time in hours, indexed by user ID.
    """
    ms_per_hour = 1000 * 60 * 60
    played_ms_by_user = df.groupby('user_id')['msPlayed'].sum()
    return played_ms_by_user / ms_per_hour


def biggest_listening_date(df):
    """
    Find the calendar date with the most listening time.

    'endTime' is parsed to datetimes, 'msPlayed' is summed per calendar
    date, and the date with the largest sum is reported together with that
    sum converted to minutes. The input DataFrame is not modified.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'endTime' and 'msPlayed' columns.

    Returns:
    tuple: (date with the most listening time, minutes listened on that date).
    """
    ms_per_minute = 1000 * 60

    play_dates = pd.to_datetime(df['endTime']).dt.date
    played_ms_by_date = df.groupby(play_dates)['msPlayed'].sum()

    top_date = played_ms_by_date.idxmax()
    top_minutes = played_ms_by_date.max() / ms_per_minute

    return top_date, top_minutes


In [10]:
def calculate_unique_counts(df):
    """
    Count the distinct songs, artists and albums in the listening data.

    Songs are counted by distinct 'trackName', albums by distinct 'album'.
    Artists are counted from the 'standardized_artists' lists produced by
    expand_artists_involved, so every credited artist is included.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data.

    Returns:
    dict: Counts under the keys 'unique_songs', 'unique_artists' and 'unique_albums'.
    """
    print("Calculating unique counts")
    working = df.copy()  # Work on a copy rather than the caller's frame

    song_total = working['trackName'].nunique()

    # One entry per credited artist, then count the distinct names
    expanded = expand_artists_involved(working)
    artist_total = expanded['standardized_artists'].explode().nunique()

    album_total = expanded['album'].nunique()

    return dict(
        unique_songs=song_total,
        unique_artists=artist_total,
        unique_albums=album_total,
    )


<h3>Artist Analysis</h3>

In [11]:
def top_artists_by_time(df, top_n=10):
    """
    Rank artists by their total listening time.

    Each row is repeated once per entry of its 'standardized_artists' list,
    'msPlayed' is summed per standardized artist, and the ``top_n`` largest
    totals are kept. The totals are returned as summed 'msPlayed' values
    (no unit conversion is applied). The index is relabelled with an
    'artistName' taken from a row crediting that artist (the last such row).

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'standardized_artists', 'artistName' and 'msPlayed' columns.
    top_n (int, optional): Number of top artists to return. Defaults to 10.

    Returns:
    pandas.Series: Total 'msPlayed' per artist, sorted in descending order.
    """
    per_artist_rows = df.explode('standardized_artists')

    # standardized name -> display name (later rows overwrite earlier ones)
    display_names = per_artist_rows.set_index('standardized_artists')['artistName'].to_dict()

    played_ms = per_artist_rows.groupby('standardized_artists')['msPlayed'].sum()
    leaders = played_ms.sort_values(ascending=False).head(top_n)

    leaders.index = leaders.index.map(display_names)
    return leaders


def top_artists_by_count(df, top_n=10):
    """
    Rank artists by how many listening records credit them.

    Each row is repeated once per entry of its 'standardized_artists' list
    and the occurrences of every standardized artist are counted. The
    ``top_n`` most frequent ones are returned, with the index relabelled to
    the 'artistName' that appears most often on the rows crediting that artist.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'standardized_artists' and 'artistName' columns.
    top_n (int, optional): Number of top artists to return. Defaults to 10.

    Returns:
    pandas.Series: Number of records per artist, sorted in descending order.
    """
    print(f"Calculating top {top_n} artists by count")
    per_artist_rows = df.explode('standardized_artists')

    # standardized name -> most frequent display name
    names_by_artist = per_artist_rows.groupby('standardized_artists')['artistName']
    display_names = names_by_artist.agg(lambda names: names.value_counts().idxmax()).to_dict()

    occurrences = per_artist_rows['standardized_artists'].value_counts()
    leaders = occurrences.head(top_n)

    leaders.index = leaders.index.map(display_names)
    return leaders



def top_artists_by_weighted_time(df, top_n=10):
    """
    Rank artists by listening time split between the artists of each track.

    The frame is exploded on 'standardized_artists' and every resulting row
    contributes ``min(percentage_listened / 100, 1) * duration_ms`` of
    listened time, distributed over the row's 'artists_involved':

    * when the lowercased 'artistName' is contained in the row's standardized
      artist, half goes to that main artist and the other half is shared
      equally by the remaining involved artists;
    * otherwise the time is shared equally by all involved artists.

    Rows with a missing 'percentage_listened' or 'duration_ms', or whose
    standardized artist is empty or not a string, are skipped.

    NOTE(review): the main-artist check is a substring test on the exploded
    artist name, and a track contributes once per exploded row -- confirm
    both are intended.

    Parameters:
    df (pd.DataFrame): DataFrame containing artist data.
    top_n (int): Number of top artists to return. Default is 10.

    Returns:
    pd.DataFrame: DataFrame with a 'weighted_time' column for the top artists.
    """
    print(f"Calculating top {top_n} artists by weighted listening time")
    per_artist_rows = df.explode('standardized_artists')

    # standardized name -> most frequent display name
    names_by_artist = per_artist_rows.groupby('standardized_artists')['artistName']
    display_names = names_by_artist.agg(lambda names: names.value_counts().idxmax()).to_dict()

    weighted_times = {}

    for _, row in per_artist_rows.iterrows():
        main_artist = row['artistName'].lower()
        credited_artist = row['standardized_artists']
        listened_share = row['percentage_listened']
        track_length_ms = row['duration_ms']

        # Rows without the data needed for the calculation contribute nothing
        if pd.isna(listened_share) or pd.isna(track_length_ms):
            continue
        if not credited_artist or not isinstance(credited_artist, str):
            continue

        # Percentages above 100 are capped at a full listen
        listened_share = min(listened_share / 100, 1)
        listened_time = listened_share * track_length_ms
        involved = row['artists_involved']

        if main_artist in credited_artist:
            main_portion = 0.5 * listened_time
            if len(involved) > 1:
                other_portion = 0.5 * listened_time / (len(involved) - 1)
            else:
                other_portion = 0

            weighted_times[main_artist] = weighted_times.get(main_artist, 0) + main_portion

            for artist in involved:
                artist_key = artist.lower()
                if artist_key != main_artist:
                    weighted_times[artist_key] = weighted_times.get(artist_key, 0) + other_portion
        else:
            even_portion = listened_time / len(involved)
            for artist in involved:
                artist_key = artist.lower()
                weighted_times[artist_key] = weighted_times.get(artist_key, 0) + even_portion

    # A DataFrame makes sorting and selecting the leaders straightforward
    totals = pd.DataFrame.from_dict(weighted_times, orient='index', columns=['weighted_time'])
    leaders = totals.sort_values(by='weighted_time', ascending=False).head(top_n)

    leaders.index = leaders.index.map(display_names)
    return leaders

def top_artists_by_weighted_count(df, top_n=10):
    """
    Calculate the top artists based on weighted listen counts.

    The DataFrame is exploded on 'standardized_artists', so a track with several
    standardized artists is visited once per artist. On each visit the fraction of
    the track that was listened to (capped at 1) is shared out:

    * if the lowercased 'artistName' is contained in the current standardized
      artist, the main artist receives half and the other entries of
      'artists_involved' share the remaining half equally;
    * otherwise every entry of 'artists_involved' receives an equal share.

    Artists are accumulated under their lowercased name. The returned index shows
    the most frequently seen original spelling of each artist, so a featured
    artist is never relabelled with the main artist's name and no label is NaN.

    Parameters:
    df (pd.DataFrame): DataFrame with 'artistName', 'standardized_artists',
        'artists_involved' and 'percentage_listened' columns.
    top_n (int): Number of top artists to return. Default is 10.

    Returns:
    pd.DataFrame: DataFrame with a single 'weighted_listens' column, indexed by
        artist name and sorted in descending order.
    """
    print(f"Calculating top {top_n} artists by weighted listen counts")
    df_expanded = df.copy().explode('standardized_artists')

    weighted_listens = {}
    spellings = {}  # lowercased artist -> Counter of the original spellings seen

    def credit(name, weight):
        key = name.lower()
        weighted_listens[key] = weighted_listens.get(key, 0) + weight
        spellings.setdefault(key, Counter())[name] += 1

    for _, row in df_expanded.iterrows():
        main_name = row['artistName']
        main_key = main_name.lower()
        current_artist = row['standardized_artists']
        percentage_listened = row['percentage_listened']

        # Skip plays without a usable percentage or standardized artist
        if pd.isna(percentage_listened):
            continue
        if not current_artist or not isinstance(current_artist, str):
            continue

        # Snap the percentage_listened to a maximum of 1
        fraction = min(percentage_listened / 100, 1)
        involved = row['artists_involved']

        if main_key in current_artist:
            credit(main_name, 0.5 * fraction)
            other_artists_weight = 0.5 * fraction / (len(involved) - 1) if len(involved) > 1 else 0
            for artist in involved:
                if artist.lower() != main_key:
                    credit(artist, other_artists_weight)
        elif len(involved) > 0:
            # Guarded so an empty artist list cannot trigger a division by zero
            equal_weight = fraction / len(involved)
            for artist in involved:
                credit(artist, equal_weight)

    # Convert to a DataFrame for easy sorting and selection
    weighted_listens_df = pd.DataFrame.from_dict(weighted_listens, orient='index', columns=['weighted_listens'])
    top_artists_weighted_listens = weighted_listens_df.sort_values(by='weighted_listens', ascending=False).head(top_n)

    # Show each artist under the spelling that was seen most often
    top_artists_weighted_listens.index = top_artists_weighted_listens.index.map(
        lambda key: spellings[key].most_common(1)[0][0]
    )

    return top_artists_weighted_listens



In [76]:
def top_artists_by_genre(df, genre, top_n=5):
    """
    Identify top artists within a specific genre.

    Keeps the plays whose 'genres' entry contains the requested genre, converts
    'msPlayed' to minutes, sums the minutes per artist and returns the artists
    with the largest totals.

    The input DataFrame is left untouched. 'genres' values may be string-encoded
    lists (e.g. "['rock', 'pop']") or already-parsed lists/tuples; missing or
    unparseable values are treated as having no genres.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'genres', 'artistName', and 'msPlayed' columns.
    genre (str): The genre to filter by.
    top_n (int, optional): Number of top artists to return for the specified genre. Default is 5.

    Returns:
    list: List of the top artists within the specified genre based on listening time.
    """
    print(f"analyzing top artists for genre: {genre}")

    def parse_genres(genres):
        # Already-parsed values must pass through: literal_eval rejects lists
        if isinstance(genres, (list, tuple)):
            return list(genres)
        try:
            parsed = ast.literal_eval(genres)
        except (ValueError, SyntaxError):
            return []
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        return [parsed] if isinstance(parsed, str) else []

    # Build the mask from a parsed copy so the caller's 'genres' column is never rewritten
    has_genre = [genre in parse_genres(value) for value in df['genres']]
    df_genre = df.loc[has_genre, ['artistName', 'msPlayed']].copy()

    # Convert msPlayed to minutes
    df_genre['minutesPlayed'] = df_genre['msPlayed'] / (1000 * 60)

    top_artists = df_genre.groupby('artistName')['minutesPlayed'].sum().nlargest(top_n).index.tolist()
    return top_artists


def artist_attention_span(df):
    """
    Measure, per artist, how much of each track is listened to and for how long.

    'percentage_listened' is capped at 100 and rows missing 'duration_ms' or
    'percentage_listened' are dropped. Plays are grouped by the standardized
    artist (list values are joined with ', '), and each group is labelled with
    the 'artistName' that occurs most often in it.

    Parameters:
    df (pandas.DataFrame): Listening data with 'artistName', 'standardized_artists', 'duration_ms', and 'percentage_listened' columns.

    Returns:
    tuple: (Series of the mean capped percentage listened per artist, sorted in
        descending order; Series of the total listened time per artist in minutes).
    """
    print("Calculating music taste attention span for artists")

    def cap_at_100(pct):
        return min(pct, 100)

    def as_label(artists):
        if isinstance(artists, list):
            return ', '.join(artists)
        return artists

    def most_frequent(names):
        return names.value_counts().idxmax()

    # Work on a copy so the caller's frame is left alone
    plays = df.copy()
    plays['percentage_listened'] = plays['percentage_listened'].apply(cap_at_100)
    plays = plays.dropna(subset=['duration_ms', 'percentage_listened'])

    plays['standardized_artists_str'] = plays['standardized_artists'].apply(as_label)
    plays.loc[:, 'listened_time'] = plays['duration_ms'] * (plays['percentage_listened'] / 100)

    per_artist = plays.groupby('standardized_artists_str')

    # Standardized artist -> most common original artist name
    display_names = per_artist['artistName'].agg(most_frequent).to_dict()

    # Average share of each track that was played, highest first
    artist_span = per_artist['percentage_listened'].mean().sort_values(ascending=False)
    artist_span.index = artist_span.index.map(display_names)

    # Total listened time, converted from milliseconds to minutes
    artist_listened_time = per_artist['listened_time'].sum() / (1000 * 60)
    artist_listened_time.index = artist_listened_time.index.map(display_names)

    return artist_span, artist_listened_time

def artist_diversity_growth(df, distinguish_years=True):
    """
    Calculate the growth in artist diversity over a period of time.

    This function converts the 'endTime' column to datetime, expands the DataFrame to have one row per
    standardized artist of each track, counts the distinct artists per month, and returns the percentage
    change of that count from one month to the next (0 for the first month).

    Only 'endTime' and 'standardized_artists' are read; no 'artistName' column is required.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'endTime' and 'standardized_artists' columns.
    distinguish_years (bool, optional): Whether to distinguish between months of different years. Default is True.
        When False, plays are pooled by calendar month and the index holds month names.

    Returns:
    pandas.Series: Series containing the percentage growth in distinct artists per month.
    """
    print("Calculating artist diversity growth")
    df_copy = df.copy()  # Copy the DataFrame to avoid modifying the original
    df_copy['endTime'] = pd.to_datetime(df_copy['endTime'])
    df_expanded = df_copy.explode('standardized_artists')

    # Ensure standardized_artists is a string (nested lists are joined)
    df_expanded['standardized_artists_str'] = df_expanded['standardized_artists'].apply(
        lambda x: ', '.join(x) if isinstance(x, list) else x
    )

    if distinguish_years:
        df_expanded['year_month'] = df_expanded['endTime'].dt.to_period('M')
        artist_diversity = df_expanded.groupby('year_month')['standardized_artists_str'].nunique()
    else:
        df_expanded['month'] = df_expanded['endTime'].dt.month
        artist_diversity = df_expanded.groupby('month')['standardized_artists_str'].nunique()

    # The first month has no predecessor, so its NaN growth is reported as 0
    growth = artist_diversity.pct_change().fillna(0) * 100

    if not distinguish_years:
        month_names = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
        growth.index = growth.index.map(month_names)

    return growth


In [88]:
import pandas as pd
from itertools import combinations
from collections import Counter
import ast

def top_collaborating_artists(df, top_n=10):
    """
    Identify the top collaborating artists based on the 'artists_involved' column.

    Every unordered pair of artists credited on the same track counts as one
    collaboration. Pairs are matched on their normalized (stripped, lowercased)
    names, so ('A', 'B'), ('B', 'A') and ('a', 'b') are the same collaboration.
    Each pair is reported under the spelling and order in which it was first seen.
    Entries that are missing (NaN/None) or name fewer than two artists are ignored.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'artists_involved' column,
        holding lists of names or their string representation.
    top_n (int): Number of top collaborating artist pairs to return. Default is 10.

    Returns:
    pandas.DataFrame: DataFrame containing the top collaborating artist pairs ('artist_pair')
        and their collaboration counts ('collaboration_count'), most frequent first.
    """
    print("Identifying top collaborating artists")

    def normalize_name(name):
        """ Normalize the artist name to a standard format. """
        return name.strip().lower()

    pair_counts = Counter()
    display_pairs = {}  # normalized pair -> first original spelling seen

    for entry in df['artists_involved']:
        artists = ast.literal_eval(entry) if isinstance(entry, str) else entry
        try:
            if len(artists) < 2:
                continue
        except TypeError:
            # NaN / None: no artist list recorded for this play
            continue

        for pair in combinations(artists, 2):
            key = tuple(sorted(normalize_name(artist) for artist in pair))
            if key[0] == key[1]:
                # The same artist listed twice is not a collaboration
                continue
            pair_counts[key] += 1
            display_pairs.setdefault(key, tuple(pair))

    rows = [(display_pairs[key], count) for key, count in pair_counts.items()]
    top_collaborating_artists_df = (
        pd.DataFrame(rows, columns=['artist_pair', 'collaboration_count'])
        .sort_values(by='collaboration_count', ascending=False, kind='stable')
        .head(top_n)
    )

    return top_collaborating_artists_df


<h3>Music Taste and Habits Analysis</h3>

In [40]:
from ast import literal_eval

def top_weighted_artists_per_month(df, distinguish_years=True):
    """
    Analyze top 5 weighted artists per month, with an option to distinguish between years.

    This function converts the 'endTime' column to datetime, makes sure the main artist is part of each
    track's 'artists_involved', expands the DataFrame to have one row per artist involved in each track,
    and ranks the artists of every month by summed weighted listening time
    ('msPlayed' * 'percentage_listened' / 100).

    'artists_involved' may hold lists/tuples or their string representation; missing or unparseable
    values are treated as empty, so only the main artist is credited. Artists are reported under the
    names found in 'artists_involved'/'artistName' and the input DataFrame is not modified.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'endTime', 'artistName', 'artists_involved', 'msPlayed', and 'percentage_listened' columns.
    distinguish_years (bool): Whether to distinguish between months of different years.

    Returns:
    dict: Dictionary with months (and optionally years) as keys and lists of the top 5 artists for each month by weighted listening time.
    """
    print("Analyzing top weighted artists per month")
    df_copy = df.copy()  # Copy the DataFrame to avoid modifying the original
    df_copy['endTime'] = pd.to_datetime(df_copy['endTime'])

    def parse_artists(artists):
        # Already-parsed values must pass through: literal_eval rejects lists
        if isinstance(artists, (list, tuple)):
            return list(artists)
        try:
            parsed = literal_eval(artists)
        except (ValueError, SyntaxError):
            return []
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        return [parsed] if isinstance(parsed, str) else []

    df_copy['weighted_listening_time'] = df_copy['msPlayed'] * df_copy['percentage_listened'] / 100

    # Build fresh lists (never append in place) so the caller's lists are not mutated
    credited_artists = []
    for artists, main_artist in zip(df_copy['artists_involved'], df_copy['artistName']):
        involved = parse_artists(artists)
        if main_artist not in involved:
            involved = involved + [main_artist]
        credited_artists.append(involved)
    df_copy['artists_involved'] = credited_artists

    df_expanded = df_copy.explode('artists_involved')

    month_names = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}

    if distinguish_years:
        month_key = df_expanded['endTime'].dt.to_period('M')
    else:
        month_key = df_expanded['endTime'].dt.month

    def format_key(key):
        return key.strftime('%B %Y') if distinguish_years else month_names[key]

    formatted_output = {}
    for key in month_key.unique():
        monthly_df = df_expanded[month_key == key]
        top_artists = monthly_df.groupby('artists_involved')['weighted_listening_time'].sum().nlargest(5)
        formatted_output[format_key(key)] = top_artists.index.tolist()

    return formatted_output



def top_songs_by_plays_per_month(df, distinguish_years=True):
    """
    Rank the five most-played tracks of every month by raw play count.

    'endTime' is parsed as datetime and each row counts as one play of its
    'trackName'. Months are processed in order of first appearance in the data.

    Parameters:
    df (pandas.DataFrame): Listening data with 'endTime' and 'trackName' columns.
    distinguish_years (bool): If True, keys look like 'January 2023'; if False,
        the same calendar month of different years is pooled under e.g. 'January'.

    Returns:
    dict: Month label -> list of up to 5 track names, most played first.
    """
    print("Analyzing top songs by unweighted plays per month")
    plays = df.copy()  # keep the caller's frame untouched
    plays['endTime'] = pd.to_datetime(plays['endTime'])

    def five_most_played(rows):
        return rows['trackName'].value_counts().nlargest(5).index.tolist()

    if not distinguish_years:
        month_numbers = plays['endTime'].dt.month
        ranked = {
            month: five_most_played(plays[plays['endTime'].dt.month == month])
            for month in month_numbers.unique()
        }
        month_names = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
        return {month_names[month]: songs for month, songs in ranked.items()}

    plays['year_month'] = plays['endTime'].dt.to_period('M')
    ranked = {
        period: five_most_played(plays[plays['year_month'] == period])
        for period in plays['year_month'].unique()
    }
    return {period.strftime('%B %Y'): songs for period, songs in ranked.items()}

def top_songs_by_weighted_time_per_month(df, distinguish_years=True):
    """
    Rank the five tracks of every month with the largest weighted listening time.

    The weight of a play is 'msPlayed' * 'percentage_listened' / 100. Weights are
    summed per 'trackName' within each month; months are processed in order of
    first appearance in the data.

    Parameters:
    df (pandas.DataFrame): Listening data with 'endTime', 'trackName', 'msPlayed', and 'percentage_listened' columns.
    distinguish_years (bool): If True, keys look like 'January 2023'; if False,
        the same calendar month of different years is pooled under e.g. 'January'.

    Returns:
    dict: Month label -> list of up to 5 track names, heaviest first.
    """
    print("Analyzing top songs by weighted listening time per month")
    weighted = df.copy()  # keep the caller's frame untouched
    weighted['endTime'] = pd.to_datetime(weighted['endTime'])
    weighted['weighted_listening_time'] = weighted['msPlayed'] * weighted['percentage_listened'] / 100

    if distinguish_years:
        weighted['year_month'] = weighted['endTime'].dt.to_period('M')
        bucket = weighted['year_month']
    else:
        bucket = weighted['endTime'].dt.month

    ranked = {}
    for key in bucket.unique():
        totals = weighted[bucket == key].groupby('trackName')['weighted_listening_time'].sum()
        ranked[key] = totals.nlargest(5).index.tolist()

    if distinguish_years:
        return {period.strftime('%B %Y'): songs for period, songs in ranked.items()}

    month_names = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
    return {month_names[month]: songs for month, songs in ranked.items()}




def monthly_listening_patterns(df, distinguish_years=True):
    """
    Total the listening time of every month.

    'endTime' is parsed as datetime, 'msPlayed' is summed per month and the
    totals are converted from milliseconds to minutes.

    Parameters:
    df (pandas.DataFrame): Listening data with 'endTime' and 'msPlayed' columns.
    distinguish_years (bool): If True, the index holds labels like 'January 2023';
        if False, the same calendar month of different years is pooled and the
        index holds plain month names.

    Returns:
    pandas.Series: Minutes listened per month.
    """
    print("Calculating monthly listening patterns")
    history = df.copy()  # keep the caller's frame untouched
    history['endTime'] = pd.to_datetime(history['endTime'])

    ms_per_minute = 1000 * 60

    if distinguish_years:
        history['year_month'] = history['endTime'].dt.to_period('M')
        total_ms = history.groupby('year_month')['msPlayed'].sum()
        # Render each period as "Month Year"
        total_ms.index = total_ms.index.strftime('%B %Y')
        return total_ms / ms_per_minute

    history['month'] = history['endTime'].dt.month
    total_ms = history.groupby('month')['msPlayed'].sum()
    month_names = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
    total_ms.index = total_ms.index.map(month_names)
    return total_ms / ms_per_minute


def track_listening_duration_over_time(df):
    """
    Sum the listening time of every track for each calendar month.

    'endTime' is parsed as datetime and 'msPlayed' is summed per
    ('trackName', month) pair. The result is ordered chronologically by month.

    Parameters:
    df (pandas.DataFrame): Listening data with 'endTime', 'trackName', and 'msPlayed' columns.

    Returns:
    pandas.DataFrame: One row per track and month with the columns 'trackName',
        'month' (monthly period), 'msPlayed', 'listening_duration_minutes' and
        'month_str' (e.g. 'January 2023').
    """
    print("Calculating track listening duration over time")
    history = df.copy()  # keep the caller's frame untouched
    history['endTime'] = pd.to_datetime(history['endTime'])
    history['month'] = history['endTime'].dt.to_period('M')

    # Total milliseconds per track and month
    per_track_month = history.groupby(['trackName', 'month'])['msPlayed'].sum().reset_index()

    # Milliseconds -> minutes
    per_track_month = per_track_month.assign(
        listening_duration_minutes=per_track_month['msPlayed'] / (1000 * 60)
    )

    # Chronological order first, then attach the readable month label
    chronological = per_track_month.sort_values(by='month')
    chronological['month_str'] = chronological['month'].dt.strftime('%B %Y')

    return chronological


def common_listening_days(df):
    """
    Count how many plays fall on each day of the week.

    'endTime' is parsed as datetime and every row is attributed to the weekday
    name of its timestamp.

    Parameters:
    df (pandas.DataFrame): Listening data with an 'endTime' column.

    Returns:
    pandas.Series: Number of plays per weekday name, most frequent day first.
    """
    print("Determining the most common listening days and times")
    # pd.to_datetime returns a new Series, so the caller's frame is not modified
    timestamps = pd.to_datetime(df['endTime'])
    weekday_names = timestamps.dt.day_name()
    return weekday_names.value_counts()



def general_attention_span(df):
    """
    Compute the average share of a track that is played across all plays.

    'percentage_listened' is capped at 100, and plays missing either
    'duration_ms' or 'percentage_listened' are left out of the average.

    Parameters:
    df (pandas.DataFrame): Listening data with 'duration_ms' and 'percentage_listened' columns.

    Returns:
    float: Mean capped percentage listened over the valid plays.
    """
    print("Calculating general music taste attention span")

    def cap_at_100(pct):
        return min(pct, 100)

    # apply() builds a new Series, leaving the caller's frame unmodified
    capped = df['percentage_listened'].apply(cap_at_100)

    # A play is valid only when both its duration and its percentage are known
    is_valid = df['duration_ms'].notna() & capped.notna()

    return capped[is_valid].mean()


def listening_percentage_categories(df):
    """
    Categorize listening percentages into ranges and count occurrences.

    Percentages are capped at 100 and missing values are ignored. Each play is
    placed in one of four quarter-wide ranges whose lower bound is inclusive:

    * '0-24%'   -> 0 <= p < 25
    * '25-49%'  -> 25 <= p < 50
    * '50-74%'  -> 50 <= p < 75
    * '75-100%' -> 75 <= p <= 100

    so fractional values such as 24.5 land in the range their label announces.
    Negative percentages fall outside every range and are not counted.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'percentage_listened' column.

    Returns:
    dict: Dictionary containing the counts of occurrences in each range (all four labels are always present).
    """
    print("Categorizing listening percentages into ranges")

    # Cap at 100 on a derived Series; the input DataFrame is not modified
    percentages = df['percentage_listened'].dropna().clip(upper=100)

    # Left-closed bins; the open-ended last edge keeps exactly 100 in '75-100%'
    bins = [0, 25, 50, 75, float('inf')]
    labels = ['0-24%', '25-49%', '50-74%', '75-100%']

    listening_range = pd.cut(percentages, bins=bins, labels=labels, right=False)

    # Count the occurrences in each range, in label order
    range_counts = listening_range.value_counts().sort_index()

    return range_counts.to_dict()


def top_songs_by_listening_time(df, top_n=10, weighted=False):
    """
    Rank tracks by their accumulated listening time.

    Plays missing 'percentage_listened' or 'duration_ms' are skipped. With
    weighted=True each play contributes 'duration_ms' times the fraction
    listened (capped at 1); with weighted=False each play contributes the full
    'duration_ms'.

    Parameters:
    df (pandas.DataFrame): Listening data with 'trackName', 'percentage_listened', and 'duration_ms' columns.
    top_n (int): Number of top songs to return. Default is 10.
    weighted (bool): Whether to scale each play by the fraction listened.

    Returns:
    pandas.DataFrame: Track names as index with a 'listening_time' column
        (milliseconds), largest first.
    """
    print("Calculating top songs by listening time")

    time_per_track = {}

    for _, play in df.iterrows():
        pct = play['percentage_listened']
        full_duration = play['duration_ms']

        # Incomplete plays cannot be measured
        if pd.isna(pct) or pd.isna(full_duration):
            continue

        # Fraction of the track that was played, never more than the whole track
        fraction = min(pct / 100, 1)
        contribution = fraction * full_duration if weighted else full_duration

        title = play['trackName']
        time_per_track[title] = time_per_track.get(title, 0) + contribution

    # A DataFrame makes sorting and truncating straightforward
    ranking = pd.DataFrame.from_dict(time_per_track, orient='index', columns=['listening_time'])
    return ranking.sort_values(by='listening_time', ascending=False).head(top_n)


def top_songs_by_listen_totals(df, top_n=10, weighted=False):
    """
    Rank tracks by how often they were played.

    Plays missing 'percentage_listened' are skipped. With weighted=True each
    play counts as the fraction of the track listened (capped at 1); with
    weighted=False each play counts as 1.

    Parameters:
    df (pandas.DataFrame): Listening data with 'trackName' and 'percentage_listened' columns.
    top_n (int): Number of top songs to return. Default is 10.
    weighted (bool): Whether to count partial listens fractionally.

    Returns:
    pandas.DataFrame: Track names as index with a 'listen_totals' column, largest first.
    """
    print("Calculating top songs by listen totals")

    totals_per_track = {}

    for _, play in df.iterrows():
        pct = play['percentage_listened']

        # A play without a percentage is not counted at all
        if pd.isna(pct):
            continue

        # Fraction of the track that was played, never more than the whole track
        fraction = min(pct / 100, 1)
        increment = fraction if weighted else 1

        title = play['trackName']
        totals_per_track[title] = totals_per_track.get(title, 0) + increment

    # A DataFrame makes sorting and truncating straightforward
    ranking = pd.DataFrame.from_dict(totals_per_track, orient='index', columns=['listen_totals'])
    return ranking.sort_values(by='listen_totals', ascending=False).head(top_n)





<h3>Genre Analysis</h3>

In [30]:
def top_genres_per_month(df, distinguish_years=True):
    """
    Analyze top genres per month, with an option to distinguish between years.

    This function converts the 'endTime' column to datetime, expands the DataFrame to have one row per genre
    of each track, and ranks the genres of every month by summed listening time in minutes.

    'genres' values may be lists/tuples or their string representation (e.g. "['rock', 'pop']");
    missing or unparseable values contribute no genre. The input DataFrame is not modified.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'endTime', 'genres', and 'msPlayed' columns.
    distinguish_years (bool): Whether to distinguish between months of different years.

    Returns:
    dict: Dictionary mapping each month to the list of its top 5 genres. Keys look like '2023-01'
        when distinguish_years is True and like 'January' otherwise.
    """
    print("analyzing top genres per month")
    df_copy = df.copy()
    df_copy['endTime'] = pd.to_datetime(df_copy['endTime'])

    def parse_genres(genres):
        # Already-parsed values must pass through: literal_eval rejects lists
        if isinstance(genres, (list, tuple)):
            return list(genres)
        try:
            parsed = ast.literal_eval(genres)
        except (ValueError, SyntaxError):
            return []
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        return [parsed] if isinstance(parsed, str) else []

    df_copy['genres'] = [parse_genres(value) for value in df_copy['genres']]
    df_expanded = df_copy.explode('genres')

    df_expanded['minutesPlayed'] = df_expanded['msPlayed'] / (1000 * 60)

    month_names = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}

    if distinguish_years:
        month_key = df_expanded['endTime'].dt.to_period('M')
    else:
        month_key = df_expanded['endTime'].dt.month

    formatted_output = {}
    for key in month_key.unique():
        monthly_df = df_expanded[month_key == key]
        top_genres = monthly_df.groupby('genres')['minutesPlayed'].sum().nlargest(5)
        label = str(key) if distinguish_years else month_names[key]
        formatted_output[label] = top_genres.index.tolist()

    return formatted_output


def top_genres_for_year(df, top_n=5):
    """
    Rank genres by total listening time and return the leaders.

    Each row's 'genres' value is turned into a list, the frame is exploded to one
    row per (play, genre) pair, and 'msPlayed' is converted to minutes and summed
    per genre. A play tagged with several genres therefore counts its full
    duration towards each of them.

    Parameters:
    df (pandas.DataFrame): Listening data with 'genres' and 'msPlayed' columns.
        'genres' may hold list literals stored as strings (e.g. "['rock', 'pop']")
        or values that are already lists.
    top_n (int): Number of top genres to return. Default is 5.

    Returns:
    pandas.Series: Minutes played for the top_n genres, largest first, indexed by
        genre name (cast to str).
    """
    print("analyzing top genres for the year")

    def parse_genres(genres):
        """
        Turn one raw 'genres' cell into genre data.

        Args:
            genres (str or list): The raw cell value.

        Returns:
            list: The parsed literal for strings ([] when the string is not a
            valid literal); non-string values are passed through untouched.
        """
        if not isinstance(genres, str):
            return genres
        try:
            return ast.literal_eval(genres)
        except (ValueError, SyntaxError):
            return []

    # Work on a copy so the caller's DataFrame is never modified.
    working = df.copy()
    working['genres'] = working['genres'].replace({pd.NA: '[]'})
    working['genres'] = working['genres'].apply(parse_genres)

    # One row per (play, genre) pair.
    per_genre = working.explode('genres')
    per_genre['minutesPlayed'] = per_genre['msPlayed'] / (1000 * 60)

    minutes_by_genre = per_genre.groupby('genres')['minutesPlayed'].sum()
    genre_taste = minutes_by_genre.nlargest(top_n)

    # Callers format the index as text, so make sure every label is a str.
    genre_taste.index = genre_taste.index.astype(str)
    return genre_taste


def top_tracks_by_genre(df, genre, top_n=5):
    """
    Identify top tracks within a specific genre.

    Rows whose 'genres' value contains `genre` are selected, 'msPlayed' is
    converted to minutes, and the minutes are summed per 'trackName'.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'trackName', 'msPlayed', and 'genres' columns.
        'genres' may hold list literals stored as strings or actual lists. The frame is not modified.
    genre (str): The genre to filter by (exact match against each row's genre list).
    top_n (int): Number of top tracks to return for the specified genre. Default is 5.

    Returns:
    list of tuples: (trackName, minutes) pairs for the top tracks in the genre,
        longest listening time first. Empty when no row matches.
    """
    print(f"Analyzing top tracks for genre: {genre}")

    def parse_genres(genres):
        """
        Normalise one raw 'genres' cell to a list of genres.

        Args:
            genres (str or list): The genre data to parse.

        Returns:
            list: A list of genres; [] for missing, malformed or non-collection values.
        """
        if isinstance(genres, (list, tuple, set)):
            return list(genres)
        if not isinstance(genres, str):
            # NaN / None / other scalars carry no genre information.
            return []
        try:
            parsed = ast.literal_eval(genres)
        except (ValueError, SyntaxError):
            return []
        if isinstance(parsed, (list, tuple, set)):
            return list(parsed)
        if isinstance(parsed, str):
            # A bare quoted genre: wrap it so membership is an exact match,
            # not a substring test.
            return [parsed]
        # Numbers, dicts, etc. are not genre lists; `genre in <int>` would raise.
        return []

    # Explicit bool dtype keeps the mask a valid boolean indexer even when df is empty.
    in_genre = pd.Series(
        [genre in parse_genres(value) for value in df['genres']],
        index=df.index,
        dtype=bool,
    )

    # Select into a new frame and derive minutes as a separate Series, so nothing
    # is assigned onto a filtered slice (the source of SettingWithCopyWarning).
    df_genre = df.loc[in_genre, ['trackName', 'msPlayed']]
    minutes_played = df_genre['msPlayed'] / (1000 * 60)

    top_tracks = minutes_played.groupby(df_genre['trackName']).sum().nlargest(top_n)

    return list(top_tracks.items())



def top_tracks_by_top_genres(df, top_genres, top_n=5):
    """
    Build a formatted listing of the top tracks for each given genre.

    For every genre in `top_genres`, top_tracks_by_genre is called and its
    (track, minutes) pairs are rendered one per line under a bold heading.
    Lines are separated with "<br/>" and genre sections with "<br/><br/>".

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data.
    top_genres (list): List of top genres to analyze.
    top_n (int): Number of top tracks to return for each genre. Default is 5.

    Returns:
    str: Formatted string containing the top tracks for each genre
        (an empty string when `top_genres` is empty).
    """
    sections = []

    for genre in top_genres:
        # One "<track> - <minutes> min." line per track, joined with line breaks.
        track_lines = "<br/>".join(
            [
                f"{track} - {minutes:.2f} min."
                for track, minutes in top_tracks_by_genre(df, genre, top_n)
            ]
        )
        sections.append(f"<b>Top Tracks in {genre.title()}:</b><br/>{track_lines}")

    # A blank line (double break) separates consecutive genre sections.
    return "<br/><br/>".join(sections)


def genre_popularity_over_time(df):
    """
    Analyze genre popularity over time.

    This function converts the 'endTime' column to datetime, expands the DataFrame to have one row per genre involved in each track,
    groups by calendar date and genre, and sums the listening time ('msPlayed') in minutes for each genre.
    A play tagged with several genres counts its full duration towards each of them.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the listening data with 'endTime', 'genres', and 'msPlayed' columns.
        'genres' may hold list literals stored as strings or actual lists. The frame is not modified.

    Returns:
    pandas.DataFrame: One row per date and one column per genre, holding the summed
        listening time in minutes (0 where a genre was not played that day).
    """
    print("Analyzing genre popularity over time")

    def parse_genres(genres):
        """
        Turn one raw 'genres' cell into a list of genres.

        Args:
            genres (str or list): The genre data to parse.

        Returns:
            list: The genres; [] for missing values and unparsable strings.
        """
        if isinstance(genres, (list, tuple, set)):
            # Already parsed; keep the data instead of discarding it.
            return list(genres)
        if not isinstance(genres, str):
            # NaN / None / pd.NA: no genre information for this play.
            return []
        try:
            # Must be referenced through the `ast` module: only `import ast`
            # is available, a bare `literal_eval` raises NameError.
            return ast.literal_eval(genres)
        except (ValueError, SyntaxError):
            return []

    df_copy = df.copy()  # Copy the DataFrame to avoid modifying the original
    df_copy['endTime'] = pd.to_datetime(df_copy['endTime'])
    df_copy['genres'] = df_copy['genres'].apply(parse_genres)

    # One row per (play, genre) pair; plays without genres become NaN and are
    # dropped by the groupby below.
    df_expanded = df_copy.explode('genres')

    # Convert msPlayed to minutes
    df_expanded['minutesPlayed'] = df_expanded['msPlayed'] / (1000 * 60)

    genre_popularity = (
        df_expanded
        .groupby([df_expanded['endTime'].dt.date, 'genres'])['minutesPlayed']
        .sum()
        .unstack()
        .fillna(0)
    )

    return genre_popularity


def summarize_genre_popularity(genre_popularity_df, top_n=5):
    """
    Summarize genre popularity over time.

    The top genres are chosen by total minutes over all rows. When more than one
    row is present, the first and last rows are compared for each top genre and
    changes larger than 50% are listed.

    Parameters:
    genre_popularity_df (pandas.DataFrame): DataFrame containing the summed listening time in minutes
        for each genre over time (one row per period, one column per genre).
    top_n (int): Number of top genres to summarize.

    Returns:
    str: Summary of key insights into genre popularity, lines joined with "<br/>".
    """
    if genre_popularity_df.empty:
        return "No data available to summarize genre popularity."

    # Identify overall top genres
    total_listening = genre_popularity_df.sum()
    top_genres = total_listening.nlargest(top_n).index
    # str() guards against non-string column labels, which str.join rejects.
    summary = [f"Top {top_n} genres over the period: " + ", ".join(map(str, top_genres))]

    # A comparison needs at least a first and a last row.
    if len(genre_popularity_df) <= 1:
        summary.append("Insufficient data to determine significant changes in genre popularity.")
        return "<br/>".join(summary)

    most_recent = genre_popularity_df.iloc[-1][top_genres]
    initial = genre_popularity_df.iloc[0][top_genres]

    change_lines = []
    for genre in top_genres:
        start = initial[genre]
        end = most_recent[genre]
        label = str(genre).title()

        if start == 0:
            # A percentage change from zero is undefined (it would come out as
            # inf or NaN); describe the jump in absolute terms instead.
            if end > 0:
                change_lines.append(f"{label}: increased from 0.00 to {end:.2f} min.")
            continue

        change = (end - start) / start * 100
        # NaN compares False here, so genres with missing data are skipped.
        if abs(change) > 50:
            direction = "increased" if change > 0 else "decreased"
            change_lines.append(f"{label}: {direction} by {abs(change):.2f}%")

    if change_lines:
        summary.append("Significant changes in genre popularity:")
        summary.extend(change_lines)
    else:
        summary.append("No significant changes in genre popularity.")

    return "<br/>".join(summary)



def genre_diversity_per_month(df, distinguish_years=True):
    """
    Count how many distinct genres were listened to in each month.

    'endTime' is converted to datetime, each row's 'genres' literal is parsed and
    exploded to one row per genre, and the distinct genres are counted per
    month bucket.

    Parameters:
    df (pandas.DataFrame): Listening data with 'endTime' and 'genres' columns.
    distinguish_years (bool): When True, buckets are year-month periods
        (e.g. 2024-01). When False, the same calendar month of different years
        is pooled and the index uses month names.

    Returns:
    pandas.Series: Number of distinct genres per month bucket.
    """
    print("Analyzing genre diversity per month")

    def parse_genres(genres):
        """
        Evaluate one raw 'genres' cell as a Python literal.

        Args:
            genres (str or list): The raw cell value.

        Returns:
            list: The parsed genres, or [] when the value cannot be parsed.
        """
        try:
            parsed = ast.literal_eval(genres)
        except (ValueError, SyntaxError):
            parsed = []
        return parsed

    # Work on a copy so the caller's DataFrame is left untouched.
    listens = df.copy()
    listens['endTime'] = pd.to_datetime(listens['endTime'])
    listens['genres'] = listens['genres'].replace({pd.NA: '[]'})
    listens['genres'] = listens['genres'].apply(parse_genres)

    # One row per (play, genre) pair.
    per_genre = listens.explode('genres')

    if distinguish_years:
        per_genre['year_month'] = per_genre['endTime'].dt.to_period('M')
        return per_genre.groupby('year_month')['genres'].nunique()

    per_genre['month'] = per_genre['endTime'].dt.month
    genre_diversity = per_genre.groupby('month')['genres'].nunique()

    # Swap the month numbers (1-12) in the index for their English names.
    month_labels = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ]
    month_names = dict(enumerate(month_labels, start=1))
    genre_diversity.index = genre_diversity.index.map(month_names)

    return genre_diversity

def genre_diversity_growth(df, distinguish_years=True):
    """
    Compute the month-over-month percentage change in genre diversity.

    'endTime' is converted to datetime, each row's 'genres' literal is parsed and
    exploded to one row per genre, distinct genres are counted per month bucket,
    and each bucket is compared with the one before it.

    Parameters:
    df (pandas.DataFrame): Listening data with 'endTime' and 'genres' columns.
    distinguish_years (bool): When True, buckets are year-month periods. When
        False, the same calendar month of different years is pooled, buckets are
        compared in calendar order, and the index uses month names.

    Returns:
    pandas.Series: Percentage change in the number of distinct genres relative to
        the previous bucket. The first bucket has no predecessor and is reported
        as 0.
    """
    print("Calculating genre diversity growth")

    def parse_genres(genres):
        """
        Evaluate one raw 'genres' cell as a Python literal.

        Args:
            genres (str or list): The raw cell value.

        Returns:
            list: The parsed genres, or [] when the value cannot be parsed.
        """
        try:
            parsed = ast.literal_eval(genres)
        except (ValueError, SyntaxError):
            parsed = []
        return parsed

    # Work on a copy so the caller's DataFrame is left untouched.
    listens = df.copy()
    listens['endTime'] = pd.to_datetime(listens['endTime'])
    listens['genres'] = listens['genres'].replace({pd.NA: '[]'})
    listens['genres'] = listens['genres'].apply(parse_genres)

    # One row per (play, genre) pair.
    per_genre = listens.explode('genres')

    # Pick the bucketing column according to the requested granularity.
    if distinguish_years:
        bucket = 'year_month'
        per_genre[bucket] = per_genre['endTime'].dt.to_period('M')
    else:
        bucket = 'month'
        per_genre[bucket] = per_genre['endTime'].dt.month

    distinct_genres = per_genre.groupby(bucket)['genres'].nunique()

    if not distinguish_years:
        # Swap the month numbers (1-12) in the index for their English names.
        month_labels = [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ]
        month_names = dict(enumerate(month_labels, start=1))
        distinct_genres.index = distinct_genres.index.map(month_names)

    # pct_change yields a fraction; NaN (first bucket) becomes 0, then scale to %.
    growth_percent = distinct_genres.pct_change().fillna(0) * 100

    return growth_percent


<h3>Visualization</h3>

In [15]:
def plot_genre_popularity_heatmap(genre_popularity, top_n=20):
    """
    Draw a heatmap of listening time for the most-played genres over time.

    Genres are ranked by their column totals; the top_n are kept and drawn with
    genres on the y-axis and the frame's index (dates) on the x-axis.

    Parameters:
    genre_popularity (pandas.DataFrame): DataFrame containing the summed listening time in minutes for each genre over time.
    top_n (int): Number of top genres to display in the heatmap. Default is 20.
    """
    # Rank genres by total minutes and keep the leaders.
    totals = genre_popularity.sum()
    leading_genres = totals.nlargest(top_n).index

    # Transpose so each genre is a heatmap row and each date a column.
    heatmap_data = genre_popularity[leading_genres].T

    plt.figure(figsize=(14, 10))
    ax = sns.heatmap(
        heatmap_data,
        cmap="YlGnBu",
        cbar_kws={'label': 'Listening Time (minutes)'},
    )
    ax.set_title('Top Genres Popularity Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Genre')
    plt.show()
    
def plot_top_genres_line(genre_popularity, top_n=5):
    """
    Draw one line per top genre showing its listening time over time.

    Genres are ranked by their column totals and the top_n are plotted against
    the frame's index.

    Parameters:
    genre_popularity (pandas.DataFrame): DataFrame containing the summed listening time in minutes for each genre over time.
    top_n (int): Number of top genres to display in the line plot. Default is 5.
    """
    totals = genre_popularity.sum()
    leading_genres = totals.nlargest(top_n).index

    plt.figure(figsize=(12, 8))
    ax = plt.gca()

    # The same x values (the frame's index) are shared by every genre line.
    dates = genre_popularity.index
    for genre in leading_genres:
        ax.plot(dates, genre_popularity[genre], label=genre)

    ax.set_title(f'Top {top_n} Genres Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Listening Time (minutes)')
    ax.legend(title='Genre')
    plt.show()


def genre_summary_statistics(genre_popularity):
    """
    Report the most and least listened-to genres with their total minutes.

    Parameters:
    genre_popularity (pandas.DataFrame): DataFrame containing the summed listening time in minutes for each genre over time.

    Returns:
    dict: The name and total listening time (minutes) of the most popular genre,
        followed by the same two entries for the least popular genre.
    """
    # Column totals: overall minutes per genre across every row.
    minutes_per_genre = genre_popularity.sum()
    top_genre = minutes_per_genre.idxmax()
    bottom_genre = minutes_per_genre.idxmin()

    summary = {}
    summary["Most Popular Genre"] = top_genre
    summary["Total Listening Time of Most Popular Genre (minutes)"] = minutes_per_genre[top_genre]
    summary["Least Popular Genre"] = bottom_genre
    summary["Total Listening Time of Least Popular Genre (minutes)"] = minutes_per_genre[bottom_genre]

    print("summary statistics for genre popularity generated")
    return summary

In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
import ast

def generate_report():
    """
    Show a small widget form that builds the Markdown report on demand.

    A text box collects the user ID and a "Generate Report" button runs
    create_md_report for whatever ID is in the box at the moment of the click.
    """
    # Build both widgets up front; the click handler below closes over the text box.
    user_id_input = widgets.Text(description="User ID:")
    generate_button = widgets.Button(description="Generate Report")

    def _build_report_on_click(b):
        # `b` is the argument supplied by the button's click event; it is unused.
        requested_id = user_id_input.value
        create_md_report(requested_id)
        print(f"Report generated for user ID: {requested_id}")

    generate_button.on_click(_build_report_on_click)

    # Render the input field followed by the button.
    display(user_id_input, generate_button)



def create_md_report(user_id='ezra'):
    """
    Creates a Markdown report for the given user ID with an introduction paragraph and analysis results.

    Reads ``{user_id}_listening_data.csv`` from the current working directory,
    runs the general, artist, song and genre analyses defined elsewhere in this
    notebook, formats their results as Markdown, and writes everything to
    ``{user_id}_spotify_report.md`` (an existing file is overwritten).

    Parameters:
    user_id (str): The user ID for which to generate the report. It selects the
        input CSV and the output file name, and is used as the key into the
        result of total_listening_time_per_user.

    Returns:
    None
    """
    # Set up the Markdown document
    file_name = f"{user_id}_spotify_report.md"
    document_title = "# Spotify Re-Wrapped 2024"
    intro_text = (
        "Spotify Wrapped for 2024 didn’t quite hit the mark, so I decided to take matters into my own hands. "
        "This project dives into my actual listening data to get a better picture of my music tastes. "
        "By analyzing various aspects of my Spotify history, I can uncover patterns, preferences, and trends that "
        "Spotify's summary might have missed. From listening times and favorite artists to genre distributions, "
        "this project aims to create a more accurate and personalized Spotify Re-Wrapped experience."
    )

    # Read the user's listening data from a CSV file
    file_path = f"{user_id}_listening_data.csv"
    df = pd.read_csv(file_path, encoding='utf-8')

    # Expand and standardize artists involved
    df = expand_artists_involved(df)

    # Snapshot taken before any analysis runs; the monthly genre ranking is
    # computed from it. NOTE(review): presumably this shields that ranking from
    # in-place changes the helpers may make to df - confirm and drop if unneeded.
    df_og = df.copy()

    ######## Perform analysis  ##########
    # general
    total_time_hours = total_listening_time_per_user(df)
    biggest_date, total_minutes_on_biggest_date = biggest_listening_date(df)
    unique_counts = calculate_unique_counts(df)
    common_days = common_listening_days(df)
    monthly_patterns = monthly_listening_patterns(df)
    general_span = general_attention_span(df)
    listening_ranges = listening_percentage_categories(df)

    # artist
    top_artists_time = top_artists_by_time(df)
    top_artists_count = top_artists_by_count(df)
    top_artists_weighted_time = top_artists_by_weighted_time(df)
    top_artists_weighted_count = top_artists_by_weighted_count(df)
    top_weighted_artists_month = top_weighted_artists_per_month(df)

    # Identify top collaborating artists
    top_collaborating_artists_df = top_collaborating_artists(df, top_n=10)
    formatted_top_collaborating_artists = "\n".join([f"- **{pair[0]} & {pair[1]}**: {count} collaborations" for pair, count in top_collaborating_artists_df[['artist_pair', 'collaboration_count']].values])

    # Only the per-artist span is reported; the second return value is not used.
    artist_span, _ = artist_attention_span(df)

    songs_by_plays_month = top_songs_by_plays_per_month(df)
    weighted_songs_month = top_songs_by_weighted_time_per_month(df)

    artist_diversity_growth_result = artist_diversity_growth(df)
    # Calculate genre diversity per month and growth
    genre_diversity_per_month_result = genre_diversity_per_month(df, distinguish_years=True)
    genre_diversity_growth_result = genre_diversity_growth(df, distinguish_years=True)

    # genre
    top_genres_year = top_genres_for_year(df)
    top_genres_year_list = top_genres_year.index.tolist()
    top_tracks_by_genres = top_tracks_by_top_genres(df, top_genres_year_list)

    # Computed exactly once, from the snapshot. A second, earlier pass over df
    # used to run here as well, but its result was overwritten before use.
    top_genres_month = top_genres_per_month(df_og, distinguish_years=True)

    # Calculate top songs
    top_songs_unweighted_time = top_songs_by_listening_time(df, top_n=10, weighted=False)
    top_songs_weighted_time = top_songs_by_listening_time(df, top_n=10, weighted=True)
    top_songs_unweighted_totals = top_songs_by_listen_totals(df, top_n=10, weighted=False)
    top_songs_weighted_totals = top_songs_by_listen_totals(df, top_n=10, weighted=True)

    ######## Format the messages for Markdown ########
    def format_artist_count(artist_counts):
        """Render per-artist weighted play counts as Markdown bullets (Series or first DataFrame column)."""
        formatted_counts = []
        if isinstance(artist_counts, pd.Series):
            for artist, count in artist_counts.items():
                formatted_counts.append(f"- **{artist}**: {float(count):.2f} weighted plays")
        elif isinstance(artist_counts, pd.DataFrame):
            for index, row in artist_counts.iterrows():
                artist = index
                count = row.iloc[0]
                formatted_counts.append(f"- **{artist}**: {float(count):.2f} weighted plays")
        return formatted_counts

    def format_artist_time(artist_times):
        """Render per-artist listening times (milliseconds) as Markdown bullets in minutes and hours."""
        formatted_times = []
        if isinstance(artist_times, pd.Series):
            for artist, ms in artist_times.items():
                minutes = ms / (1000 * 60)  # Convert from milliseconds to minutes
                hours = ms / (1000 * 60 * 60)  # Convert from milliseconds to hours
                formatted_times.append(f"- **{artist}**: {minutes:.2f} min. ({hours:.2f} hr.)")
        elif isinstance(artist_times, pd.DataFrame):
            for index, row in artist_times.iterrows():
                artist = index
                ms = row.iloc[0]
                minutes = ms / (1000 * 60)
                hours = ms / (1000 * 60 * 60)
                formatted_times.append(f"- **{artist}**: {minutes:.2f} min. ({hours:.2f} hr.)")
        return formatted_times

    total_listening_time_message = f"You listened to **{total_time_hours[user_id]:.2f}** hours of music this year."
    top_day_message = f"You listened to the most music on **{biggest_date.strftime('%B %d, %Y')}**! A whole **{total_minutes_on_biggest_date:.2f}** minutes of music!"
    unique_counts_msg = f"You listened to **{unique_counts['unique_songs']} unique songs** this year. That's music from **{unique_counts['unique_artists']} different artists** and on **{unique_counts['unique_albums']} different albums**! Way to go!"

    formatted_artists_time = "\n".join(format_artist_time(top_artists_time))
    formatted_artists_count = "\n".join([f"- **{artist}**: {count} plays" for artist, count in top_artists_count.items()])
    formatted_artists_time_w = "\n".join(format_artist_time(top_artists_weighted_time))
    formatted_artists_count_w = "\n".join(format_artist_count(top_artists_weighted_count))

    formatted_genres_year = "\n".join([f"- **{genre.title()}**" for genre in top_genres_year.index])
    formatted_top_genres_month = "\n".join([f"**{period}**: " + ", ".join(genres) for period, genres in top_genres_month.items()])

    formatted_weighted_artists_month = "\n".join([f"**{month}**: " + ", ".join(artists) for month, artists in top_weighted_artists_month.items()])
    formatted_artist_diversity_growth = "\n".join([f"**{month}**: {growth:.2f}% growth" for month, growth in artist_diversity_growth_result.items()])
    formatted_common_days = "\n".join([f"**{day}**: {count} times" for day, count in common_days.items()])

    formatted_songs_by_plays_month = "\n".join([f"**{month}**: " + ", ".join(songs) for month, songs in songs_by_plays_month.items()])
    formatted_weighted_songs_month = "\n".join([f"**{month}**: " + ", ".join(songs) for month, songs in weighted_songs_month.items()])

    formatted_monthly_patterns = "\n".join([f"{month}: {duration:.2f} minutes" for month, duration in monthly_patterns.items()])

    # Formatting top songs
    formatted_top_songs_unweighted_time = "\n".join([f"- **{song}**: {time['listening_time'] / (1000 * 60):.2f} min." for song, time in top_songs_unweighted_time.iterrows()])
    formatted_top_songs_weighted_time = "\n".join([f"- **{song}**: {time['listening_time'] / (1000 * 60):.2f} min." for song, time in top_songs_weighted_time.iterrows()])
    formatted_top_songs_unweighted_totals = "\n".join([f"- **{song}**: {count['listen_totals']} plays" for song, count in top_songs_unweighted_totals.iterrows()])
    formatted_top_songs_weighted_totals = "\n".join([f"- **{song}**: {count['listen_totals']:.2f} weighted plays" for song, count in top_songs_weighted_totals.iterrows()])

    # Filter the artist_span to only include these top artists and handle any missing artists
    top_artists = top_artists_weighted_time.index
    filtered_artist_span = {artist: artist_span.get(artist, None) for artist in top_artists}

    formatted_artist_attention_span = []
    for artist, span in filtered_artist_span.items():
        if span is not None:
            # A lookup may come back as a Series; report its first value.
            if isinstance(span, pd.Series):
                span = span.iloc[0]
            formatted_artist_attention_span.append(f"- **{artist.title()}**: {span:.2f}% average attention span")
        else:
            formatted_artist_attention_span.append(f"- **{artist.title()}**: Data not available")
    formatted_artist_attention_span = "\n".join(formatted_artist_attention_span)

    # Formatting genre diversity per month and growth
    formatted_genre_diversity_per_month = "\n".join([f"**{month}**: {diversity} genres" for month, diversity in genre_diversity_per_month_result.items()])
    formatted_genre_diversity_growth = "\n".join([f"**{month}**: {growth:.2f}% growth" for month, growth in genre_diversity_growth_result.items()])

    general_attention_message = f"The average percentage of tracks listened to before skipping is **{general_span:.2f}%**."

    formatted_listening_ranges = "\n".join([f"**{range_label}**: {count} times" for range_label, count in listening_ranges.items()])

    ########## Write the Markdown content to file ##########
    with open(file_name, 'w', encoding='utf-8') as md_file:
        # Introduction
        md_file.write(f"{document_title}\n\n{intro_text}\n\n")

        # General Section
        md_file.write("## General Listening Data\n")
        md_file.write(f"{total_listening_time_message}\n\n{top_day_message}\n\n{unique_counts_msg}\n\n")
        md_file.write(f"### Most Common Listening Days\n{formatted_common_days}\n\n")
        md_file.write(f"### Monthly Listening Patterns\n{formatted_monthly_patterns}\n\n")
        md_file.write(f"### General Attention Span\n{general_attention_message}\n\n")
        md_file.write(f"### Listening Percentage Categories\n{formatted_listening_ranges}\n\n")

        # Artists Section
        md_file.write("## Artist Data\n")
        md_file.write(f"### Top Artists by Listening Time\n{formatted_artists_time}\n\n")
        md_file.write(f"### Top Artists by Weighted Listening Time\n{formatted_artists_time_w}\n\n")
        md_file.write(f"### Top Artists by Count\n{formatted_artists_count}\n\n")
        md_file.write(f"### Top Artists by Weighted Count\n{formatted_artists_count_w}\n\n")
        md_file.write(f"### Top Weighted Artists per Month\n{formatted_weighted_artists_month}\n\n")
        md_file.write(f"### Artist Diversity Growth\n{formatted_artist_diversity_growth}\n\n")
        md_file.write(f"### Artist Attention Span\n{formatted_artist_attention_span}\n\n")
        md_file.write(f"### Top Collaborating Artists\n{formatted_top_collaborating_artists}\n\n")

        # Songs Section
        md_file.write("## Song Data\n")
        md_file.write("### Top Songs by Listening Time\n")
        md_file.write(f"#### Unweighted Listening Time\n{formatted_top_songs_unweighted_time}\n\n")
        md_file.write(f"#### Weighted Listening Time\n{formatted_top_songs_weighted_time}\n\n")
        md_file.write("### Top Songs by Listen Totals\n")
        md_file.write(f"#### Unweighted Listen Totals\n{formatted_top_songs_unweighted_totals}\n\n")
        md_file.write(f"#### Weighted Listen Totals\n{formatted_top_songs_weighted_totals}\n\n")
        md_file.write(f"### Top Songs by Unweighted Plays per Month\n{formatted_songs_by_plays_month}\n\n")
        md_file.write(f"### Top Songs by Weighted Listening Time per Month\n{formatted_weighted_songs_month}\n\n")

        # Genres Section
        md_file.write("## Genre Data\n")
        md_file.write(f"### Top Genres for the Year\n{formatted_genres_year}\n\n")
        md_file.write(f"### Top Genres Per Month\n{formatted_top_genres_month}\n\n")
        md_file.write(f"### Top Tracks by Top Genres\n{top_tracks_by_genres}\n\n")

        # Genre Diversity Section
        md_file.write("## Genre Diversity Data\n")
        md_file.write(f"### Genre Diversity Per Month\n{formatted_genre_diversity_per_month}\n\n")
        md_file.write(f"### Genre Diversity Growth\n{formatted_genre_diversity_growth}\n\n")

    print(f"Markdown report created: {file_name}")

# To build the Markdown report directly, without the widget form, run:
#create_md_report(user_id="ezra")

# Show the "User ID" text box and "Generate Report" button defined in this cell.
generate_report()



In [None]:
import pandas as pd
# Import necessary libraries
import ipywidgets as widgets
from IPython.display import display
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Frame, PageBreak
import matplotlib.pyplot as plt
import seaborn as sns

from reportlab.platypus import Paragraph, Image, SimpleDocTemplate, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import letter
import os
from reportlab.pdfbase.ttfonts import TTFont 
from reportlab.pdfbase import pdfmetrics



def generate_report():
    """
    Show a small widget form that builds the PDF report on demand.

    A text box collects the user ID and a "Generate Report" button runs
    create_pdf_report for whatever ID is in the box at the moment of the click.
    """
    # Build both widgets up front; the click handler below closes over the text box.
    user_id_input = widgets.Text(description="User ID:")
    generate_button = widgets.Button(description="Generate Report")

    def _build_report_on_click(b):
        # `b` is the argument supplied by the button's click event; it is unused.
        requested_id = user_id_input.value
        create_pdf_report(requested_id)
        print(f"Report generated for user ID: {requested_id}")

    generate_button.on_click(_build_report_on_click)

    # Render the input field followed by the button.
    display(user_id_input, generate_button)

def create_pdf_report(user_id='ezra'):
    """
    Build a PDF listening report for one user.

    Reads ``{user_id}_listening_data.csv`` from the current working directory,
    runs the notebook's analysis helpers on it, formats every result as
    ReportLab paragraph markup and writes ``{user_id}_spotify_report.pdf``.
    A confirmation line is printed once the document has been built.

    Values that come from the data (artist, track and genre names, month and
    category labels) are XML-escaped before being embedded in the markup, so a
    name containing ``&`` or ``<`` is rendered literally instead of being
    parsed as markup.

    Parameters:
    user_id (str): The user ID for which to generate the report. It selects the
                   input CSV, the output PDF name and the entry read from the
                   total-listening-time result.

    Returns:
    None
    """
    esc = _markup_escape

    # Set up the PDF document
    file_name = f"{user_id}_spotify_report.pdf"
    document_title = "Spotify Re-Wrapped 2024"
    intro_text = (
        "Spotify Wrapped for 2024 didn’t quite hit the mark, so I decided to take matters into my own hands. "
        "This project dives into my actual listening data to get a better picture of my music tastes. "
        "By analyzing various aspects of my Spotify history, I can uncover patterns, preferences, and trends that "
        "Spotify's summary might have missed. From listening times and favorite artists to genre distributions, "
        "this project aims to create a more accurate and personalized Spotify Re-Wrapped experience."
    )

    # Read the user's listening data and expand/standardize the artists involved
    file_path = f"{user_id}_listening_data.csv"
    df = pd.read_csv(file_path, encoding='utf-8')
    df = expand_artists_involved(df)

    ######## Perform analysis ##########
    # general
    total_time_hours = total_listening_time_per_user(df)
    biggest_date, total_minutes_on_biggest_date = biggest_listening_date(df)
    unique_counts = calculate_unique_counts(df)
    common_days = common_listening_days(df)
    monthly_patterns = monthly_listening_patterns(df)
    general_span = general_attention_span(df)
    listening_ranges = listening_percentage_categories(df)

    # artist
    top_artists_time = top_artists_by_time(df)
    top_artists_count = top_artists_by_count(df)
    top_artists_weighted_time = top_artists_by_weighted_time(df)
    top_artists_weighted_count = top_artists_by_weighted_count(df)
    top_weighted_artists_month = top_weighted_artists_per_month(df)
    # The second return value (per-artist listened time) is not used by the report
    artist_span, _ = artist_attention_span(df)

    # Lower-case both indexes so the attention-span lookup below matches by name
    artist_span.index = artist_span.index.str.lower()
    top_artists_weighted_time.index = top_artists_weighted_time.index.str.lower()

    # songs
    songs_by_plays_month = top_songs_by_plays_per_month(df)
    weighted_songs_month = top_songs_by_weighted_time_per_month(df)

    artist_diversity_growth_result = artist_diversity_growth(df)

    # genre
    top_genres_year = top_genres_for_year(df)

    ######## Document and styles
    doc = SimpleDocTemplate(file_name, pagesize=letter)
    header_style = ParagraphStyle(name='Header', fontName='Helvetica-Bold', fontSize=18, leading=22)
    body_style = ParagraphStyle(name='Body', fontName='Helvetica', fontSize=12, leading=14)
    header = Paragraph(document_title, header_style)
    introduction = Paragraph(intro_text, body_style)

    ########### Format the messages
    total_listening_time_message = f"You listened to <b>{total_time_hours[user_id]:.2f}</b> hours of music this year."
    top_day_message = f"You listened to the most music on <b>{biggest_date.strftime('%B %d, %Y')}</b>! A whole <b>{total_minutes_on_biggest_date:.2f}</b> minutes of music!"
    unique_counts_msg = f"You listened to <b>{unique_counts['unique_songs']} unique songs</b> this year. That's music from <b>{unique_counts['unique_artists']} different artists</b> and on <b>{unique_counts['unique_albums']} different albums</b>! Way to go!"

    # Unweighted totals are divided by 60 / 3600 to obtain minutes / hours
    formatted_artists_time = "<br/>".join(
        f"{esc(artist.title())} - {seconds / 60:.2f} min. ({seconds / 3600:.2f} hr.)"
        for artist, seconds in top_artists_time.items()
    )

    formatted_artists_count = "<br/>".join(
        f"{esc(artist.title())} - {count} plays."
        for artist, count in top_artists_count.items()
    )

    # Weighted time is converted from milliseconds to minutes / hours
    formatted_artists_time_w = "<br/>".join(
        f"{esc(artist.title())} - {millis / (1000 * 60):.2f} weighted min. "
        f"({millis / (1000 * 60 * 60):.2f} weighted hr.)"
        for artist, millis in _labelled_values(top_artists_weighted_time, 'weighted_time')
    )

    formatted_artists_count_w = "<br/>".join(
        f"{esc(artist.title())} - {count:.2f} weighted plays"
        for artist, count in _labelled_values(top_artists_weighted_count, 'weighted_listens')
    )

    formatted_genres_year = "<br/>".join(f"- {esc(genre.title())}" for genre in top_genres_year)

    # Top tracks for each of the first five genres of the year
    genre_sections = []
    for genre in top_genres_year[:5]:
        track_lines = "<br/>".join(
            f"{esc(track)} - {minutes:.2f} min."
            for track, minutes in top_tracks_by_genre(df, genre)
        )
        genre_sections.append(f"<b>Top Tracks in {esc(genre.title())}:</b><br/>{track_lines}")
    formatted_genre_tracks = "<br/><br/>".join(genre_sections)

    formatted_weighted_artists_month = _format_names_per_month(top_weighted_artists_month)

    formatted_artist_diversity_growth = "<br/>".join(
        f"<b>{esc(month)}:</b> {growth:.2f}% growth"
        for month, growth in artist_diversity_growth_result.items()
    )

    formatted_common_days = "<br/>".join(
        f"{esc(day)}: {count} times" for day, count in common_days.items()
    )

    formatted_songs_by_plays_month = _format_names_per_month(songs_by_plays_month)
    formatted_weighted_songs_month = _format_names_per_month(weighted_songs_month)

    formatted_monthly_patterns = "<br/>".join(
        f"{esc(month)}: {duration:.2f} minutes" for month, duration in monthly_patterns.items()
    )

    # Attention span, restricted to the top artists by weighted listening time;
    # artists without an attention-span entry are reported as unavailable
    top_artists = top_artists_weighted_time.index
    filtered_artist_span = {artist: artist_span.get(artist, None) for artist in top_artists}

    attention_lines = ["<b>Attention Span for Top Artists by Weighted Listening Time:</b>"]
    for artist, span in filtered_artist_span.items():
        if span is None:
            attention_lines.append(f"{esc(artist.title())}: Data not available")
            continue
        if isinstance(span, pd.Series):
            # A lookup can return several rows for one label; report the first
            span = span.iloc[0]
        attention_lines.append(f"{esc(artist.title())}: {span:.2f}% average attention span")
    formatted_artist_attention_span = "<br/>".join(attention_lines)

    general_attention_message = f"The average percentage of tracks listened to before skipping is <b>{general_span:.2f}%</b>."

    formatted_listening_ranges_message = "<br/>".join(
        f"{esc(range_label)}: {count} times" for range_label, count in listening_ranges.items()
    )

    ########## Assemble the elements
    elements = [header, introduction, Spacer(1, 12)]

    # (section title, section body) pairs, in the order they appear in the PDF
    analysis_results = [
        # general
        ("<b>Total Listening Time per User (in hours):</b>", total_listening_time_message),
        ("<b>Biggest Listening Date and Total Minutes Listened:</b>", top_day_message),
        ("<b>Number of Unique Songs, Artists, and Albums:</b>", unique_counts_msg),
        ("<b>Most Common Listening Days:</b>", formatted_common_days),
        ("<b>Monthly Listening Patterns:</b>", formatted_monthly_patterns),
        ("<b>General Attention Span:</b>", general_attention_message),
        ("<b>Listening Percentage Categories:</b>", formatted_listening_ranges_message),

        # artists
        ("<b>Top Artists by Listening Time:</b>", formatted_artists_time),
        ("<b>Top Artists by Count:</b>", formatted_artists_count),
        ("<b>Top Artists by Weighted Listening Time:</b>", formatted_artists_time_w),
        ("<b>Top Artists by Weighted Count:</b>", formatted_artists_count_w),
        ("<b>Top Weighted Artists per Month:</b>", formatted_weighted_artists_month),
        ("<b>Artist Diversity Growth:</b>", formatted_artist_diversity_growth),
        ("<b>Artist Attention Span:</b>", formatted_artist_attention_span),

        # songs
        ("<b>Top Songs by Unweighted Plays per Month:</b>", formatted_songs_by_plays_month),
        ("<b>Top Songs by Weighted Listening Time per Month:</b>", formatted_weighted_songs_month),

        # genres
        ("<b>Top Genres for the Year:</b>", formatted_genres_year),
        ("<b>Top Tracks by Genre:</b>", formatted_genre_tracks),
    ]

    for title, content in analysis_results:
        elements.append(Paragraph("<br/>" + title, body_style))
        elements.append(Paragraph(content, body_style))

    # The report is written in a single pass
    doc.build(elements)
    print("Part 1 of the PDF saved.")


def _markup_escape(value):
    """Return str(value) with &, < and > escaped for use inside paragraph markup."""
    from xml.sax.saxutils import escape

    return escape(str(value))


def _labelled_values(data, column):
    """Return (label, value) pairs from a Series/mapping, or from ``column`` of a DataFrame."""
    source = data[column] if isinstance(data, pd.DataFrame) else data
    return source.items()


def _format_names_per_month(names_per_month):
    """Format a {month: [names]} mapping as one bold-labelled markup line per month."""
    return "<br/>".join(
        f"<b>{_markup_escape(month)}:</b> " + ", ".join(_markup_escape(name) for name in names)
        for month, names in names_per_month.items()
    )



def get_unique_genres(df):
    """
    Get unique genres from the DataFrame.

    Each cell of the ``genres`` column is normally a list of genre names.
    Cells are also tolerated in the forms they take after a CSV round trip:
    missing values (None / NaN) are skipped, a string holding a Python list
    literal such as ``"['rock', 'pop']"`` is parsed, and any other non-blank
    string is counted as a single genre instead of being split into characters.

    Args:
        df (pd.DataFrame): DataFrame containing the listening data.

    Returns:
        set: Set containing unique genres.
    """
    import ast

    unique_genres = set()
    for genres in df['genres']:
        # NaN is the only float that differs from itself
        if genres is None or (isinstance(genres, float) and genres != genres):
            continue

        if isinstance(genres, str):
            if not genres.strip():
                continue
            try:
                parsed = ast.literal_eval(genres)
            except (ValueError, SyntaxError):
                parsed = None
            # Anything that is not a literal collection is one plain genre name
            genres = parsed if isinstance(parsed, (list, tuple, set)) else [genres]

        unique_genres.update(genres)
    return unique_genres


# Show the user-ID text box and "Generate Report" button; the PDF is built when
# the button is clicked
generate_report()

# To build a report without the widgets, call create_pdf_report directly, e.g.:
#create_pdf_report(user_id="ezra")

In [None]:
from reportlab.platypus import Paragraph, Image, SimpleDocTemplate, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import letter
import os

def ensure_album_art(artist_name, folder='albums'):
    """
    Make sure album art for an artist is present in ``folder``.

    The artist name is normalised (spaces become underscores, lower-cased) and
    looked up with check_album_art_exists. If no file is found, a token is
    obtained from get_spotify_token and the download is delegated to
    fetch_album_art, which receives the original, un-normalised name.

    Parameters:
    artist_name (str): The name of the artist.
    folder (str): The folder to check and save album art. Defaults to 'albums'.
    """
    file_stem = artist_name.replace(' ', '_').lower()
    if check_album_art_exists(file_stem, folder):
        return  # already on disk, nothing to fetch

    fetch_album_art(artist_name, get_spotify_token(), folder)

def check_album_art_exists(artist_name, folder='albums'):
    """
    Report whether ``<folder>/<artist_name>.jpg`` is an existing file.

    Parameters:
    artist_name (str): File stem to look for; it is used exactly as given.
    folder (str): The folder to check for album art. Defaults to 'albums'.

    Returns:
    bool: True if album art exists, False otherwise.
    """
    return os.path.isfile("{}/{}.jpg".format(folder, artist_name))

def generate_artist_elements_with_images(top_artists_time):
    """
    Build the flowables for an artist list: image, caption and spacer per artist.

    For every artist, ensure_album_art is called first; the image is then taken
    from ``albums/<normalised name>.jpg`` when that file exists and from
    ``placeholder.jpg`` otherwise. The caption shows the value formatted with
    two decimals and labelled as seconds.

    Parameters:
    top_artists_time (pandas.Series): A Series where the index contains artist names and the values contain the listening times.

    Returns:
    list: List of Flowable elements with album art and listening times for each artist.
    """
    caption_style = getSampleStyleSheet()['Normal']
    flowables = []

    for artist_name, listened in top_artists_time.items():
        file_stem = artist_name.replace(' ', '_').lower()
        ensure_album_art(artist_name)

        art_path = f"albums/{file_stem}.jpg"
        image_source = art_path if os.path.isfile(art_path) else 'placeholder.jpg'

        flowables.extend([
            Image(image_source, width=50, height=50),
            Paragraph(f"{artist_name}: {listened:.2f} seconds", caption_style),
            Spacer(1, 12),  # vertical gap between entries
        ])

    return flowables

# Example run of the image-based artist list.
# Assuming 'df' is your DataFrame with Spotify listening data; it is not created
# in this cell, so an earlier cell must have defined it.
top_artists_time = top_artists_by_time(df)
artist_elements = generate_artist_elements_with_images(top_artists_time)

# Example usage in a SimpleDocTemplate: writes example_report.pdf (letter-sized
# pages) to the working directory
doc = SimpleDocTemplate("example_report.pdf", pagesize=letter)
doc.build(artist_elements)


<strong><h1>Main Functions</h1></strong>

<strong>Process and Track Songs:</strong> Function to process raw listening data, add songs to the unique_songs file, and save the new CSV.

In [None]:
# Import necessary libraries
import ipywidgets as widgets
from IPython.display import display

def process_and_track_songs(base_path='../wrapped_files/', unique_songs_file='unique_songs.csv'):
    """
    Display widgets that process raw listening data and track unique songs.

    Shows a user-ID text box, a chunk-count box and a button. On click, the
    chunks are read and combined with read_and_process_data, the result is
    exported with export_to_csv and passed to track_unique_songs. A ValueError
    raised by any of these steps is printed instead of propagating.

    Parameters:
    base_path (str, optional): The base path to the directory containing the raw listening data files.
                               Defaults to '../wrapped_files/'.
    unique_songs_file (str, optional): The file path to the CSV file where unique songs are stored.
                                       Defaults to 'unique_songs.csv'.

    Returns:
    None
    """
    # Input widgets and the trigger button
    user_id_input = widgets.Text(description="User ID:")
    num_chunks_input = widgets.IntText(description="Num Chunks:")
    process_button = widgets.Button(description="Process and Export Data")

    def on_process_button_click(b):
        # Values are read at click time, not when the widgets are created
        selected_user, chunk_total = user_id_input.value, num_chunks_input.value
        try:
            combined = read_and_process_data(selected_user, chunk_total, base_path)
            export_to_csv(combined, selected_user)
            track_unique_songs(combined, unique_songs_file)
        except ValueError as err:
            print(err)
        else:
            print("Data processing complete!")

    process_button.on_click(on_process_button_click)
    display(user_id_input, num_chunks_input, process_button)

# Render the input widgets with the default paths; processing only starts when
# the button is clicked
process_and_track_songs()


In [None]:
# Import necessary libraries
import ipywidgets as widgets
from IPython.display import display

def update_unique_songs_data(unique_songs_file='unique_songs.csv'):
    """
    Display a button that refreshes the unique songs table.

    Clicking the button calls update_unique_songs with ``unique_songs_file``
    and then prints a confirmation message.

    Parameters:
    unique_songs_file (str, optional): The file path to the CSV file where unique songs are stored.
                                       Defaults to 'unique_songs.csv'.

    Returns:
    None
    """
    update_button = widgets.Button(description="Update Unique Songs")

    def on_update_button_click(b):
        update_unique_songs(unique_songs_file)
        print("Unique songs table updated with Spotify info.")

    # Register the click handler before the button is rendered
    update_button.on_click(on_update_button_click)
    display(update_button)

# Render the "Update Unique Songs" button for the default unique_songs.csv; the
# update itself runs on click
update_unique_songs_data()


In [None]:
import pandas as pd

# Input and output locations for the artist-ordered copy of the database
unique_songs_file = 'unique_songs.csv'
sorted_unique_songs_file = 'sorted_unique_songs.csv'

# Load, order the rows by artistName, and write the result without the index
unique_songs = pd.read_csv(unique_songs_file)
sorted_unique_songs = unique_songs.sort_values(by='artistName')
sorted_unique_songs.to_csv(sorted_unique_songs_file, index=False)

print(f"Sorted unique songs database saved to {sorted_unique_songs_file}.")

In [None]:
# Import necessary libraries
import ipywidgets as widgets
from IPython.display import display
import pandas as pd

def process_filled_listening_data(unique_songs_file='unique_songs.csv'):
    """
    Display widgets that enrich a user's processed listening data and export it.

    On click, the unique songs CSV is loaded, the user's processed data is read
    with read_processed_data and enriched through fill_song_info. If the result
    has no 'duration_ms' column a warning is printed and nothing is exported.
    Otherwise 'percentage_listened' is computed as msPlayed / duration_ms * 100,
    a 'genre' column that is entirely null is dropped, and the frame is handed
    to export_filled_data. A FileNotFoundError raised while reading, filling or
    exporting is reported as a missing processed-data file.

    Parameters:
    unique_songs_file (str, optional): The file path to the CSV file where unique songs are stored.
                                       Defaults to 'unique_songs.csv'.

    Returns:
    None
    """
    user_id_input = widgets.Text(description="User ID:")
    process_button = widgets.Button(description="Process Filled Listening Data")

    def on_process_button_click(b):
        # The catalogue is read outside the try block: a missing catalogue file
        # is not reported through the handler below
        song_catalog = pd.read_csv(unique_songs_file)
        selected_user = user_id_input.value

        try:
            filled = fill_song_info(read_processed_data(selected_user), song_catalog)

            # Without track durations no percentage can be computed
            if 'duration_ms' not in filled.columns:
                print("Warning: 'duration_ms' column is missing in filled_listening_data.")
                return

            filled['percentage_listened'] = filled['msPlayed'] / filled['duration_ms'] * 100

            # Drop the 'genre' column only when it exists and holds no data
            genre_is_blank = 'genre' in filled.columns and filled['genre'].isnull().all()
            if genre_is_blank:
                filled = filled.drop(columns=['genre'])

            export_filled_data(filled, selected_user)
            print("Data processing complete!")
        except FileNotFoundError:
            print(f"Processed data file not found for user ID: {selected_user}")

    process_button.on_click(on_process_button_click)
    display(user_id_input, process_button)

# Render the user-ID box and button for the default unique_songs.csv; the data
# is processed and exported on click
process_filled_listening_data()




**Color Generation Functions**


In [40]:
# Function to generate a random color
def generate_random_color():
    """Return a random (r, g, b) tuple with every channel drawn from 100..255, and print it."""
    color = tuple(random.randint(100, 255) for _ in range(3))
    print(f"Generated random color: {color}")
    return color

# Function to generate a color close to a given color
def generate_similar_color(color, variance=50):
    """
    Return a colour near ``color`` and print it.

    Each of the first three channels is shifted by an independent random
    integer in [-variance, variance] and then clamped to the range 0..255.
    """
    def shifted(channel):
        # Offset one channel, then clamp it into the valid 0..255 range
        return min(max(channel + random.randint(-variance, variance), 0), 255)

    similar_color = (shifted(color[0]), shifted(color[1]), shifted(color[2]))
    print(f"Generated color similar to {color}: {similar_color}")
    return similar_color



**Abstract Background Generation**


In [41]:
# Function to generate an abstract background with dynamic colors
def generate_abstract_background(width=1080, height=1920, output_path='abstract_background.png'):
    """
    Render an abstract wave pattern in a random two-colour gradient.

    The pattern sin(x^2 + y^2) * cos(y^2 - x^2) is evaluated on a grid spanning
    -5..5 on both axes, drawn with a colormap interpolated between a random
    colour and a similar one, saved to ``output_path`` and reloaded as a PIL
    image resized to exactly (width, height).

    Parameters:
    width (int): Output width in pixels. Defaults to 1080.
    height (int): Output height in pixels. Defaults to 1920.
    output_path (str): Where the rendered PNG is written.
                       Defaults to 'abstract_background.png'.

    Returns:
    PIL.Image.Image: The rendered background, resized to (width, height).
    """
    # Imported locally under explicit names: other cells bind the bare name
    # ``Image`` to reportlab.platypus.Image, so which ``Image`` is in scope
    # would otherwise depend on the order the cells were run in.
    from matplotlib.colors import ListedColormap
    from PIL import Image as PILImage

    print(f"Generating abstract background of size {width}x{height}")
    start_color = generate_random_color()
    end_color = generate_similar_color(start_color)

    # One colormap entry per output column, interpolated between the two colours
    gradient = np.linspace(start_color, end_color, width).astype(int)
    gradient_cmap = ListedColormap(gradient / 255.0)

    x = np.linspace(-5, 5, width)
    y = np.linspace(-5, 5, height)
    X, Y = np.meshgrid(x, y)
    Z = np.sin(X**2 + Y**2) * np.cos(Y**2 - X**2)

    fig = plt.figure(figsize=(width / 100, height / 100), dpi=100)
    try:
        plt.imshow(Z, cmap=gradient_cmap, interpolation='bilinear')
        plt.axis('off')
        plt.savefig(output_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Release the figure even when drawing or saving fails
        plt.close(fig)

    # The tight bounding box changes the saved size, so resize to the exact
    # dimensions; resize() returns an independent image, so the file can close
    with PILImage.open(output_path) as saved:
        background = saved.resize((width, height))
    print(f"Abstract background generated and saved as '{output_path}'")
    return background

# Function to generate Perlin noise
def generate_perlin_noise(width, height, scale=100, seed=None):
    """
    Generate a 2-D Perlin noise field normalised to the range 0..1.

    Every cell (i, j) is filled with ``noise.pnoise2(i / scale, j / scale, ...)``
    using 6 octaves, persistence 0.5, lacunarity 2.0 and a 1024 repeat period,
    then the field is min-max normalised.

    Parameters:
    width (int): Size of the first array axis.
    height (int): Size of the second array axis.
    scale (float): Divisor applied to the cell indices before sampling.
                   Defaults to 100.
    seed (int, optional): Value passed to pnoise2 as ``base``. When omitted, a
                          fresh random value in 0..500 is drawn on every call.
                          (A ``random.randint`` call in the signature would be
                          evaluated only once, at definition time, giving every
                          call the same pattern.)

    Returns:
    numpy.ndarray: Array of shape (width, height) with values in [0, 1]; all
                   zeros when the raw field is constant.
    """
    if seed is None:
        seed = random.randint(0, 500)
    print(f"Generating Perlin noise of size {width}x{height} with scale {scale} and seed {seed}")

    # NOTE(review): the array is indexed [i][j] with i < width, so its shape is
    # (width, height); imshow treats the first axis as rows - confirm that this
    # orientation is intended by the callers.
    shape = (width, height)
    world = np.zeros(shape)
    for i in range(shape[0]):
        for j in range(shape[1]):
            world[i][j] = noise.pnoise2(i / scale, j / scale, octaves=6, persistence=0.5, lacunarity=2.0, repeatx=1024, repeaty=1024, base=seed)

    lowest = np.min(world)
    value_range = np.max(world) - lowest
    if value_range == 0:
        # A constant field cannot be min-max normalised; avoid dividing by zero
        norm_world = np.zeros_like(world)
    else:
        norm_world = (world - lowest) / value_range
    print("Perlin noise generated")
    return norm_world

# Function to generate an abstract background with Perlin noise
def generate_abstract_background_with_noise(width=1080, height=1920, output_path='abstract_background_with_noise.png'):
    """
    Render a Perlin-noise background in a random two-colour gradient.

    A noise field from generate_perlin_noise(width, height) is drawn with a
    colormap interpolated between a random colour and a similar one, saved to
    ``output_path`` and reloaded as a PIL image resized to (width, height).

    Parameters:
    width (int): Output width in pixels. Defaults to 1080.
    height (int): Output height in pixels. Defaults to 1920.
    output_path (str): Where the rendered PNG is written.
                       Defaults to 'abstract_background_with_noise.png'.

    Returns:
    PIL.Image.Image: The rendered background, resized to (width, height).
    """
    # Imported locally under explicit names: other cells bind the bare name
    # ``Image`` to reportlab.platypus.Image, so which ``Image`` is in scope
    # would otherwise depend on the order the cells were run in.
    from matplotlib.colors import ListedColormap
    from PIL import Image as PILImage

    print(f"Generating abstract background with Perlin noise of size {width}x{height}")
    noise_pattern = generate_perlin_noise(width, height)

    start_color = generate_random_color()
    end_color = generate_similar_color(start_color)
    gradient = np.linspace(start_color, end_color, width).astype(int)
    gradient_cmap = ListedColormap(gradient / 255.0)

    fig = plt.figure(figsize=(width / 100, height / 100), dpi=100)
    try:
        plt.imshow(noise_pattern, cmap=gradient_cmap, interpolation='bilinear')
        plt.axis('off')
        plt.savefig(output_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Release the figure even when drawing or saving fails
        plt.close(fig)

    # resize() returns an independent image, so the source file can be closed
    with PILImage.open(output_path) as saved:
        background = saved.resize((width, height))
    print(f"Abstract background with Perlin noise generated and saved as '{output_path}'")
    return background



**Text Drawing Function**


In [42]:
# Function to draw wrapped text
def draw_wrapped_text(draw, text, position, font, max_width, fill):
    """
    Draw ``text`` word-wrapped to ``max_width`` and return the next free y.

    Words are packed greedily: a word joins the current line while the measured
    width of the line plus that word (``font.getbbox(...)[2]``) stays within
    ``max_width``. A single word wider than ``max_width`` is placed on a line of
    its own, so the function always terminates. Each line keeps a trailing
    space and is drawn at ``position[0]``; y advances by ``font.getbbox(line)[3]``
    after every line.

    Parameters:
    draw: Object with a ``text((x, y), text, font=..., fill=...)`` method.
    text (str): The text to wrap; it is split on whitespace.
    position (tuple): (x, y) of the first line.
    font: Object with a ``getbbox(text)`` method returning a 4-tuple.
    max_width (number): Maximum measured line width.
    fill: Passed through to ``draw.text``.

    Returns:
    number: The y coordinate just below the last drawn line (``position[1]``
            when the text has no words).
    """
    print(f"Drawing wrapped text: {text}")
    lines = []
    line = ''
    for word in text.split():
        # Start a new line only when there is already something on this one;
        # an over-wide word on an empty line is kept rather than retried forever
        if line and font.getbbox(line + word)[2] > max_width:
            lines.append(line)
            line = ''
        line += word + ' '
    if line:
        lines.append(line)

    y_offset = position[1]
    for line in lines:
        draw.text((position[0], y_offset), line, font=font, fill=fill)
        y_offset += font.getbbox(line)[3]  # Use getbbox for line height
    print(f"Wrapped text drawn at position {position}")
    return y_offset



**Album Art Download!**


In [43]:
# Function to fetch and download all popular album art for each artist
def download_all_album_art(df, top_artists, folder="albums", timeout=10):
    """
    Download one album cover per artist and return the saved file paths.

    For each artist, the artwork URLs of that artist's rows are tried from most
    to least frequent until one downloads and saves successfully. Artists with
    no rows in ``df`` are skipped silently; artists whose URLs all fail are
    reported and left out of the result.

    Parameters:
    df (pd.DataFrame): Listening data with 'artistName' and 'album_artwork' columns.
    top_artists (iterable): Artist names to fetch artwork for.
    folder (str): Directory the images are saved in; created on demand.
                  Defaults to "albums".
    timeout (float): Seconds to wait for each HTTP request. Defaults to 10.

    Returns:
    dict: Maps artist name to the path of the saved JPEG.
    """
    print("Downloading album art for top artists")
    album_art = {}
    for artist in top_artists:
        artist_data = df[df['artistName'] == artist]
        if artist_data.empty:
            continue

        img_path = os.path.join(folder, f'{_safe_file_stem(artist)}_album_art.jpg')
        # value_counts() orders the URLs by how often they occur for this artist
        for art_url in artist_data['album_artwork'].value_counts().index:
            try:
                _download_image_as_jpeg(art_url, img_path, timeout)
            except Exception as e:
                # Deliberately best-effort: any failure just moves on to the next URL
                print(f"Error downloading {art_url} for {artist}: {e}")
                continue
            album_art[artist] = img_path
            print(f"Downloaded album art for {artist}: {img_path}")
            break
        else:
            print(f"Could not download album art for {artist}")
    return album_art


def _safe_file_stem(name):
    """Replace path separators and characters that are illegal in file names with '_'."""
    return str(name).translate(str.maketrans({ch: '_' for ch in '\\/:*?"<>|'}))


def _download_image_as_jpeg(art_url, img_path, timeout):
    """Fetch ``art_url`` and save it as an RGB JPEG at ``img_path``; raises on any failure."""
    # Local imports under explicit names: the bare name ``Image`` is bound to
    # different libraries by different cells of this notebook
    from io import BytesIO
    from PIL import Image as PILImage

    response = requests.get(art_url, timeout=timeout)
    response.raise_for_status()  # do not try to decode an HTTP error page as an image

    os.makedirs(os.path.dirname(img_path) or ".", exist_ok=True)
    with PILImage.open(BytesIO(response.content)) as img:
        # JPEG cannot store alpha or palette modes, so normalise to RGB first
        img.convert("RGB").save(img_path)

In [44]:
from PIL import Image, ImageDraw, ImageFont

# Function to create and save layout images
def create_layout_image(title, top_artists, album_art, file_name, user_id, background):
    """
    Compose a ranked artist list on a copy of ``background`` and save it.

    The title and a "User: ..." line are centred near the top. Below them, each
    artist that has an entry in ``album_art`` gets a 100x100 thumbnail and a
    "rank. artist: value" label. Artists without artwork are skipped but still
    consume their rank number. The result is written to ``file_name``.

    Parameters:
    title (str): Heading drawn at the top of the image.
    top_artists: Mapping or Series whose ``items()`` yields (artist, value) pairs.
    album_art (dict): Maps artist name to the path of its artwork file.
    file_name (str): Output path of the composed image.
    user_id (str): Shown in the "User: ..." line.
    background (PIL.Image.Image): Image used as the canvas; it is not modified.
    """
    print(f"Creating layout image: {file_name}")
    width, _ = background.size
    canvas = background.copy()
    pen = ImageDraw.Draw(canvas)

    # Body, title and user-line fonts; fall back to the built-in font when
    # arial.ttf cannot be loaded
    try:
        font, title_font, user_id_font = (
            ImageFont.truetype("arial.ttf", size) for size in (40, 60, 30)
        )
    except IOError:
        font, title_font, user_id_font = (ImageFont.load_default() for _ in range(3))

    # Centred heading block
    pen.text((width / 2, 50), title, font=title_font, fill="white", anchor="mm")
    pen.text((width / 2, 150), f"User: {user_id}", font=user_id_font, fill="white", anchor="mm")

    left, top = 50, 250
    for rank, (artist, value) in enumerate(top_artists.items(), start=1):
        if artist in album_art:
            thumbnail = Image.open(album_art[artist]).resize((100, 100))
            canvas.paste(thumbnail, (left, top))
            pen.text((left + 120, top + 30), f"{rank}. {artist}: {value}", font=font, fill="white")
            top += 120  # only drawn rows move the cursor down

    canvas.save(file_name)
    print(f"Layout image saved as {file_name}")


In [None]:
# Quick console summary of one user's enriched listening data.
# The input file name is hard-coded for the user "ezra".
df = read_filled_listening_data('ezra_listening_data_with_percentage.csv')

# Calculate total listening time per user
total_time = total_listening_time_per_user(df)
print("Total listening time per user (in hours):")
print(total_time, "\n")

# Identify the biggest listening date
# NOTE(review): create_pdf_report unpacks this call's result into
# (date, minutes); if that is the return shape, the whole pair is printed here.
biggest_date = biggest_listening_date(df)
print("Biggest listening date:")
print(biggest_date, "\n")

# Analyze top 5 music tastes per month
# (the result is iterated with .items(), i.e. it is used as a month -> artists mapping)
taste_per_month = music_taste_per_month(df)
print("Top 5 artists per month:")
for month, artists in taste_per_month.items():
    print(f"Month {month}: {artists}")
print("\n")

# Determine the most common listening days and times
common_days = common_listening_days_and_times(df)
print("Most common listening days:")
print(common_days, "\n")


    

    



<h3>Album Analysis Functions</h3>

In [None]:
def main():
    """
    Create the three "top artists" story images for one user.

    Prompts for a user ID, loads ``{user}_listening_data.csv``, computes the
    top five artists by play count, listening time and weighted listening time,
    downloads album art for all of them and writes one 1080x1920 layout image
    per ranking. If the input file does not exist, a message is printed and
    nothing else is done.
    """
    # Get user ID and construct the file path
    user_name = get_user_id()
    file_path = f'{user_name}_listening_data.csv'

    # Load the data before doing any expensive work. Only this read is guarded,
    # so the "File not found" message always refers to the input file itself.
    try:
        filled_listening_data = read_filled_listening_data(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return

    # Calculate percentage listened for each track
    filled_listening_data = calculate_percentage_listened(filled_listening_data)

    # Calculate top listened-to artists
    top_artists_count = top_artists_by_count(filled_listening_data).head(5)
    top_artists_time = top_artists_by_time(filled_listening_data).head(5)
    top_artists_weighted_time = top_artists_by_weighted_time(filled_listening_data).head(5)

    # Combine all top artists to ensure all album art is downloaded
    all_top_artists = top_artists_count.index.union(top_artists_time.index).union(top_artists_weighted_time.index)

    # Download the most common album art for each artist
    album_art = download_all_album_art(filled_listening_data, all_top_artists)

    # Generate the Instagram story-sized Perlin-noise background; this is the
    # slow step, so it runs only once the data is known to be available
    background = generate_abstract_background_with_noise(1080, 1920)

    # Create layout images with user ID in the file name and abstract background
    create_layout_image("Top Artists by Count", top_artists_count, album_art, f"{user_name}_spotify_wrapped_top_artists_count.png", user_name, background)
    # Listening-time totals are divided by 60 for the "(minutes)" panel
    create_layout_image("Top Artists by Listening Time (minutes)", {k: v / 60 for k, v in top_artists_time.items()}, album_art, f"{user_name}_spotify_wrapped_top_artists_time.png", user_name, background)
    create_layout_image("Top Artists by Weighted Listening Time", top_artists_weighted_time, album_art, f"{user_name}_spotify_wrapped_top_artists_weighted_time.png", user_name, background)

    print("Data processing and layout creation complete!")

# Run the main function when this code is executed directly rather than imported
if __name__ == '__main__':
    main()