In [11]:
# Import libraries
import yearly_data_preprocessing
import ground_truth
# import lyrics_positivity_analyzer
import os
from huggingface_hub import hf_hub_download


In [12]:
def get_avg_positivity_score(tags: list) -> float:
    """
    Calculate the average positivity score for a list of emotion tags.

    Each recognised tag maps to a fixed score from 1 (most negative) to
    5 (most positive). Matching is case-insensitive and ignores surrounding
    whitespace, a leading '#', and trailing punctuation, so tags taken
    straight from model output (e.g. 'joyful', '#Sad,') are still recognised.
    Unrecognised tags and non-string entries are ignored.

    Args:
        tags (list): A list of tags representing emotions. May be empty or None.

    Returns:
        float: The mean score of the recognised tags, or 0.0 when none of the
        tags are recognised.
    """
    positivity_dict = {
    'Joyful': 5, 'Melancholic': 2, 'Hopeful': 5, 'Angry': 1, 'Romantic': 4,
    'Nostalgic': 3, 'Sad': 1, 'Energetic': 4, 'Passionate': 4, 'Lonely': 1,
    'Uplifting': 5, 'Bittersweet': 3, 'Empowering': 5, 'Heartbroken': 1,
    'Reflective': 3, 'Playful': 4, 'Dark': 1, 'Calm': 4, 'Longing': 2, 'Triumphant': 5
    }
    # Case-folded view of the table so lookups do not depend on capitalisation.
    score_by_tag = {name.casefold(): score for name, score in positivity_dict.items()}

    valid_scores = []
    for tag in tags or []:
        if not isinstance(tag, str):
            continue
        # Drop decoration that commonly surrounds tags in generated text.
        key = tag.strip(" \t\r\n#,.;:!\"'").casefold()
        score = score_by_tag.get(key)
        if score is not None:
            valid_scores.append(score)

    if not valid_scores:
        return 0.0
    return sum(valid_scores) / len(valid_scores)
    
def from_lyrics_to_positivity(lyrics: str) -> tuple[list[str], float]:
    """
    Ask the local LLM for emotion tags describing the lyrics and score them.

    The prompt requests exactly three tags in the form '#tag1 #tag2 #tag3';
    only the first line of the model's reply is parsed. The number of tags
    actually returned is not enforced here.

    NOTE(review): this function relies on a global callable `llm` that is not
    defined in the visible cells. It is assumed to accept
    (prompt, temperature=, max_tokens=, stop=) and to return a dict shaped like
    {'choices': [{'text': ...}]} -- confirm against the cell that loads the model.

    Args:
        lyrics (str): The lyrics to analyze.

    Returns:
        tuple[list[str], float]: The parsed tags (without '#' or surrounding
        punctuation) and their average positivity score. Returns ([], 0.0)
        without calling the model when `lyrics` is empty, blank, or not a string.
    """
    # Nothing to analyze: skip the model call rather than scoring a made-up answer.
    if not isinstance(lyrics, str) or not lyrics.strip():
        return [], 0.0

    prompt = """
        You are an expert in analyzing song lyrics to determine the emotions they convey.
        Analyze the following song lyrics and return exactly 3 emotion tags that best summarize the emotions conveyed by the song. Only output the tags, in this format: #tag1 #tag2 #tag3.
        The tags must be adjectives and strictly chosen from the following list: Joyful, Melancholic, Hopeful, Angry, Romantic, Nostalgic, Sad, Energetic, Passionate, Lonely, Uplifting, Bittersweet, Empowering, Heartbroken, Reflective, Playful, Dark, Calm, Longing, Triumphant
    '''{lyrics}'''
    """
    # Stop sequences are passed as a list (the original passed a set).
    response = llm(prompt.format(lyrics=lyrics), temperature=0.0, max_tokens=512, stop=["\n\n\n"])

    # str.split always yields at least one element, so index 0 is safe.
    first_line = response['choices'][0]['text'].strip().split('\n')[0]

    # Strip the '#' prefix and any separators the model added (e.g. '#Joyful,'),
    # and drop tokens that were nothing but decoration.
    tags = []
    for token in first_line.split():
        cleaned = token.strip("#,.;:")
        if cleaned:
            tags.append(cleaned)

    return tags, get_avg_positivity_score(tags)

In [13]:
# Years to load (2016 through 2025 inclusive).
years = list(range(2016, 2026))

# Directory passed to the yearly preprocessing step. Set the
# MUSIC_MOOD_MIRROR_INPUT_DIR environment variable to run this notebook on
# another machine; the original author's local path remains the default.
input_data_dir = os.environ.get(
    "MUSIC_MOOD_MIRROR_INPUT_DIR",
    r'C:\Users\sarah\Documents\Github\music-mood-mirror\data\input_data\processed_with_weeklyfreq',
)

# The result is used below as a mapping from year to a list of song objects.
all_processed_songs_by_year = yearly_data_preprocessing.batch_preprocess_yearly_data(input_data_dir, years)


Processed 795 songs from 2016.
Processed 4463 songs from 2017.
Processed 4833 songs from 2018.
Processed 7639 songs from 2019.
Processed 4876 songs from 2020.
Processed 5180 songs from 2021.
Processed 4941 songs from 2022.
Processed 5975 songs from 2023.
Processed 5813 songs from 2024.
Processed 2151 songs from 2025.


In [None]:
# Score songs with the local LLM model: for every year, tag each song that is
# the first occurrence in its week and store the tags, the positivity score,
# and the score weighted by that week's repeat count on the song object.

for year in years:
    songs = all_processed_songs_by_year.get(year, [])
    if not songs:
        # Raise instead of calling exit(): inside a notebook, exit() addresses
        # the kernel rather than cleanly aborting this cell.
        raise RuntimeError(f"No songs found for the year {year}")

    # Keep only songs flagged as the first occurrence in their week; songs
    # without the attribute are kept (default True).
    songs = [song for song in songs if getattr(song, "first_occurrence_in_week", True)]

    for song in songs:
        song.mood_tags_local, song.positivity_score_local = from_lyrics_to_positivity(song.lyrics)
        song.positivity_score_local_wghted = song.positivity_score_local * song.repeats_this_week

    # Show one scored example per year as a sanity check; the filter above may
    # legitimately leave nothing to show.
    if songs:
        print(songs[0])

In [None]:
from collections import defaultdict
import json

# Aggregate the positivity scores by week for each year.
# Result: {year: [avg for week 1, ..., avg for week 53]}, where each average is
# sum(score * repeats) / sum(repeats) over the songs scored in that week.
weekly_weighted_avg_positivity_by_year = {}

for year in years:
    songs = all_processed_songs_by_year.get(year, [])
    # Group songs by week number
    week_to_scores = defaultdict(lambda: {'weighted_sum': 0.0, 'repeats_sum': 0})
    for song in songs:
        week = getattr(song, "week", None)
        if week is None:
            continue
        # Only songs that were actually scored may contribute. An unscored
        # song would add nothing to the numerator but still add its repeats
        # to the denominator, dragging the weekly average toward zero.
        if not hasattr(song, "positivity_score_local_wghted"):
            continue
        weighted_score = song.positivity_score_local_wghted
        repeats = getattr(song, "repeats_this_week", 1)
        week_to_scores[week]['weighted_sum'] += weighted_score
        week_to_scores[week]['repeats_sum'] += repeats

    # Calculate weighted average for each week (slot i holds week i + 1).
    weekly_averages = [0] * 53
    for week, data in week_to_scores.items():
        if not 1 <= week <= 53:
            # A week of 0 would otherwise index slot -1 and silently overwrite
            # week 53; larger values would raise IndexError.
            print(f"Skipping out-of-range week {week} for year {year}")
            continue
        weekly_averages[week - 1] = (data['weighted_sum'] / data['repeats_sum'] if data['repeats_sum'] > 0 else 0.0)
    weekly_weighted_avg_positivity_by_year[year] = weekly_averages

# Save the results to a file. Set MUSIC_MOOD_MIRROR_OUTPUT_FILE to write
# elsewhere; the original author's local path remains the default.
output_file = os.environ.get(
    "MUSIC_MOOD_MIRROR_OUTPUT_FILE",
    r'C:\Users\sarah\Documents\Github\music-mood-mirror\data\output_data\weekly_weighted_avg_positivity_by_year.json',
)
output_dir = os.path.dirname(output_file)
if output_dir:
    # Create the destination folder on first run instead of failing in open().
    os.makedirs(output_dir, exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(weekly_weighted_avg_positivity_by_year, f, ensure_ascii=False, indent=4)