In [20]:
# Import libraries
import importlib
import yearly_data_preprocessing
importlib.reload(yearly_data_preprocessing)
import ground_truth
# import lyrics_positivity_analyzer
import os
from huggingface_hub import hf_hub_download


In [21]:
def get_avg_positivity_score(tags: list) -> float:
    """
    Average the positivity ratings of the recognised emotion tags.

    Each known tag maps to an integer rating from 1 (least positive) to
    5 (most positive). Tags that are not in the mapping are ignored, so
    they affect neither the sum nor the count. Matching is exact and
    case-sensitive.

    Args:
        tags (list): Emotion tags, e.g. ['Joyful', 'Sad'].

    Returns:
        float: Mean rating of the recognised tags, or 0.0 when none of
        the tags is recognised (including an empty list).
    """
    rating_by_tag = {
        'Joyful': 5,
        'Melancholic': 2,
        'Hopeful': 5,
        'Angry': 1,
        'Romantic': 4,
        'Nostalgic': 3,
        'Sad': 1,
        'Energetic': 4,
        'Passionate': 4,
        'Lonely': 1,
        'Uplifting': 5,
        'Bittersweet': 3,
        'Empowering': 5,
        'Heartbroken': 1,
        'Reflective': 3,
        'Playful': 4,
        'Dark': 1,
        'Calm': 4,
        'Longing': 2,
        'Triumphant': 5,
    }

    # Single pass: accumulate the total and the count of recognised tags.
    rating_total = 0
    recognised_count = 0
    for tag in tags:
        rating = rating_by_tag.get(tag, 0)
        if rating > 0:
            rating_total += rating
            recognised_count += 1

    if not recognised_count:
        return 0.0
    return rating_total / recognised_count
    
def from_lyrics_to_positivity(lyrics: str) -> tuple[list[str], float]:
    """
    Ask the language model for emotion tags describing the lyrics and score them.

    The model is prompted to answer with three tags in the form
    `#tag1 #tag2 #tag3`. Only the first line of the answer is parsed; the
    tags are then scored with `get_avg_positivity_score`.

    Relies on a module-level callable `llm` that is not defined in this
    function. NOTE(review): the call signature and response shape used here
    look like a llama-cpp-python `Llama` completion call -- confirm.

    Args:
        lyrics (str): The lyrics to analyze.

    Returns:
        tuple[list[str], float]: The parsed tags (without the leading '#')
        and their average positivity score (0.0 when no tag is recognised).
    """
    prompt = """
        You are an expert in analyzing song lyrics to determine the emotions they convey.
        Analyze the following song lyrics and return exactly 3 emotion tags that best summarize the emotions conveyed by the song. Only output the tags, in this format: #tag1 #tag2 #tag3.
        The tags must be adjectives and strictly chosen from the following list: Joyful, Melancholic, Hopeful, Angry, Romantic, Nostalgic, Sad, Energetic, Passionate, Lonely, Uplifting, Bittersweet, Empowering, Heartbroken, Reflective, Playful, Dark, Calm, Longing, Triumphant
    '''{lyrics}'''
    """
    # Possible extension: also ask for per-tag weights summing to 100%.

    # `stop` is passed as a list (not a set) to match the documented
    # "string or list of strings" type of completion APIs.
    response = llm(prompt.format(lyrics=lyrics), temperature=0.0, max_tokens=512, stop=["\n\n\n"])

    # Only the first line of the completion is expected to hold the tags.
    first_line = response['choices'][0]['text'].strip().partition('\n')[0]

    # Drop the '#' marker and any separator punctuation the model may add
    # (e.g. "#Joyful, #Sad"), and skip tokens that end up empty.
    tags = []
    for token in first_line.split():
        tag = token.strip('#,.;:')
        if tag:
            tags.append(tag)

    return tags, get_avg_positivity_score(tags)

In [22]:
# Years to load: 2016 through 2025 inclusive.
years = list(range(2016, 2026))

# NOTE(review): machine-specific absolute path; the notebook only runs where this directory exists.
input_dir = r'C:\Users\sarah\Documents\Github\music-mood-mirror\data\input_data\processed_with_weeklyfreq'

# Result is looked up by year in later cells (via `.get(year, [])`).
all_processed_songs_by_year = yearly_data_preprocessing.batch_preprocess_yearly_data(input_dir, years)


Processed 795 songs from 2016.
Processed 4463 songs from 2017.
Processed 4833 songs from 2018.
Processed 7639 songs from 2019.
Processed 4876 songs from 2020.
Processed 5180 songs from 2021.
Processed 4941 songs from 2022.
Processed 5975 songs from 2023.
Processed 5813 songs from 2024.
Processed 2151 songs from 2025.


In [None]:
from collections import defaultdict
import heapq
import pickle


def _top_songs_per_week(songs, top_n=10):
    """
    Group songs by week and keep the most-repeated ones for each week.

    Args:
        songs: Iterable of song objects. Songs whose `week` attribute is
            missing or None are skipped.
        top_n (int): Maximum number of songs kept per week.

    Returns:
        dict: Maps each week to a list of at most `top_n` songs, sorted by
        descending `repeats_this_week` (a missing attribute counts as 0).
        Weeks appear in the order they are first seen in `songs`.
    """
    week_to_songs = defaultdict(list)
    for song in songs:
        week = getattr(song, "week", None)
        if week is not None:
            week_to_songs[week].append(song)

    return {
        week: heapq.nlargest(top_n, week_songs, key=lambda s: getattr(s, "repeats_this_week", 0))
        for week, week_songs in week_to_songs.items()
    }


# Build a dictionary to hold the weekly top 10 songs data sample by year
weekly_top10_songs_by_year = {}
for year in years:
    songs = all_processed_songs_by_year.get(year, [])
    print(len(songs), "songs found for the year ", year)
    if not songs:
        # Raise instead of calling exit(): in a notebook exit() ends the
        # session rather than reporting an ordinary, catchable error.
        raise ValueError(f"No songs found for the year {year}")

    # Keep only songs flagged as their first occurrence in the week
    # (songs without the flag are kept, since the default is True).
    songs = [song for song in songs if getattr(song, "first_occurrence_in_week", True)]
    print(len(songs), "songs with 1st weekly occurrence in", year)

    # Guard the sanity print: the filter above may leave no songs at all.
    if songs:
        print(getattr(songs[0], "repeats_this_week", 0), "repeats_this_week for the first song in the year", year)

    # For each week, keep only top 10 songs by repeats_this_week
    filtered_songs = []
    for week, top_10 in _top_songs_per_week(songs).items():
        print(f"Week {week}: {[getattr(s, 'repeats_this_week', 0) for s in top_10]}")
        filtered_songs.extend(top_10)

    weekly_top10_songs_by_year[year] = filtered_songs
    print(len(filtered_songs), "songs with 1st weekly occurrence in", year, "and top 10 by repeats_this_week")
print("Successfully sampled",len(weekly_top10_songs_by_year), "years with weekly top 10 songs")

# Save the sampled songs (per-year lists of weekly top-10 songs) to a pickle file
with open("processed_sample_2016_2025.pkl", "wb") as f:
    pickle.dump(weekly_top10_songs_by_year, f)

795 songs found for the year  2016
459 songs with 1st weekly occurrence in 2016
5 repeats_this_week for the first song in the year 2016
Week 20: [5, 3, 3, 3, 2, 2, 1, 1, 1, 1]
Week 21: [11, 10, 9, 9, 7, 6, 4, 3, 3, 2]
Week 22: [6, 5, 3, 2, 2, 2, 2, 2, 1, 1]
Week 23: [5, 4, 3, 2, 2, 2, 2, 2, 1, 1]
Week 24: [1, 1, 1, 1, 1]
Week 48: [21, 19, 15, 10, 7, 6, 3, 3, 3, 3]
Week 49: [10, 7, 6, 6, 5, 5, 5, 4, 4, 3]
Week 50: [7, 6, 4, 4, 3, 3, 2, 2, 2, 1]
Week 51: [16, 8, 7, 5, 4, 4, 4, 4, 4, 3]
Week 52: [4, 2, 2, 2, 2, 2, 2, 2, 1, 1]
95 songs with 1st weekly occurrence in 2016 and top 10 by repeats_this_week
4463 songs found for the year  2017
2185 songs with 1st weekly occurrence in 2017
1 repeats_this_week for the first song in the year 2017
Week 52: [4, 4, 3, 2, 2, 2, 2, 1, 1, 1]
Week 1: [9, 5, 5, 4, 4, 3, 3, 3, 3, 3]
Week 2: [26, 16, 16, 12, 11, 11, 10, 8, 7, 7]
Week 3: [22, 11, 9, 7, 6, 6, 5, 4, 4, 3]
Week 4: [5, 5, 5, 4, 4, 4, 4, 3, 3, 3]
Week 5: [11, 7, 6, 6, 5, 5, 4, 3, 3, 3]
Week 6: [5, 

In [None]:

# test by year
from collections import defaultdict
import heapq

# Single-year run of the weekly top-10 sampling, useful for inspecting one year.
year = 2023
songs = all_processed_songs_by_year.get(year, [])
print(len(songs), "songs found for the year ", year)
if not songs:
    # Raise instead of calling exit(): in a notebook exit() ends the
    # session rather than reporting an ordinary, catchable error.
    raise ValueError(f"No songs found for the year {year}")

# Keep only songs flagged as their first occurrence in the week
# (songs without the flag are kept, since the default is True).
songs = [song for song in songs if getattr(song, "first_occurrence_in_week", True)]
print(len(songs), "songs with 1st weekly occurrence in ", year)

# Guard the sanity print: the filter above may leave no songs at all.
if songs:
    print(getattr(songs[0], "repeats_this_week", 0), "repeats_this_week for the first song in the year", year)

# Group songs by week number; songs without a week are skipped.
week_to_songs = defaultdict(list)
for song in songs:
    week = getattr(song, "week", None)
    if week is not None:
        week_to_songs[week].append(song)

# For each week, keep only top 10 songs by repeats_this_week
# (a missing repeats_this_week counts as 0, both for ranking and printing).
filtered_songs = []
for week, week_songs in week_to_songs.items():
    top_10 = heapq.nlargest(10, week_songs, key=lambda s: getattr(s, "repeats_this_week", 0))
    print(f"Week {week}: {[getattr(s, 'repeats_this_week', 0) for s in top_10]}")
    filtered_songs.extend(top_10)

# `songs` now holds only the sampled weekly top-10 songs for `year`.
songs = filtered_songs
print(len(songs), "songs with 1st weekly occurrence in ", year, "and top 10 by repeats_this_week")

5975 songs found for the year  2023
3332 songs with 1st weekly occurrence in  2023
1 repeats_this_week for the first song in the year 2023
Week 52: [2, 2, 2, 2, 2, 2, 1, 1, 1, 1]
Week 1: [3, 2, 2, 2, 2, 1, 1, 1, 1, 1]
Week 2: [3, 2, 2, 2, 2, 1, 1, 1, 1, 1]
Week 3: [4, 3, 3, 2, 2, 2, 2, 2, 2, 2]
Week 4: [3, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Week 5: [9, 7, 4, 3, 3, 2, 2, 2, 2, 2]
Week 6: [5, 4, 3, 1, 1, 1, 1, 1, 1, 1]
Week 7: [12, 12, 8, 8, 7, 6, 5, 5, 4, 4]
Week 8: [5, 5, 4, 4, 4, 4, 4, 3, 3, 3]
Week 9: [7, 6, 6, 4, 4, 3, 3, 3, 3, 3]
Week 10: [11, 11, 9, 9, 7, 6, 6, 6, 5, 5]
Week 11: [9, 6, 4, 3, 3, 3, 3, 3, 3, 3]
Week 12: [29, 22, 12, 10, 5, 4, 4, 4, 4, 3]
Week 13: [12, 8, 7, 5, 5, 4, 4, 4, 3, 3]
Week 14: [24, 6, 5, 3, 3, 3, 3, 2, 2, 2]
Week 15: [8, 7, 4, 3, 3, 3, 3, 1, 1, 1]
Week 16: [21, 2, 2, 1, 1, 1, 1, 1, 1, 1]
Week 17: [14, 3, 3, 3, 3, 3, 3, 2, 2, 2]
Week 18: [9, 8, 8, 6, 6, 5, 5, 4, 4, 4]
Week 19: [14, 13, 8, 6, 6, 5, 5, 5, 4, 4]
Week 20: [7, 6, 5, 4, 3, 3, 3, 2, 2, 2]
Week 21: [19, 13

[SongEntry(album='Эстетика Грустных Людей', artist='Mirèle', duration=173.0, first_occurrence_in_week=True, lyrics='Делаю из чувств искусство\nВсё, что мне от тебя нужно\nДрама, слёзы, всё сняла\nТолько я не поняла\nКак нечаянно\nМы прятали любовь\nОбъясни мне почему\nПочему, почему мы\n\nНа землю падали\nСмотрели только в небо\nНу, а как ещё терпеть\nТакого не было и вот опять\nНаверно нужно\nПросыпаться скорей\nЭстетика грустных людей\n\nНа землю падали\nСмотрели только в небо\nНу, а как ещё терпеть\nТакого не было и вот опять\nНаверно нужно\nПросыпаться скорей\nЭстетика грустных людей\n\nНа землю падали\nСмотрели только в небо\nНу, а как ещё терпеть\nТакого не было и вот опять\nНаверно нужно\nПросыпаться скорей\nЭстетика грустных людей\n\nВыбираю пострадать\nЧтобы больше было музе\nВдохновения писать\nВедь любовь и боль в союзе\nТак неправильно\nРомантики здесь нет\nПовторяется мой сюжет\nСнова я, снова ты\nПочему мы\n\nНа землю падали\nСмотрели только в небо\nНу, а как ещё терпеть\

In [None]:
from collections import defaultdict
import json

# For every year, a 53-slot list where slot (week - 1) holds that week's
# repeat-weighted average positivity; weeks without songs stay at 0.
weekly_weighted_avg_positivity_by_year = {}

for year in years:
    # Running totals per week: summed weighted scores and summed repeats.
    week_to_scores = defaultdict(lambda: {'weighted_sum': 0.0, 'repeats_sum': 0})
    for song in all_processed_songs_by_year.get(year, []):
        week = getattr(song, "week", None)
        if week is not None:
            totals = week_to_scores[week]
            # NOTE(review): a song without `positivity_score_local_wghted`
            # adds 0.0 here while its repeats (default 1) still count in the
            # denominator below -- confirm this is intended.
            totals['weighted_sum'] += getattr(song, "positivity_score_local_wghted", 0.0)
            totals['repeats_sum'] += getattr(song, "repeats_this_week", 1)

    # Weighted average per week = summed weighted scores / summed repeats.
    weekly_averages = [0] * 53
    for week, totals in week_to_scores.items():
        # NOTE(review): assumes `week` is an integer in 1..53 -- TODO confirm.
        if totals['repeats_sum'] > 0:
            weekly_averages[week - 1] = totals['weighted_sum'] / totals['repeats_sum']
        else:
            weekly_averages[week - 1] = 0.0
    weekly_weighted_avg_positivity_by_year[year] = weekly_averages

# Save the results to a file
# NOTE(review): machine-specific absolute path; the directory must already exist.
output_file = r'C:\Users\sarah\Documents\Github\music-mood-mirror\data\output_data\weekly_weighted_avg_positivity_by_year.json'
with open(output_file, 'w', encoding='utf-8') as out_fh:
    json.dump(weekly_weighted_avg_positivity_by_year, out_fh, ensure_ascii=False, indent=4)