In [10]:
# Import libraries
import importlib
import yearly_data_preprocessing
importlib.reload(yearly_data_preprocessing)
import os
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

In [None]:
# Show current working directory in AI Studio
!pwd

/home/jovyan/local


In [None]:
from huggingface_hub import hf_hub_download

# Download the GGUF file from hugging face to the current directory if you don't already have this model
hf_hub_download(
    repo_id="codegood/gemma-2b-it-Q4_K_M-GGUF",
    filename="gemma-2b-it.Q4_K_M.gguf",
    local_dir=".",
)

In [None]:
# Initialize the Llama language model with the specified GGUF model file.
# The model is loaded from the local directory where the GGUF file is stored.
llm = Llama(
    model_path="./gemma-2b-it.Q4_K_M.gguf",
    n_gpu_layers=35,    # n_gpu_layers specifies how many layers to run on the GPU (set to 35 for faster inference if GPU is available).
    n_ctx=8192,         # n_ctx sets the maximum number of tokens (prompt + response) the model can handle in one inference.
    # use_mlock=True,   # optional: prevent swap
    verbose=False,      # verbose=False suppresses detailed logging output.
)

# ---------------- Context window size ----------------
# 'gemma.context_length' = Max tokens the model supports (prompt + output)
# This is the maximum number of tokens the model can process at once.
# n_ctx (in llama-cpp)	The number of tokens you allow during inference. It controls maximum prompt + response tokens
print("Context window size (n_ctx):", llm.n_ctx())

Context window size (n_ctx): 8192


In [None]:
# ---------------- Function to analyze lyrics ----------------
def get_avg_positivity_score(tags: list) -> float:
    """
    Calculate the average positivity score based on the provided tags.
    
    Args:
        tags (list): A list of tags representing emotions.
        
    Returns:
        float: The average positivity score.
    """
    positivity_dict = {
    'Joyful': 5, 'Melancholic': 2, 'Hopeful': 5, 'Angry': 1, 'Romantic': 4,
    'Nostalgic': 3, 'Sad': 1, 'Energetic': 4, 'Passionate': 4, 'Lonely': 1,
    'Uplifting': 5, 'Bittersweet': 3, 'Empowering': 5, 'Heartbroken': 1,
    'Reflective': 3, 'Playful': 4, 'Dark': 1, 'Calm': 4, 'Longing': 2, 'Triumphant': 5
    }
    scores = [positivity_dict.get(tag, 0) for tag in tags]
    valid_scores = [score for score in scores if score > 0]
    
    if valid_scores:
        return sum(valid_scores) / len(valid_scores)
    else:
        return 0.0
    
from typing import List, Tuple
# Function to analyze lyrics and return tags and average positivity score
# The function takes a string of lyrics as input and returns a tuple containing a list of emotion tags and the average positivity score.
def from_lyrics_to_positivity(lyrics: str) -> Tuple[List[str], float]:
    """
    Analyze the lyrics and return the average positivity score.
    
    Args:
        lyrics (str): The lyrics to analyze.
        
    Returns:
        float: The average positivity score of the lyrics.
    """
    prompt = """
        You are an expert in analyzing song lyrics to determine the emotions they convey.
        Analyze the following song lyrics and return exactly 3 emotion tags that best summarize the emotions conveyed by the song. Only output the tags, in this format: #tag1 #tag2 #tag3.
        The tags must be adjectives and strictly chosen from the following list: Joyful, Melancholic, Hopeful, Angry, Romantic, Nostalgic, Sad, Energetic, Passionate, Lonely, Uplifting, Bittersweet, Empowering, Heartbroken, Reflective, Playful, Dark, Calm, Longing, Triumphant
    '''{lyrics}'''
    """
    # along with weights assigning to each each tag that summing up to 100%
    response = llm(prompt.format(lyrics=lyrics), temperature=0.0, max_tokens=512, stop={"\n\n\n"})
    mood = response['choices'][0]['text'].strip().split('\n')
    # print(response, "\n************\n")
    # print(mood[0])
    tags_str = mood[0] if mood else ''
    tags = [tag.lstrip('#') for tag in tags_str.strip().split()]
    
    return tags, get_avg_positivity_score(tags)

In [None]:
# ---------------- Process yearly data ----------------
# Define the years for which you want to process the data
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
# Process the yearly data using the batch_preprocess_yearly_data function
# This function will preprocess the data for each year, extracting lyrics and calculating the average positivity score.
all_processed_songs_by_year = yearly_data_preprocessing.batch_preprocess_yearly_data('./processed_with_weeklyfreq', years)


Processed 55 songs from 2015.
Processed 795 songs from 2016.
Processed 4463 songs from 2017.
Processed 4833 songs from 2018.
Processed 7639 songs from 2019.
Processed 4876 songs from 2020.
Processed 5180 songs from 2021.
Processed 4941 songs from 2022.
Processed 5975 songs from 2023.
Processed 5813 songs from 2024.
Processed 2151 songs from 2025.


In [None]:
# ---------------- Sample weekly top 10 songs by year ----------------
# This section samples the top 10 songs for each week in each year based on the number of repeats in that week.
from collections import defaultdict
import heapq
import pickle

# Build a dictionary to hold the weekly top 10 songs data sample by year
weekly_top10_songs_by_year={}
for year in years:
    songs = all_processed_songs_by_year.get(year, [])
    print(len(songs), "songs found for the year ", year)
    if not songs:
        print("No songs found for the year", year)
        exit(1)
    # Filter songs to only those where first_occurrence_in_week is True
    songs = [song for song in songs if getattr(song, "first_occurrence_in_week", True)]
    print(len(songs), "songs with 1st weekly occurrence in", year)

    print(songs[0].repeats_this_week, "repeats_this_week for the first song in the year", year)
    # Group songs by week number

    week_to_songs = defaultdict(list)
    for song in songs:
        week = getattr(song, "week", None)
        if week is not None:
            week_to_songs[week].append(song)

    # For each week, keep only top 10 songs by repeats_this_week
    filtered_songs = []
    for week, week_songs in week_to_songs.items():
        top_10 = heapq.nlargest(10, week_songs, key=lambda s: getattr(s, "repeats_this_week", 0))
        print(f"Week {week}: {[s.repeats_this_week for s in top_10]}")
        filtered_songs.extend(top_10)
    
    weekly_top10_songs_by_year[year] = filtered_songs
    print(len(filtered_songs), "songs with 1st weekly occurrence in", year, "and top 10 by repeats_this_week")
print("Successfully sampled",len(weekly_top10_songs_by_year), "years with weekly top 10 songs")
# Save the sampled songs with their mood tags and positivity scores to a pickle file
with open("processed_sample_2016_2025_weekly_top10.pkl", "wb") as f:
    pickle.dump(weekly_top10_songs_by_year, f)

55 songs found for the year  2015
18 songs with 1st weekly occurrence in 2015
1 repeats_this_week for the first song in the year 2015
Week 48: [1, 1, 1]
Week 51: [1]
Week 52: [21, 17, 2, 1, 1, 1, 1, 1, 1, 1]
Week 53: [1, 1, 1, 1]
18 songs with 1st weekly occurrence in 2015 and top 10 by repeats_this_week
795 songs found for the year  2016
459 songs with 1st weekly occurrence in 2016
5 repeats_this_week for the first song in the year 2016
Week 20: [5, 3, 3, 3, 2, 2, 1, 1, 1, 1]
Week 21: [11, 10, 9, 9, 7, 6, 4, 3, 3, 2]
Week 22: [6, 5, 3, 2, 2, 2, 2, 2, 1, 1]
Week 23: [5, 4, 3, 2, 2, 2, 2, 2, 1, 1]
Week 24: [1, 1, 1, 1, 1]
Week 48: [21, 19, 15, 10, 7, 6, 3, 3, 3, 3]
Week 49: [10, 7, 6, 6, 5, 5, 5, 4, 4, 3]
Week 50: [7, 6, 4, 4, 3, 3, 2, 2, 2, 1]
Week 51: [16, 8, 7, 5, 4, 4, 4, 4, 4, 3]
Week 52: [4, 2, 2, 2, 2, 2, 2, 2, 1, 1]
95 songs with 1st weekly occurrence in 2016 and top 10 by repeats_this_week
4463 songs found for the year  2017
2185 songs with 1st weekly occurrence in 2017
1 repea

In [None]:
# ---------------- Calculate weekly weighted average positivity by year using local LLM model ----------------
# This section calculates the weekly weighted average positivity score for each year based on the sampled top 10 songs.
import json
years=[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
weekly_weighted_avg_positivity_by_year = {}
for year in years:
    songs = weekly_top10_songs_by_year.get(year, [])
    if not songs:
        print("No songs found for the year", year)
        exit(1)
        
    # Group songs by week number
    week_to_scores = defaultdict(lambda: {'weighted_sum': 0.0, 'repeats_sum': 0})
    for song in songs:
        song.mood_tags_local, song.positivity_score_local = from_lyrics_to_positivity(song.lyrics)
        song.positivity_score_local_wghted = song.positivity_score_local * song.repeats_this_week
        week = getattr(song, "week", None)
        if week is None:
            continue
        weighted_score = getattr(song, "positivity_score_local_wghted", 0.0)
        repeats = getattr(song, "repeats_this_week", 1)
        week_to_scores[week]['weighted_sum'] += weighted_score
        week_to_scores[week]['repeats_sum'] += repeats
    # Calculate weighted average for each week
    tmp=[0]*53  # Initialize a list for 53 weeks
    for week, data in week_to_scores.items():
        tmp[week - 1] = (data['weighted_sum'] / data['repeats_sum'] if data['repeats_sum'] > 0 else 0.0)
    weekly_weighted_avg_positivity_by_year[year] = tmp
    print("Succesfully processed year: ",year)
print("Successfully calculated", len(weekly_weighted_avg_positivity_by_year),"years of data")
# Save the results to a file, this will be a JSON file containing the weekly weighted average positivity scores for each year,
# with each week represented as a list of scores. This file can be used for further analysis or visualization in the web application.
output_file = './weekly_weighted_avg_positivity_by_year.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(weekly_weighted_avg_positivity_by_year, f, ensure_ascii=False, indent=4)

Succesfully processed year:  2016
Succesfully processed year:  2017
Succesfully processed year:  2018
Succesfully processed year:  2019
Succesfully processed year:  2020
Succesfully processed year:  2021
Succesfully processed year:  2022
Succesfully processed year:  2023
Succesfully processed year:  2024
Succesfully processed year:  2025
Successfully calculated 10 years of data


In [None]:
# Print the results
weekly_weighted_avg_positivity_by_year

{2016: [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2.727272727272727,
  3.3020833333333335,
  3.3205128205128203,
  3.4583333333333335,
  3.4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2.2018518518518517,
  1.3878787878787877,
  2.441176470588235,
  1.7768361581920906,
  2.583333333333333,
  0],
 2017: [2.7142857142857144,
  3.2459677419354835,
  1.6536796536796536,
  1.075,
  3.3050314465408803,
  3.736842105263158,
  2.9074074074074074,
  2.1785714285714284,
  1.7424242424242424,
  2.388059701492537,
  2.5305164319248825,
  2.3833333333333337,
  2.806060606060606,
  2.3214285714285716,
  2.8488372093023258,
  3.467391304347826,
  3.8555555555555547,
  0.8647342995169082,
  3.3425076452599387,
  2.4,
  2.0925925925925926,
  2.8823529411764706,
  3.348958333333333,
  0.7058823529411765,
  2.2298850574712645,
  3.0714285714285707,
  4.345238095238096,
  4.141414

In [None]:
# ---------------- Sample mood tags for a specific year ----------------
# This section retrieves the mood tags for a specific year (2025 in this case) from the sampled weekly top 10 songs.
weekly_top10_songs_by_year[2025][6].mood_tags_local

['Hopeful', 'Uplifting', 'Resilient']

In [None]:
# ---------------- Save the processed data with mood tags and positivity scores ----------------
# Save the processed weekly top 10 songs with their mood tags and positivity scores to a pickle file. 
# This file can be used for further analysis or model training.
with open("processed_sample_2016_2025_weekly_top10_local_assigned.pkl", "wb") as f:
    pickle.dump(weekly_top10_songs_by_year, f)