In [1]:
import numpy as np
import pandas as pd
import json

In [2]:

# Load the per-genre topic models and the background model from their JSON files.
# Each file is parsed straight from the open handle with json.load, and is opened
# explicitly as UTF-8 so the result does not depend on the platform's default
# locale encoding.
# NOTE(review): downstream code treats topic_models as {genre: {word: prob}} and
# background_models as {word: prob} — confirm against the files on disk.

with open('../data/models/topic_models.json', 'r', encoding='utf-8') as file:
    topic_models = json.load(file)

with open('../data/models/background_model.json', 'r', encoding='utf-8') as file:
    background_models = json.load(file)

In [3]:
# Equal-weight assumption: every genre topic and the single background model
# get the same share, hence the "+ 1" in the denominator.
num_genres = len(topic_models)
weight = 1 / (num_genres + 1)
print(f"Weight :{weight}")

Weight :0.14285714285714285


In [4]:
def calculate_likelihood(lyrics, topics, background, topic_weights, background_weight,
                         smoothing=1e-9, verbose=True):
    """
    Compute the log-likelihood of the lyrics under each genre's mixture model.

    For every genre, the probability of a word is modelled as
    ``topic_weight * P(word | genre) + background_weight * P(word | background)``.
    The natural log of that mixture is summed over all words, each term
    multiplied by the word's count.

    Args:
    lyrics: dict, {word: count}, bag-of-words representation of the song lyrics
    topics: dict, {genre: {word: prob}}, unigram distributions for each genre/topic
    background: dict, {word: prob}, unigram distribution for the background model
    topic_weights: float or dict. A single float is applied to every genre;
        a dict {genre: float} supplies a separate weight per genre.
    background_weight: float, weight for the background model
    smoothing: float, probability substituted for a word that is missing from
        a distribution, so the log never sees an exact zero from a missing word
    verbose: bool, when True (the default) print each genre name as it is scored

    Returns:
    dict: {genre: log-likelihood}, log-likelihood of the lyrics under each genre.
        Values are negative for proper probabilities; the largest value marks
        the best-fitting genre.

    Raises:
    KeyError: if topic_weights is a dict that has no entry for one of the genres
    """
    genre_likelihoods = {}

    # The original code multiplied topic_weights directly, which only works for
    # a scalar; the documented {genre: float} form is now handled as well.
    per_genre_weights = isinstance(topic_weights, dict)

    for genre, topic_dist in topics.items():
        if verbose:
            print(genre)

        topic_weight = topic_weights[genre] if per_genre_weights else topic_weights

        likelihood = 0.0
        for word, count in lyrics.items():
            # Fall back to the smoothing value for unseen words
            topic_prob = topic_dist.get(word, smoothing)
            background_prob = background.get(word, smoothing)

            # Weighted mixture of the genre topic and the background model
            mix_prob = topic_weight * topic_prob + background_weight * background_prob
            likelihood += count * np.log(mix_prob)

        # Store the log-likelihood for this genre
        genre_likelihoods[genre] = likelihood

    return genre_likelihoods

In [8]:
import re
def tokenize(lyrics):
    """
    Split raw lyrics text into lower-cased word tokens.

    Args:
    lyrics: str, the raw lyrics text

    Returns:
    list: every run of two or more ASCII letters that sits between word
    boundaries in the lower-cased text, in order of appearance
    """
    lowered = lyrics.lower()
    word_pattern = re.compile(r'\b[a-zA-Z]{2,}\b')
    return word_pattern.findall(lowered)

In [9]:
def list_word_count(word_list):
    """
    Tally how many times each word occurs in a list (hand-rolled counter).

    Args:
    word_list: list, the words (strings) to count

    Returns:
    dict: {word: count}, keyed by word in order of first appearance, with the
    number of occurrences as the value
    """
    tally = {}
    for token in word_list:
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1
    return tally


In [10]:
df = pd.read_csv('../data/3_ds3_cleaned.csv')

# Hard-coded sample text used for a quick check; to score a real song, take
# the text from the dataset instead, e.g. df.iloc[0]['lyrics'].
sample_text = ' love love love  love love'

# Tokenize the text, then collapse the tokens into a {word: count} dict
sample_tokens = tokenize(sample_text)
lyrics = list_word_count(sample_tokens)
print(lyrics)

{'love': 5}


In [11]:
# priors = {}
# lyrics = list_word_count(tokenize(lyrics_text))

# Score the bag-of-words lyrics against every genre. The topic component and
# the background component are both given the same equal `weight`.
genre_likelihoods = calculate_likelihood(
    lyrics,
    topic_models,
    background_models,
    topic_weights=weight,
    background_weight=weight,
)
print(genre_likelihoods)

# The prediction is the genre with the highest log-likelihood
predicted_genre = max(genre_likelihoods.items(), key=lambda item: item[1])[0]

print(f"Predicted Genre: {predicted_genre}")
# print("Posterior Probabilities:", posteriors)

rock
rb
rap
misc
pop
country
{'rock': np.float64(-33.74837573914691), 'rb': np.float64(-30.934862861372707), 'rap': np.float64(-34.601047508307424), 'misc': np.float64(-35.93871064318955), 'pop': np.float64(-32.07165682461102), 'country': np.float64(-32.24920193918251)}
Predicted Genre: rb
