In [20]:
import json
import re
from collections import Counter, defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm


In [21]:
# Load the pre-computed word-probability models stored as JSON.
# `background_model` is used later as a word -> probability fallback;
# `topic_models` is iterated as genre -> {word: probability}.
# The encoding is pinned to UTF-8 (the encoding JSON is specified to use)
# so the read does not depend on the platform's locale default.
with open('../data/models/background_model.json', encoding='utf-8') as f:
    background_model = json.load(f)

with open('../data/models/topic_models.json', encoding='utf-8') as f:  # Change this to topic_models_nostopwords.json if needed
    topic_models = json.load(f)

In [22]:
# Fetch the two NLTK corpora this notebook reads from, in a fixed order.
for corpus_name in ('words', 'stopwords'):
    nltk.download(corpus_name)

# Materialise both corpora as sets so membership checks are hash lookups.
valid_words = set(words.words())  # vocabulary that tokenize() filters against
stop_words = set(stopwords.words('english'))  # English stopword list

# Mangle the words coming in
def tokenize(text, remove_stopwords=False, filter_non_words=True):
    """Split text into lowercase alphabetic tokens, with optional filtering.

    Args:
        text: Raw text (e.g. song lyrics) to tokenize.
        remove_stopwords: If True, drop tokens that are in the module-level
            ``stop_words`` set.
        filter_non_words: If True, keep only tokens that are in the
            module-level ``valid_words`` set.

    Returns:
        list[str]: Tokens in their original order, duplicates preserved.
    """
    # A token is a run of two or more ASCII letters delimited by word
    # boundaries. Single letters are dropped, contractions are split at the
    # apostrophe ("don't" -> "don"), and letter runs glued to digits or
    # underscores ("4ever") do not match at all.
    tokens = re.findall(r'\b[a-zA-Z]{2,}\b', text.lower())
    if filter_non_words:
        tokens = [token for token in tokens if token in valid_words]
    if remove_stopwords:
        # Reuse the set built once at module level rather than re-reading
        # the NLTK stopword corpus and rebuilding the set on every call.
        tokens = [token for token in tokens if token not in stop_words]
    return tokens


def predict_genre(new_lyrics, remove_stopwords=False, filter_non_words=True, unseen_prob=1e-9):
    """Score lyrics against every genre model and return the best match.

    Each genre's score is the sum of log-probabilities of the lyric tokens.
    A token's probability comes from the genre's entry in ``topic_models``;
    if the genre has no entry for it, the ``background_model`` probability
    is used; if that is missing too, ``unseen_prob`` is used.

    Args:
        new_lyrics: Raw lyrics text to classify.
        remove_stopwords: Forwarded to ``tokenize``.
        filter_non_words: Forwarded to ``tokenize``.
        unseen_prob: Probability assigned to a token found in neither the
            genre model nor the background model.

    Returns:
        tuple: ``(predicted_genre, scores)`` where ``scores`` maps each genre
        to its summed log-probability.

    Note:
        If tokenization yields no tokens, every score stays at 0.0 and the
        first genre in ``topic_models`` is returned, so the prediction is
        not meaningful in that case.
    """
    tokens = tokenize(new_lyrics, remove_stopwords=remove_stopwords, filter_non_words=filter_non_words)

    # The fallback probability depends only on the token, not on the genre,
    # so resolve it once per token instead of once per (genre, token) pair.
    fallback_probs = [background_model.get(token, unseen_prob) for token in tokens]

    scores = {}
    for genre, word_probs in topic_models.items():
        log_prob = 0.0
        # Accumulate in token order so the floating-point sum is unchanged.
        for token, fallback in zip(tokens, fallback_probs):
            log_prob += np.log(word_probs.get(token, fallback))
        scores[genre] = log_prob

    # Select the genre with the highest log-probability
    predicted_genre = max(scores, key=scores.get)
    return predicted_genre, scores

[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Hand-written (lyrics, expected genre) pairs for a quick sanity check.
# The expected labels are only printed next to the prediction; they are
# never compared programmatically.
test_songs = [
    ("guitar and freedom in the air, fighting the world", "Rock"),
    ("dance and love under the stars all night", "Pop"),
    ("spitting rhymes and beats, life on the streets", "Hip-hop"),
    ("I have seen the morning burning golden on the mountain in the skies Aching with the feeling of the freedom of an eagle when she flies Turning ", "country")
]

# One (lyrics, true genre, predicted genre, per-genre scores) row per song;
# predict_genre's two return values are unpacked into the row.
predictions = [
    (song_text, expected, *predict_genre(song_text))
    for song_text, expected in test_songs
]

# Display Results
for lyrics, true_genre, predicted_genre, likelihoods in predictions:
    print(
        f"Song: {lyrics}",
        f"True Genre: {true_genre}",
        f"Predicted Genre: {predicted_genre}",
        f"Log-Likelihoods: {likelihoods}",
        "\n",
        sep="\n",
    )


Song: guitar and freedom in the air, fighting the world
True Genre: Rock
Predicted Genre: rock
Log-Likelihoods: {'rock': np.float64(-54.7387339075022), 'rb': np.float64(-59.1924350853455), 'rap': np.float64(-59.61366399841251), 'misc': np.float64(-56.774743253423196), 'pop': np.float64(-55.96413841647568), 'country': np.float64(-54.74796360040183)}


Song: dance and love under the stars all night
True Genre: Pop
Predicted Genre: pop
Log-Likelihoods: {'rock': np.float64(-38.355537210402204), 'rb': np.float64(-38.565279137630064), 'rap': np.float64(-41.05787349061866), 'misc': np.float64(-41.500289990146854), 'pop': np.float64(-37.56322411660666), 'country': np.float64(-37.82187248328718)}


Song: spitting rhymes and beats, life on the streets
True Genre: Hip-hop
Predicted Genre: rap
Log-Likelihoods: {'rock': np.float64(-36.88738635172477), 'rb': np.float64(-38.65519664284065), 'rap': np.float64(-34.27214494842883), 'misc': np.float64(-39.28810351734895), 'pop': np.float64(-38.0073355698

In [None]:
# Evaluate the classifier over a labelled DataFrame.
# `tqdm` and `accuracy_score` are imported in the notebook's top import cell.
# NOTE(review): `df1` is not defined in any cell visible here; it presumably
# holds a `lyrics` text column and a `tag` genre-label column — confirm it is
# loaded before this cell so the notebook survives Restart & Run All.

# Enable tqdm for pandas apply
tqdm.pandas()

# Apply the prediction function with tqdm's progress bar,
# keeping only the predicted label (element 0 of predict_genre's result)
df1['predicted_genre'] = df1['lyrics'].progress_apply(lambda lyrics: predict_genre(lyrics)[0])

# Calculate the accuracy
accuracy = accuracy_score(df1['tag'], df1['predicted_genre'])

print(f"Accuracy score: {accuracy}")


  0%|          | 0/67606 [00:00<?, ?it/s]