Step 6: This step, ‘6_prediction_custom_unigram_models’, uses pre-trained per-genre topic models and a background model to classify song lyrics by genre. The lyrics are first tokenized and preprocessed. Then, for each genre, the log-probabilities of the words are summed, using the genre's topic model and falling back to the background model for words the topic model does not contain. Words that appear in neither model receive a small fallback probability. The genre with the highest total log-probability is selected as the predicted genre. Additionally, the likelihood scores for all genres are normalized and converted into percentages for interpretability. The notebook also includes a batch-processing step that predicts genres for the entire dataset. Finally, the predictions are evaluated against the true genres in the dataset, yielding an overall accuracy score that measures the model's performance.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import nltk
from nltk.corpus import stopwords, words
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import re
from nltk.stem import PorterStemmer
import json


In [2]:
# Load the cleaned lyrics dataset. Later cells read its 'lyrics' and 'tag' columns.
df = pd.read_csv('../data/3_ds3_cleaned.csv')

# Load the pre-trained unigram models.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.

# background_model: word -> probability lookup, used as a fallback for words
# that a genre's topic model does not contain.
with open('../data/models/background_model.json', encoding='utf-8') as f:
    background_model = json.load(f)

# topic_models: genre -> {word -> probability}.
with open('../data/models/topic_models.json', encoding='utf-8') as f:  # Change this to topic_models_nostopwords.json if needed
    topic_models = json.load(f)

In [3]:
# Make sure the NLTK corpora needed for token filtering are present locally.
for corpus_name in ('words', 'stopwords'):
    nltk.download(corpus_name)

# Lookup sets built once up front: the NLTK English word list and the
# NLTK English stopword list.
valid_words = {*words.words()}
stop_words = {*stopwords.words('english')}

# Mangle the words coming in
def tokenize(text, remove_stopwords=False, filter_non_words=True):
    """Split raw lyrics into lowercase alphabetic tokens.

    Args:
        text: Raw lyrics. Anything that is not a ``str`` (for example ``None``
            or a NaN coming from a missing DataFrame cell) is treated as
            empty lyrics instead of raising.
        remove_stopwords: If True, drop tokens found in the module-level
            ``stop_words`` set.
        filter_non_words: If True, keep only tokens found in the module-level
            ``valid_words`` set.

    Returns:
        list[str]: Tokens in order of appearance, duplicates preserved.
    """
    if not isinstance(text, str):
        return []

    # A token is a run of at least two ASCII letters; digits, punctuation and
    # single-letter words are discarded.
    tokens = re.findall(r'\b[a-zA-Z]{2,}\b', text.lower())
    if filter_non_words:
        tokens = [token for token in tokens if token in valid_words]
    if remove_stopwords:
        # Use the stopword set built once at module level instead of
        # rebuilding it on every call.
        tokens = [token for token in tokens if token not in stop_words]
    return tokens


def predict_genre(new_lyrics, remove_stopwords=False, filter_non_words=True, unseen_prob=1e-9):
    """Classify lyrics as the genre whose unigram model scores them highest.

    Each token's probability is taken from the genre's topic model. If the
    token is missing there, the shared background model is used, and if it is
    missing there too, ``unseen_prob`` is used. A genre's score is the sum of
    the natural logs of these probabilities over all tokens.

    Args:
        new_lyrics: Raw lyrics text, passed to ``tokenize``.
        remove_stopwords: Forwarded to ``tokenize``.
        filter_non_words: Forwarded to ``tokenize``.
        unseen_prob: Probability assigned to a token found in neither the
            genre's topic model nor the background model.

    Returns:
        tuple: ``(predicted_genre, scores)`` where ``scores`` maps every genre
        in ``topic_models`` to its summed log-probability.

    Note:
        If tokenization yields no tokens, every score is 0.0 and the first
        genre in ``topic_models`` wins the tie.
    """
    tokens = tokenize(new_lyrics, remove_stopwords=remove_stopwords, filter_non_words=filter_non_words)

    scores = {}
    for genre, word_probs in topic_models.items():
        log_prob = 0.0
        for token in tokens:
            if token in word_probs:
                word_prob = word_probs[token]
            else:
                # Consult the background model only when the genre model has
                # no entry for this token.
                word_prob = background_model.get(token, unseen_prob)
            log_prob += np.log(word_prob)
        scores[genre] = log_prob

    # Highest summed log-probability wins.
    predicted_genre = max(scores, key=scores.get)
    return predicted_genre, scores

[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
test_songs = ["On warm summers evening, on a train bound for nowhere"]

predictions = []
for lyrics in test_songs:
    predicted_genre, likelihoods = predict_genre(lyrics)

    # Convert log-likelihoods to percentages.
    # The scores are sums of log-probabilities, so for longer lyrics they are
    # large negative numbers and np.exp() on them underflows to 0.0 for every
    # genre, making the normalization 0/0. Subtracting the largest score
    # first leaves the ratios unchanged and makes the top genre's term
    # exactly exp(0) = 1, so the denominator is never zero.
    max_log_likelihood = max(likelihoods.values())
    exp_likelihoods = {
        genre: np.exp(log_likelihood - max_log_likelihood)
        for genre, log_likelihood in likelihoods.items()
    }
    total_exp = sum(exp_likelihoods.values())
    normalized_likelihoods = {genre: (prob / total_exp) * 100 for genre, prob in exp_likelihoods.items()}  # Convert to percentages

    predictions.append((lyrics, predicted_genre, normalized_likelihoods))

# Display Results
for lyrics, predicted_genre, likelihoods in predictions:
    print(f"Test Lyrics: {lyrics}")
    print(f"Predicted Genre: {predicted_genre}")
    print("Likelihoods as Percentages:")
    for genre, percent in likelihoods.items():
        print(f"  {genre}: {percent:.2f}%")
    print("\n")



Test Lyrics: On warm summers evening, on a train bound for nowhere
Predicted Genre: country
Likelihoods as Percentages:
  rock: 7.56%
  rb: 0.51%
  rap: 0.06%
  misc: 0.61%
  pop: 3.78%
  country: 87.49%




In [5]:
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

# Register `progress_apply` on pandas objects so the batch prediction below
# reports its progress.
tqdm.pandas()


def _top_genre(lyrics):
    """Return only the predicted genre label for one song's lyrics."""
    return predict_genre(lyrics)[0]


# Predict a genre for every song in the dataset.
df['predicted_genre'] = df['lyrics'].progress_apply(_top_genre)

# Compare the predicted genres against the labels in the 'tag' column.
accuracy = accuracy_score(df['tag'], df['predicted_genre'])

print(f"Accuracy score: {accuracy}")


  0%|          | 0/67606 [00:00<?, ?it/s]

Accuracy score: 0.5842380853770376
