In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from joblib import load

# Constants (configuration read by main() and predict_genre())
MODEL_PATH = "trained_topic_model.joblib"  # Path of the joblib-serialized trained model; replace with your model file
CHUNK_SIZE = 1000  # Number of CSV rows read and classified per chunk
INPUT_CSV = ""  # Path of the input CSV (needs a 'lyrics' column); empty placeholder, set before running
OUTPUT_CSV = "genre_predictions.csv"  # Output file with genre predictions
GENRES = ["rock", "pop", "rap", "rb", "country"]  # Genre labels; position i is the label used for topic index i

In [2]:
def predict_genre(model, text_data, vectorizer=None):
    """
    Predict the genre for each song lyric based on the trained model.

    The lyrics are turned into a bag-of-words matrix, each row is L1-normalized
    so that it sums to 1, and the whole matrix is passed to ``model.transform``
    in a single call. The column with the highest score in each row is used as
    an index into the module-level ``GENRES`` list.

    :param model: Trained topic model exposing ``transform(matrix)`` and
        returning one row of topic scores per document.
    :param text_data: Iterable of song lyrics (strings) to classify.
    :param vectorizer: Optional already-fitted vectorizer (ideally the one used
        when the model was trained). When given, only ``transform`` is called on
        it, so the feature columns follow its vocabulary. When omitted, a new
        ``CountVectorizer`` is fitted on ``text_data`` itself.
    :return: List of predicted genres, one per input document, in input order.
    :raises IndexError: If the winning topic index has no entry in ``GENRES``.
    """
    documents = list(text_data)
    if not documents:
        # Nothing to classify; fitting a vectorizer on no documents would fail.
        return []

    if vectorizer is None:
        # NOTE(review): a vocabulary fitted on this batch is specific to this
        # batch, so its columns will generally not line up with the features
        # the model was trained on. Pass the training-time vectorizer via
        # `vectorizer` whenever it is available.
        vectorizer = CountVectorizer()
        word_counts = vectorizer.fit_transform(documents)
    else:
        word_counts = vectorizer.transform(documents)

    word_probs = normalize(word_counts, norm='l1', axis=1)  # Normalize for probabilities

    # One batched call instead of one call per row; assumes the model returns
    # a 2-D (documents x topics) array, as scikit-learn transformers do.
    topic_probs = np.asarray(model.transform(word_probs))
    best_topics = np.argmax(topic_probs, axis=1)

    # Map the highest-scoring topic of each document to its genre label.
    return [GENRES[int(topic_index)] for topic_index in best_topics]

In [3]:
def process_chunks(input_csv, model, output_csv, chunk_size):
    """
    Process the CSV file in chunks to classify genres for each song lyric.

    Every chunk read from ``input_csv`` gets a ``predicted_genre`` column and is
    written to ``output_csv`` as CSV text. The first chunk creates (or
    overwrites) the output file and writes the header row; each later chunk is
    appended without a header. The row index produced by ``pd.read_csv`` is
    written as the first column.

    :param input_csv: Input CSV file with song lyrics; must have a 'lyrics' column.
    :param model: Trained topic model, passed through to ``predict_genre``.
    :param output_csv: Output CSV file with genre predictions.
    :param chunk_size: Number of rows to process per chunk.
    :raises ValueError: If a chunk does not contain a 'lyrics' column.
    """
    reader = pd.read_csv(input_csv, chunksize=chunk_size)
    for chunk_number, chunk in enumerate(reader):
        # Ensure necessary columns exist
        if 'lyrics' not in chunk.columns:
            raise ValueError("Input CSV must contain a 'lyrics' column.")

        # Predict genres for the chunk; a chunk without rows (e.g. a
        # header-only input file) has nothing to classify.
        if chunk.empty:
            predictions = []
        else:
            predictions = predict_genre(model, chunk['lyrics'].astype(str))
        labelled = chunk.assign(predicted_genre=predictions)

        # Write mode + header for the first chunk, header-less append after,
        # so the result is one well-formed CSV regardless of chunk count.
        is_first_chunk = chunk_number == 0
        labelled.to_csv(
            output_csv,
            mode='w' if is_first_chunk else 'a',
            header=is_first_chunk,
            index=True,
        )

In [4]:
def main():
    """
    Run the genre-prediction pipeline using the module-level configuration.

    Loads the model stored at ``MODEL_PATH``, classifies ``INPUT_CSV`` in chunks
    of ``CHUNK_SIZE`` rows via ``process_chunks``, and reports progress on
    standard output.
    """
    print("Loading trained topic model...")
    # joblib's load unpickles the file, so MODEL_PATH should be a trusted artifact.
    topic_model = load(MODEL_PATH)

    print(f"Processing data from {INPUT_CSV} in chunks...")
    process_chunks(
        input_csv=INPUT_CSV,
        model=topic_model,
        output_csv=OUTPUT_CSV,
        chunk_size=CHUNK_SIZE,
    )

    print(f"Genre predictions saved to {OUTPUT_CSV}.")

In [None]:
# Run the pipeline only when this code executes as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()