In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import nltk
from nltk.corpus import stopwords, words
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import re
from nltk.stem import PorterStemmer



In [2]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_lyrics(lyrics):
    import re
    from nltk.stem import PorterStemmer
    
    # Initialize stemmer
    stemmer = PorterStemmer()
    custom_stopwords = {"and", "the", "is", "in", "on", "you", "for", "with", "a", "an", "to", "of", "it"}
    # Remove non-alphabetic characters
    lyrics = re.sub(r"[^a-zA-Z\s]", "", lyrics)
    
    # Tokenize and preprocess
    words = lyrics.lower().split()
    processed_words = [stemmer.stem(word) for word in words if word not in custom_stopwords]
    return " ".join(processed_words)

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
df1 = pd.read_csv('../data/3_ds3_cleaned.csv')
df1.columns
df = df1[['lyrics', 'tag']]
df.columns = ['lyrics', 'genre']
df.columns

Index(['lyrics', 'genre'], dtype='object')

In [4]:
data = {
    'lyrics': [
        "love you forever and dance all night",  # Pop
        "guitar riffs and loud drums shake the ground",  # Rock
        "spitting bars, beats on the street, life is tough",  # Hip-hop
        "broken heart, memories linger, crying alone",  # Pop
        "rebellion in the air, we fight for freedom",  # Rock
        "rhymes that hustle, words that flow, beats that kill"  # Hip-hop
    ],
    'genre': ['Pop', 'Rock', 'Hip-hop', 'Pop', 'Rock', 'Hip-hop']
}

df = pd.DataFrame(data)

df
df.columns

Index(['lyrics', 'genre'], dtype='object')

In [5]:
# Apply preprocessing to lyrics
df['lyrics'] = df['lyrics'].apply(preprocess_lyrics)

In [6]:
# Step 2: Separate Lyrics by Genre
grouped_lyrics = df.groupby('genre')['lyrics'].apply(lambda x: ' '.join(x)).to_dict()

In [7]:
# Step 3: Build Topic Models for Each Genre
vectorizers = {}
topic_models = {}

for genre, lyrics in grouped_lyrics.items():
    # Vectorize lyrics for the genre
    vectorizer = CountVectorizer(stop_words='english', max_features=50)
    X = vectorizer.fit_transform([lyrics])
    vectorizers[genre] = vectorizer

    # Fit LDA topic model
    lda = LatentDirichletAllocation(n_components=2, random_state=42, max_iter=100)
    lda.fit(X)
    topic_models[genre] = lda

In [8]:
def predict_genre(new_lyrics):
    scores = {}
    
    for genre, lda in topic_models.items():
        # Transform new lyrics using the genre's vectorizer
        vectorizer = vectorizers[genre]
        X_new = vectorizer.transform([new_lyrics])
        
        # Calculate log-likelihood of the new song under the genre's topic model
        log_likelihood = lda.score(X_new)
        scores[genre] = log_likelihood

    # Select genre with highest log-likelihood
    predicted_genre = max(scores, key=scores.get)
    return predicted_genre, scores

In [10]:
test_songs = [
    ("guitar and freedom in the air, fighting the world", "Rock"),
    ("dance and love under the stars all night", "Pop"),
    ("spitting rhymes and beats, life on the streets", "Hip-hop"),
    ("I have seen the morning burning golden on the mountain in the skies Aching with the feeling of the freedom of an eagle when she flies Turning ", "country")
]

predictions = []
for lyrics, true_genre in test_songs:
    predicted_genre, likelihoods = predict_genre(lyrics)
    predictions.append((lyrics, true_genre, predicted_genre, likelihoods))

# Display Results
for lyrics, true_genre, predicted_genre, likelihoods in predictions:
    print(f"Song: {lyrics}")
    print(f"True Genre: {true_genre}")
    print(f"Predicted Genre: {predicted_genre}")
    print(f"Log-Likelihoods: {likelihoods}")
    print("\n")


Song: guitar and freedom in the air, fighting the world
True Genre: Rock
Predicted Genre: Pop
Log-Likelihoods: {'Hip-hop': np.float64(-3.09354543753377), 'Pop': np.float64(-2.544199904709469), 'Rock': np.float64(-11.552484056709298)}


Song: dance and love under the stars all night
True Genre: Pop
Predicted Genre: Rock
Log-Likelihoods: {'Hip-hop': np.float64(-3.09354543753377), 'Pop': np.float64(-8.730526888808452), 'Rock': np.float64(-2.544199904709469)}


Song: spitting rhymes and beats, life on the streets
True Genre: Hip-hop
Predicted Genre: Pop
Log-Likelihoods: {'Hip-hop': np.float64(-6.509037702465479), 'Pop': np.float64(-2.544199904709469), 'Rock': np.float64(-2.544199904709469)}


Song: I have seen the morning burning golden on the mountain in the skies Aching with the feeling of the freedom of an eagle when she flies Turning 
True Genre: country
Predicted Genre: Pop
Log-Likelihoods: {'Hip-hop': np.float64(-3.09354543753377), 'Pop': np.float64(-2.544199904709469), 'Rock': np.fl