In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords, words
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import re


# Per-topic and corpus-wide word tallies (empty here; filled by a later cell)
topic_word_counts = defaultdict(Counter)
background_word_count = Counter()

# Read the lyrics dataset, keeping only rows that have both a 'tag' and a 'lyrics' value
df = pd.read_csv('../data/2_ds2_trimmed.csv').dropna(subset=['tag', 'lyrics'])

print(df.head(2))

        title   tag    artist  year  views        features  \
0  Revelation  rock  Zardonic  2018   6680              {}   
1  Robitussin    rb   OPENPAD  2017     94  {"Rossi Rock"}   

                                              lyrics       id  
0  [Intro]\n(Try to do it like this, you won't ge...  3849758  
1  Saucalini:\n\nBaby what you want, what you nee...  3387226  


In [2]:
# Fetch the two NLTK corpora this notebook reads from
for corpus_name in ('words', 'stopwords'):
    nltk.download(corpus_name)

# Materialise both corpora as sets for fast membership tests
valid_words = set(words.words())
stop_words = set(stopwords.words('english'))

# Turn raw text into a list of lowercase word tokens
def tokenize(text, remove_stopwords=False, filter_non_words=True):
    """Split ``text`` into lowercase ASCII-alphabetic tokens of two or more letters.

    Args:
        text: The string to tokenize.
        remove_stopwords: When True, drop tokens found in the module-level
            ``stop_words`` set.
        filter_non_words: When True, keep only tokens found in the
            module-level ``valid_words`` set.

    Returns:
        A list of tokens in order of appearance, duplicates preserved.
    """
    # Named `tokens` rather than `words` so the imported `words` corpus is not shadowed
    tokens = re.findall(r'\b[a-zA-Z]{2,}\b', text.lower())
    if filter_non_words:
        tokens = [token for token in tokens if token in valid_words]
    if remove_stopwords:
        # Reuse the stopword set built once at cell level. Rebuilding it here
        # re-read the stopword corpus on every call, i.e. once per dataset row.
        tokens = [token for token in tokens if token not in stop_words]
    return tokens

# Count words by topic and overall with a progress bar.
# Start from empty counters so that re-running this cell does not add a second
# copy of every count on top of the previous run's totals.
topic_word_counts = defaultdict(Counter)
background_word_count = Counter()

# Walk the two needed columns directly; iterrows() would build a full Series
# per row just to read these two fields.
for topic, lyrics in tqdm(zip(df['tag'], df['lyrics']), total=len(df), desc="Processing rows"):
    # Tokenize lyrics with stopword removal for topic counts
    topic_words = tokenize(lyrics, remove_stopwords=True)
    topic_word_counts[topic].update(topic_words)
    # Tokenize without stopword removal for background count
    background_words = tokenize(lyrics)
    background_word_count.update(background_words)

[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Processing rows:   0%|          | 0/99880 [00:00<?, ?it/s]

In [17]:
# Calculate unigram probabilities per topic:
# topic_models[topic][word] = count of word in topic / total word count of topic
topic_models = {}

for topic, word_counts in tqdm(topic_word_counts.items(), desc="Calculating topic models"):
    total_topic_word_count = sum(word_counts.values())
    # An empty Counter yields an empty dict here, so the division is never reached with a zero total
    topic_models[topic] = {
        word: count / total_topic_word_count
        for word, count in word_counts.items()
    }


# Display Result

# Loop through each topic and display the top 10 words by count ratio.
# The loop variable is deliberately NOT called `words`: that name is the NLTK
# corpus imported at the top of the notebook, and rebinding it here would make
# `words.words()` fail if the setup cell is run again.
for topic, word_probs in topic_models.items():
    # Sort words by their probability in descending order and keep the top 10
    top_words = sorted(word_probs.items(), key=lambda x: x[1], reverse=True)[:10]

    # Print topic and the top 10 words with their values
    print(f"Top 10 words for topic '{topic}':")
    for word, value in top_words:
        print(f"  {word}: {value}")
    print()  # Add a newline for readability between topics

Calculating topic models:   0%|          | 0/6 [00:00<?, ?it/s]

Top 10 words for topic 'rock':
  chorus: 0.012137634829860904
  know: 0.0107365106657743
  verse: 0.009982896309556845
  oh: 0.00923200258278422
  like: 0.009222026941486515
  love: 0.007897080401855107
  time: 0.007860805342590729
  never: 0.0076159486925561765
  one: 0.007324841341959543
  go: 0.007094494715630742

Top 10 words for topic 'rb':
  love: 0.019655394236190886
  yeah: 0.019618686675697126
  know: 0.019276885128804423
  oh: 0.017026892199523042
  baby: 0.015485776422071955
  like: 0.015179478909099443
  chorus: 0.013229765859922745
  got: 0.011544828656930538
  verse: 0.010720412954037922
  get: 0.0097058400525219

Top 10 words for topic 'rap':
  like: 0.017725415299780178
  de: 0.012310372273715686
  yeah: 0.012278771767317571
  la: 0.011865990152492194
  got: 0.011346063070660708
  know: 0.010127468542683397
  get: 0.009767025266579897
  verse: 0.007500182690427614
  bitch: 0.006718563914986738
  ich: 0.006329482679959946

Top 10 words for topic 'misc':
  one: 0.00770667

In [3]:
# Turn the raw background counts into relative frequencies
total_background_count = sum(background_word_count.values())
background_model = {}
for word, count in background_word_count.items():
    background_model[word] = count / total_background_count

# Display results
# Rank every word by frequency (highest first) and show the first ten
ranked_background = list(background_model.items())
ranked_background.sort(key=lambda item: item[1], reverse=True)
top_10_background = ranked_background[:10]
for word, frequency in top_10_background:
    print(f"{word}: {frequency:.4f}")

the: 0.0497
you: 0.0284
and: 0.0278
to: 0.0260
of: 0.0204
in: 0.0168
it: 0.0166
that: 0.0138
me: 0.0138
my: 0.0121


In [20]:
import json
from pathlib import Path

topic_models_path = '../data/models/topic_models.json'
background_model_path = '../data/models/background_model.json'

# Save topic_models and background_model, each to its own JSON file
for model_path, model in (
    (topic_models_path, topic_models),
    (background_model_path, background_model),
):
    # open(..., 'w') does not create missing parent directories, so make sure
    # the output directory exists before writing.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    with open(model_path, 'w', encoding='utf-8') as f:
        json.dump(model, f, indent=4)  # `indent=4` makes it human-readable
