In [None]:
! pip install -r requirements.txt

In [None]:
pip install -U imbalanced-learn

In [3]:
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
import fasttext
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import NaiveBayesClassifier
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

import gc

In [4]:
def nb_classification(train, label, vectorizer='bow', n_top_features=10, oversample=False):
    """Train a Multinomial Naive Bayes text classifier and report its performance.

    Each document's tokens are joined into one whitespace-separated string, the
    data is split 70/15/15 into train/test/validation sets (stratified on
    ``label``), vectorized, and used to fit a ``MultinomialNB`` model. As a side
    effect, the highest-probability tokens of every class are printed.

    Parameters
    ----------
    train : pandas.Series
        One iterable of string tokens per document.
    label : array-like
        Class label of each document, aligned with ``train``.
    vectorizer : {'bow', 'tfidf'}, default 'bow'
        ``'bow'`` uses ``CountVectorizer``; ``'tfidf'`` uses ``TfidfVectorizer``.
    n_top_features : int, default 10
        Number of top tokens to print per class (``0`` prints none).
    oversample : bool, default False
        If true, randomly oversample the training split (only) so that all
        classes are equally represented before fitting.

    Returns
    -------
    tuple of (str, str)
        ``classification_report`` text for the test set, then for the
        validation set.

    Raises
    ------
    ValueError
        If ``vectorizer`` is neither ``'bow'`` nor ``'tfidf'``.
    """
    # Validate first: otherwise a typo only surfaces as an UnboundLocalError
    # after the (expensive) joins and splits have already run.
    if vectorizer not in ('bow', 'tfidf'):
        raise ValueError(
            f"Unknown vectorizer {vectorizer!r}; expected 'bow' or 'tfidf'."
        )

    # The vectorizers expect one string per document, not a list of tokens.
    train = train.apply(' '.join)

    # Hold out 30% of the data, keeping the class proportions of `label`.
    X_train, X_temp, y_train, y_temp = train_test_split(
        train, label, test_size=0.3, random_state=42, stratify=label
    )

    # Halve the hold-out into test and validation sets (15% of the total each).
    X_test, X_val, y_test, y_val = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
    )

    vec = CountVectorizer() if vectorizer == 'bow' else TfidfVectorizer()

    # The vocabulary is learned from the training split only, so nothing from
    # the evaluation splits leaks into the features.
    X_train_vec = vec.fit_transform(X_train)
    X_val_vec = vec.transform(X_val)
    X_test_vec = vec.transform(X_test)

    # Oversample after the split so duplicated rows never reach test/validation.
    if oversample:
        oversampler = RandomOverSampler(random_state=42)
        X_train_vec, y_train = oversampler.fit_resample(X_train_vec, y_train)

    nb = MultinomialNB()
    nb.fit(X_train_vec, y_train)

    # Print the tokens with the highest log-probability in each class. These are
    # the most *probable* tokens per class, which is why frequent words can
    # appear for several classes; they are not a measure of discriminative power.
    feature_names = vec.get_feature_names_out()
    for i, class_label in enumerate(nb.classes_):
        print(f"\nClass: {class_label}")
        # Reverse first, then truncate: unlike `[-n:]`, this yields an empty
        # selection (not every feature) when n_top_features is 0.
        top_features_idx = nb.feature_log_prob_[i].argsort()[::-1][:n_top_features]
        top_features = [feature_names[idx] for idx in top_features_idx]
        print(", ".join(top_features))

    # Evaluate on both held-out splits.
    y_test_pred = nb.predict(X_test_vec)
    y_val_pred = nb.predict(X_val_vec)

    test_report = classification_report(y_test, y_test_pred)
    val_report = classification_report(y_val, y_val_pred)

    return test_report, val_report

In [6]:
def lr_classification(train, label, vectorizer_method='tfidf', oversample=False, n_top_features=10, max_iter=100):
    """Train a multinomial logistic-regression text classifier and report its performance.

    Each document's tokens are joined into one whitespace-separated string, the
    data is split 70/15/15 into train/test/validation sets (stratified on
    ``label``), vectorized, and used to fit a ``LogisticRegression`` model. As a
    side effect, the tokens with the largest coefficients for every class are
    printed.

    Parameters
    ----------
    train : pandas.Series
        One iterable of string tokens per document.
    label : array-like
        Class label of each document, aligned with ``train``.
    vectorizer_method : {'tfidf', 'bow'}, default 'tfidf'
        ``'tfidf'`` uses ``TfidfVectorizer``; ``'bow'`` uses ``CountVectorizer``.
    oversample : bool, default False
        If true, randomly oversample the training split (only) so that all
        classes are equally represented before fitting.
    n_top_features : int, default 10
        Number of top tokens to print per class (``0`` prints none).
    max_iter : int, default 100
        Iteration limit passed to the solver. The default matches
        scikit-learn's own default; raise it if the solver warns that it
        stopped at the iteration limit before converging.

    Returns
    -------
    tuple of (str, str)
        ``classification_report`` text for the test set, then for the
        validation set.

    Raises
    ------
    ValueError
        If ``vectorizer_method`` is neither ``'tfidf'`` nor ``'bow'``.
    """
    # Validate first: otherwise a typo only surfaces as an UnboundLocalError
    # after the (expensive) joins and splits have already run.
    if vectorizer_method not in ('tfidf', 'bow'):
        raise ValueError(
            f"Unknown vectorizer_method {vectorizer_method!r}; expected 'tfidf' or 'bow'."
        )

    # The vectorizers expect one string per document, not a list of tokens.
    train = train.apply(' '.join)

    # Hold out 30% of the data, keeping the class proportions of `label`.
    X_train, X_temp, y_train, y_temp = train_test_split(
        train, label, test_size=0.3, random_state=42, stratify=label
    )

    # Halve the hold-out into test and validation sets (15% of the total each).
    X_test, X_val, y_test, y_val = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
    )

    # TF-IDF weights or raw bag-of-words counts.
    vectorizer = TfidfVectorizer() if vectorizer_method == 'tfidf' else CountVectorizer()

    # The vocabulary is learned from the training split only, so nothing from
    # the evaluation splits leaks into the features.
    X_train_vectors = vectorizer.fit_transform(X_train)
    X_val_vectors = vectorizer.transform(X_val)
    X_test_vectors = vectorizer.transform(X_test)

    # Oversample after the split so duplicated rows never reach test/validation.
    if oversample:
        oversampler = RandomOverSampler(random_state=42)
        X_train_vectors, y_train = oversampler.fit_resample(X_train_vectors, y_train)

    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
    # (multinomial is what lbfgs does for multiclass data anyway); it is kept
    # here so behavior is unchanged on the installed version - confirm before
    # upgrading scikit-learn.
    lr_clf = LogisticRegression(
        multi_class='multinomial', solver='lbfgs', n_jobs=-1, max_iter=max_iter
    )
    lr_clf.fit(X_train_vectors, y_train)

    feature_names = vectorizer.get_feature_names_out()

    # For a two-class problem scikit-learn stores a single coefficient row that
    # belongs to classes_[1]; its negation plays that role for classes_[0].
    # Indexing coef_[1] directly would raise IndexError in that case.
    coef = lr_clf.coef_
    if coef.shape[0] == 1 and len(lr_clf.classes_) == 2:
        class_coefs = [-coef[0], coef[0]]
    else:
        class_coefs = coef

    # Print the tokens with the largest positive coefficient for each class.
    # The loop variable is deliberately not called `label`, which would shadow
    # the function parameter.
    for class_label, weights in zip(lr_clf.classes_, class_coefs):
        print(f"\nClass: {class_label}")
        # Reverse first, then truncate: unlike `[-n:]`, this yields an empty
        # selection (not every feature) when n_top_features is 0.
        top_features_idx = weights.argsort()[::-1][:n_top_features]
        top_features = [feature_names[idx] for idx in top_features_idx]
        print(", ".join(top_features))

    # Evaluate on both held-out splits.
    y_test_pred = lr_clf.predict(X_test_vectors)
    y_val_pred = lr_clf.predict(X_val_vectors)

    test_report = classification_report(y_test, y_test_pred)
    val_report = classification_report(y_val, y_val_pred)

    return test_report, val_report


### Lemmatized

In [25]:
# Load the lemmatized lyrics DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only read pickle
# files that come from a trusted source.
df = pd.read_pickle('./pkl_files/lemmatized.pkl')

In [26]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_word_count,lemmatized_lyrics
0,Killa Cam,rap,Cam'ron,2004,"[killa, cam, killa, cam, cam, killa, cam, kill...",1,762,"[killa, cam, killa, cam, cam, killa, cam, kill..."
1,Can I Live,rap,JAY-Z,1996,"[yeah, hah, yeah, rocafella, invite, somethin,...",3,548,"[yeah, hah, yeah, rocafella, invite, somethin,..."
2,Forgive Me Father,rap,Fabolous,2003,"[maybe, cause, im, eatin, bastards, fiend, gru...",4,574,"[maybe, cause, im, eatin, bastard, fiend, grub..."
3,Down and Out,rap,Cam'ron,2004,"[ugh, killa, baby, kanye, 1970s, heron, flow, ...",5,760,"[ugh, killa, baby, kanye, 1970s, heron, flow, ..."
4,Fly In,rap,Lil Wayne,2005,"[ask, young, boy, gon, second, time, around, g...",6,432,"[ask, young, boy, gon, second, time, around, g..."


In [27]:
%%time
# Naive Bayes & Bag of Words
lemmatized_nb_bow_test_report, lemmatized_nb_bow_val_report = nb_classification(df['lemmatized_lyrics'], df['tag'], vectorizer='bow')
print(lemmatized_nb_bow_test_report, lemmatized_nb_bow_val_report)


Class: country
im, love, like, know, dont, got, na, time, go, one

Class: pop
im, know, love, dont, like, na, oh, got, go, time

Class: rap
im, like, got, know, nigga, get, yeah, dont, bitch, shit

Class: rb
know, love, im, yeah, dont, like, got, baby, na, oh

Class: rock
im, know, dont, like, time, love, one, go, na, never
              precision    recall  f1-score   support

     country       0.33      0.17      0.22     12677
         pop       0.65      0.49      0.56    215359
         rap       0.83      0.79      0.81    149486
          rb       0.25      0.34      0.29     23802
        rock       0.39      0.62      0.48     95954

    accuracy                           0.59    497278
   macro avg       0.49      0.48      0.47    497278
weighted avg       0.63      0.59      0.60    497278
               precision    recall  f1-score   support

     country       0.32      0.17      0.22     12678
         pop       0.65      0.49      0.56    215358
         rap       0.

In [28]:
# Force a full garbage-collection pass between experiments; the intent is to
# release memory so the kernel does not die from running out of RAM.
# The value displayed below is the number of unreachable objects found.
gc.collect()

0

In [29]:
%%time
# Naive Bayes & TFIDF
lemmatized_nb_tfidf_test_report, lemmatized_nb_tfidf_val_report = nb_classification(df['lemmatized_lyrics'], df['tag'], vectorizer='tfidf')
print(lemmatized_nb_tfidf_test_report, lemmatized_nb_tfidf_val_report)


Class: country
love, im, know, dont, like, oh, time, got, one, na

Class: pop
love, im, know, dont, oh, like, na, youre, time, go

Class: rap
im, nigga, like, got, bitch, yeah, get, shit, dont, know

Class: rb
love, baby, yeah, know, im, dont, oh, na, got, like

Class: rock
im, know, dont, time, love, youre, never, one, oh, like
              precision    recall  f1-score   support

     country       0.00      0.00      0.00     12677
         pop       0.58      0.90      0.71    215359
         rap       0.80      0.83      0.82    149486
          rb       0.15      0.00      0.00     23802
        rock       0.70      0.05      0.09     95954

    accuracy                           0.65    497278
   macro avg       0.45      0.36      0.32    497278
weighted avg       0.63      0.65      0.57    497278
               precision    recall  f1-score   support

     country       0.00      0.00      0.00     12678
         pop       0.58      0.91      0.71    215358
         rap    

In [30]:
# Force a full garbage-collection pass between experiments; the intent is to
# release memory so the kernel does not die from running out of RAM.
# The value displayed below is the number of unreachable objects found.
gc.collect()

0

In [31]:
%%time
# Random Oversampling with Naive Bayes & Bag of Words
lemmatized_nb_ros_bow_test_report, lemmatized_nb_ros_bow_val_report = nb_classification(df['lemmatized_lyrics'], df['tag'], vectorizer='bow', oversample=True)
print(lemmatized_nb_ros_bow_test_report, lemmatized_nb_ros_bow_val_report)


Class: country
im, love, like, know, dont, got, na, time, go, one

Class: pop
im, know, love, dont, like, na, oh, got, go, time

Class: rap
im, like, got, know, nigga, get, yeah, dont, bitch, shit

Class: rb
know, love, im, yeah, dont, like, got, baby, na, oh

Class: rock
im, know, dont, like, time, love, one, go, never, oh
              precision    recall  f1-score   support

     country       0.13      0.67      0.22     12677
         pop       0.70      0.18      0.28    215359
         rap       0.86      0.75      0.80    149486
          rb       0.17      0.59      0.26     23802
        rock       0.38      0.65      0.48     95954

    accuracy                           0.47    497278
   macro avg       0.45      0.57      0.41    497278
weighted avg       0.65      0.47      0.47    497278
               precision    recall  f1-score   support

     country       0.13      0.68      0.22     12678
         pop       0.70      0.18      0.28    215358
         rap       0.

In [32]:
# Force a full garbage-collection pass between experiments; the intent is to
# release memory so the kernel does not die from running out of RAM.
# The value displayed below is the number of unreachable objects found.
gc.collect()

0

In [33]:
%%time
# Logistic regression & Bag of Words
lemmatized_lr_bow_test_report, lemmatized_lr_bow_val_report = lr_classification(df['lemmatized_lyrics'], df['tag'], vectorizer_method='bow', n_top_features=10)
print(lemmatized_lr_bow_test_report, lemmatized_lr_bow_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
lonesome, porch, whiskey, guitar, heartache, truck, tennessee, texas, tailgate, hed

Class: pop
fcking, fk, rap, chuckle, fck, pre, fking, spoken, nigga, evry

Class: rap
snippet, rapping, rapper, lyric, rap, intro, fam, spitting, depression, opps

Class: rb
pre, outro, tryna, trynna, 2x, hook, nigga, imma, intro, shawty

Class: rock
thе, endless, fz, disease, guitar, punk, failure, decay, drag, collapse
              precision    recall  f1-score   support

     country       0.47      0.11      0.18     12677
         pop       0.61      0.87      0.71    215359
         rap       0.87      0.82      0.85    149486
          rb       0.41      0.08      0.14     23802
        rock       0.58      0.24      0.34     95954

    accuracy                           0.68    497278
   macro avg       0.59      0.43      0.44    497278
weighted avg       0.67      0.68      0.64    497278
               precision    recall  f1-score   support

     country       0.47      0.1

In [34]:
# Force a full garbage-collection pass between experiments; the intent is to
# release memory so the kernel does not die from running out of RAM.
# The value displayed below is the number of unreachable objects found.
gc.collect()

0

In [35]:
%%time
# Logistic regression & TDIDF
lemmatized_lr_tfidf_test_report, lemmatized_lr_tfidf_val_report = lr_classification(df['lemmatized_lyrics'], df['tag'], vectorizer_method='tfidf', n_top_features=10)
print(lemmatized_lr_tfidf_test_report, lemmatized_lr_tfidf_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
whiskey, country, truck, beer, cowboy, old, town, id, heartache, lonesome

Class: pop
nigga, fcking, repeat, pre, fck, yo, co, dey, spoken, fk

Class: rap
rap, rapper, rhyme, nigga, li, hook, bro, gang, lil, bitch

Class: rb
tryna, hook, ima, nigga, imma, pre, vibe, shit, x2, shawty

Class: rock
thе, fucking, well, punk, guitar, sick, drag, goddamn, death, blood
              precision    recall  f1-score   support

     country       0.51      0.13      0.21     12677
         pop       0.62      0.85      0.72    215359
         rap       0.86      0.85      0.85    149486
          rb       0.46      0.09      0.15     23802
        rock       0.57      0.27      0.37     95954

    accuracy                           0.68    497278
   macro avg       0.61      0.44      0.46    497278
weighted avg       0.67      0.68      0.65    497278
               precision    recall  f1-score   support

     country       0.49      0.13      0.21     12678
         pop       0.

In [36]:
# Force a full garbage-collection pass between experiments; the intent is to
# release memory so the kernel does not die from running out of RAM.
# The value displayed below is the number of unreachable objects found.
gc.collect()

0

In [37]:
%%time
# Random Oversampling with Logistic regression & TDIDF
lemmatized_lr_ros_tfidf_test_report, lemmatized_lr_ros_tfidf_val_report = lr_classification(df['lemmatized_lyrics'], df['tag'], oversample=True, vectorizer_method='tfidf', n_top_features=10)
print(lemmatized_lr_ros_tfidf_test_report, lemmatized_lr_ros_tfidf_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
truck, whiskey, beer, cowboy, bar, ol, porch, instrumental, country, tennessee

Class: pop
nigga, co, repeat, bitch, yo, spoken, club, pre, fcking, colour

Class: rap
rap, rapper, tryna, hook, rhyme, nigga, li, bro, imma, mic

Class: rb
tryna, hook, pre, nigga, imma, ima, vibe, yo, outro, yall

Class: rock
punk, disease, drag, teeth, void, guitar, fucking, destroy, failure, thе
              precision    recall  f1-score   support

     country       0.15      0.70      0.24     12677
         pop       0.69      0.30      0.42    215359
         rap       0.89      0.79      0.83    149486
          rb       0.18      0.60      0.28     23802
        rock       0.43      0.60      0.50     95954

    accuracy                           0.53    497278
   macro avg       0.47      0.60      0.46    497278
weighted avg       0.66      0.53      0.55    497278
               precision    recall  f1-score   support

     country       0.15      0.70      0.24     12678
     