In [1]:
! pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable


In [2]:
pip install -U imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
import fasttext
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import NaiveBayesClassifier
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

import gc

In [4]:
def nb_classification(train, label, vectorizer='bow'):
    """Train a Multinomial Naive Bayes text classifier and report hold-out metrics.

    Parameters
    ----------
    train : pandas.Series
        One entry per document; each entry is an iterable of string tokens.
        The tokens are joined with single spaces before vectorizing.
    label : array-like
        Class label of each document, aligned with ``train``.
    vectorizer : {'bow', 'tfidf'}, default='bow'
        ``'bow'`` uses ``CountVectorizer``; ``'tfidf'`` uses ``TfidfVectorizer``.

    Returns
    -------
    str
        The ``classification_report`` text computed on the 30% test split.

    Raises
    ------
    ValueError
        If ``vectorizer`` is not one of the supported names.
    """
    # Validate the option first. Previously an unknown name left `vec` unbound and
    # only failed (with UnboundLocalError) after the costly join and split.
    if vectorizer == 'bow':
        vec = CountVectorizer()
    elif vectorizer == 'tfidf':
        vec = TfidfVectorizer()
    else:
        raise ValueError(
            f"Unknown vectorizer {vectorizer!r}; expected 'bow' or 'tfidf'."
        )

    # The vectorizers expect one string per document, not a list of tokens.
    documents = train.apply(' '.join)

    # Hold out 30% of the data for evaluation (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(
        documents, label, test_size=0.3, random_state=42
    )

    # Fit the vocabulary on the training split only, then reuse it for the test split.
    X_train_vec = vec.fit_transform(X_train)
    X_test_vec = vec.transform(X_test)

    nb = MultinomialNB()
    nb.fit(X_train_vec, y_train)

    y_pred = nb.predict(X_test_vec)

    return classification_report(y_test, y_pred)

    

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

def _print_top_features(clf, feature_names, n_top_features):
    """Print the ``n_top_features`` highest-weighted terms for each class of ``clf``."""
    if n_top_features <= 0:
        # A slice of ``[-0:]`` would select every feature, so print nothing instead.
        return

    class_weights = clf.coef_
    if len(clf.classes_) == 2 and class_weights.shape[0] == 1:
        # A two-class model exposes a single coefficient row that describes
        # classes_[1]; its negation describes classes_[0].
        class_weights = [-class_weights[0], class_weights[0]]

    for class_name, weights in zip(clf.classes_, class_weights):
        print(f"\nClass: {class_name}")
        # argsort is ascending: take the last n indices and reverse for descending order.
        top_idx = weights.argsort()[-n_top_features:][::-1]
        print(", ".join(feature_names[idx] for idx in top_idx))


def lr_classification(train, label, oversample=False, n_top_features=10, max_iter=100):
    """Train a TF-IDF + logistic-regression classifier and report test/validation metrics.

    The data is split 70/15/15 into train/test/validation with stratification on
    the labels and a fixed seed. The TF-IDF vocabulary is fitted on the training
    split only.

    Parameters
    ----------
    train : pandas.Series
        One entry per document; each entry is an iterable of string tokens.
    label : array-like
        Class label of each document, aligned with ``train``.
    oversample : bool, default=False
        If true, randomly oversample the training vectors with
        ``RandomOverSampler``. The test and validation splits are never resampled.
    n_top_features : int, default=10
        Number of highest-weighted terms printed per class. Values <= 0 print nothing.
    max_iter : int, default=100
        Iteration limit passed to ``LogisticRegression``. Raise it if the solver
        stops with an "ITERATIONS REACHED LIMIT" convergence warning.

    Returns
    -------
    tuple of (str, str)
        ``(test_report, val_report)`` as produced by ``classification_report``.
    """
    # The vectorizer expects one string per document, not a list of tokens.
    documents = train.apply(' '.join)

    # 70% train / 30% temporary hold-out, stratified on the labels.
    X_train, X_temp, y_train, y_temp = train_test_split(
        documents, label, test_size=0.3, random_state=42, stratify=label
    )

    # Halve the hold-out into test and validation sets (15% of the data each).
    X_test, X_val, y_test, y_val = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
    )

    # Fit the TF-IDF vocabulary on the training split only.
    vectorizer = TfidfVectorizer()
    X_train_vectors = vectorizer.fit_transform(X_train)
    X_val_vectors = vectorizer.transform(X_val)
    X_test_vectors = vectorizer.transform(X_test)

    if oversample:
        oversampler = RandomOverSampler(random_state=42)
        X_train_vectors, y_train = oversampler.fit_resample(X_train_vectors, y_train)

    lr_clf = LogisticRegression(
        multi_class='multinomial', solver='lbfgs', n_jobs=-1, max_iter=max_iter
    )
    lr_clf.fit(X_train_vectors, y_train)

    # Show the most informative terms per class.
    _print_top_features(lr_clf, vectorizer.get_feature_names_out(), n_top_features)

    y_test_pred = lr_clf.predict(X_test_vectors)
    y_val_pred = lr_clf.predict(X_val_vectors)

    test_report = classification_report(y_test, y_test_pred)
    val_report = classification_report(y_val, y_val_pred)

    return test_report, val_report


### Tokenized

In [5]:
# Load the pre-tokenized lyrics dataset.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle('./pkl_files/tokenized.pkl')

In [34]:
# Preview the first rows to check the columns and the token-list format of `lyrics`.
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_word_count
0,Killa Cam,rap,Cam'ron,2004,"[killa, cam, killa, cam, cam, killa, cam, kill...",1,762
1,Can I Live,rap,JAY-Z,1996,"[yeah, hah, yeah, rocafella, invite, somethin,...",3,548
2,Forgive Me Father,rap,Fabolous,2003,"[maybe, cause, im, eatin, bastards, fiend, gru...",4,574
3,Down and Out,rap,Cam'ron,2004,"[ugh, killa, baby, kanye, 1970s, heron, flow, ...",5,760
4,Fly In,rap,Lil Wayne,2005,"[ask, young, boy, gon, second, time, around, g...",6,432


In [9]:
# Multinomial Naive Bayes on bag-of-words counts of the tokenized lyrics.
bow_report = nb_classification(df['lyrics'], df['tag'], vectorizer='bow')
print(bow_report)

              precision    recall  f1-score   support

     country       0.33      0.18      0.23     25477
         pop       0.65      0.49      0.56    430965
         rap       0.83      0.79      0.81    298959
          rb       0.24      0.35      0.29     47343
        rock       0.40      0.62      0.48    191812

    accuracy                           0.59    994556
   macro avg       0.49      0.49      0.48    994556
weighted avg       0.63      0.59      0.60    994556



In [9]:
# Same Naive Bayes pipeline, with TF-IDF weights instead of raw counts.
tfidf_report = nb_classification(df['lyrics'], df['tag'], vectorizer='tfidf')
print(tfidf_report)

              precision    recall  f1-score   support

     country       0.00      0.00      0.00     25477
         pop       0.58      0.91      0.71    430965
         rap       0.80      0.83      0.82    298959
          rb       0.13      0.00      0.00     47343
        rock       0.69      0.05      0.09    191812

    accuracy                           0.65    994556
   macro avg       0.44      0.36      0.32    994556
weighted avg       0.63      0.65      0.57    994556



In [None]:
# Run a garbage-collection pass before the logistic-regression fit (added to keep
# the kernel from dying for lack of RAM). Only objects that are no longer
# referenced can be reclaimed, so `df` and other live variables stay in memory.
gc.collect()

In [36]:
# Logistic regression on TF-IDF features of the tokenized lyrics; also prints the
# 20 highest-weighted terms per class.
lr_test_report, lr_val_report = lr_classification(df['lyrics'], df['tag'], n_top_features=20)
# Print each report under its own heading. Passing both to a single print() joins
# them with a space, which shifts the second table's header and leaves the two
# splits unlabelled.
print("Test set", lr_test_report, "Validation set", lr_val_report, sep="\n")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
country, whiskey, old, beer, truck, town, id, well, lonesome, cowboy, bar, memory, tennessee, little, ol, texas, wind, kiss, aint, arms

Class: pop
repeat, pre, fcking, cos, fck, spoken, dey, yo, fk, well, fking, flesh, club, endless, noone, sea, burning, rules, suddenly, niggas

Class: rap
rap, hook, li, bro, rhymes, gang, rappers, bitch, tryna, bars, rapper, lil, homie, rapping, mic, lyrics, ayy, dude, yo, flow

Class: rb
tryna, hook, ima, nigga, imma, pre, vibe, bout, shit, x2, shawty, ain, niggas, yea, feelings, mmm, bae, babe, baby, yo

Class: rock
thе, fucking, well, band, sick, death, teeth, dead, void, guitar, punk, blood, drag, goddamn, endless, disease, everyone, machine, anyway, mouth
              precision    recall  f1-score   support

     country       0.52      0.11      0.18     12677
         pop       0.62      0.85      0.72    215359
         rap       0.86      0.85      0.85    149486
          rb       0.47      0.08      0.14     23802
        

In [30]:
# Logistic regression with random oversampling of the training split.
lr_test_report, lr_val_report = lr_classification(df['lyrics'], df['tag'], oversample=True)
# Print each report under its own heading. Passing both to a single print() joins
# them with a space, which shifts the second table's header and leaves the two
# splits unlabelled.
print("Test set", lr_test_report, "Validation set", lr_val_report, sep="\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

     country       0.15      0.69      0.24     12677
         pop       0.69      0.32      0.44    215359
         rap       0.89      0.79      0.83    149486
          rb       0.19      0.59      0.28     23802
        rock       0.44      0.59      0.50     95954

    accuracy                           0.54    497278
   macro avg       0.47      0.60      0.46    497278
weighted avg       0.67      0.54      0.56    497278
               precision    recall  f1-score   support

     country       0.15      0.70      0.25     12678
         pop       0.69      0.32      0.44    215358
         rap       0.89      0.79      0.83    149487
          rb       0.19      0.59      0.28     23802
        rock       0.44      0.59      0.50     95953

    accuracy                           0.54    497278
   macro avg       0.47      0.60      0.46    497278
weighted avg       0.67      0.54      0.56    497278



### Stemmed

In [10]:
# Rebind `df` to the stemmed dataset.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle('./pkl_files/stemmed.pkl')

In [5]:
# Preview the first rows to confirm the `stemmed_lyrics` column is present.
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_word_count,stemmed_lyrics
0,Killa Cam,rap,Cam'ron,2004,"[killa, cam, killa, cam, cam, killa, cam, kill...",1,762,"[killa, cam, killa, cam, cam, killa, cam, kill..."
1,Can I Live,rap,JAY-Z,1996,"[yeah, hah, yeah, rocafella, invite, somethin,...",3,548,"[yeah, hah, yeah, rocafella, invit, somethin, ..."
2,Forgive Me Father,rap,Fabolous,2003,"[maybe, cause, im, eatin, bastards, fiend, gru...",4,574,"[mayb, caus, im, eatin, bastard, fiend, grub, ..."
3,Down and Out,rap,Cam'ron,2004,"[ugh, killa, baby, kanye, 1970s, heron, flow, ...",5,760,"[ugh, killa, babi, kany, 1970s, heron, flow, h..."
4,Fly In,rap,Lil Wayne,2005,"[ask, young, boy, gon, second, time, around, g...",6,432,"[ask, young, boy, gon, second, time, around, g..."


In [11]:
# Multinomial Naive Bayes on bag-of-words counts of the stemmed lyrics.
bow_report = nb_classification(df['stemmed_lyrics'], df['tag'], vectorizer='bow')
print(bow_report)


              precision    recall  f1-score   support

     country       0.32      0.18      0.23     25477
         pop       0.65      0.48      0.55    430965
         rap       0.83      0.79      0.81    298959
          rb       0.24      0.35      0.28     47343
        rock       0.39      0.63      0.48    191812

    accuracy                           0.59    994556
   macro avg       0.48      0.49      0.47    994556
weighted avg       0.63      0.59      0.59    994556



In [12]:
# Same Naive Bayes pipeline on the stemmed lyrics, with TF-IDF weights instead of raw counts.
tfidf_report = nb_classification(df['stemmed_lyrics'], df['tag'], vectorizer='tfidf')
print(tfidf_report)


              precision    recall  f1-score   support

     country       0.00      0.00      0.00     25477
         pop       0.58      0.91      0.71    430965
         rap       0.80      0.83      0.81    298959
          rb       0.14      0.00      0.00     47343
        rock       0.70      0.05      0.09    191812

    accuracy                           0.65    994556
   macro avg       0.44      0.36      0.32    994556
weighted avg       0.63      0.65      0.57    994556



In [None]:
# Run a garbage-collection pass before the logistic-regression fit (added to keep
# the kernel from dying for lack of RAM). Only objects that are no longer
# referenced can be reclaimed, so `df` and other live variables stay in memory.
gc.collect()

In [12]:
# Logistic regression on TF-IDF features of the stemmed lyrics.
lr_test_report, lr_val_report = lr_classification(df['stemmed_lyrics'], df['tag'])
# Print each report under its own heading. Passing both to a single print() joins
# them with a space, which shifts the second table's header and leaves the two
# splits unlabelled.
print("Test set", lr_test_report, "Validation set", lr_val_report, sep="\n")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 30 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =     10267960     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.73491D+06    |proj g|=  5.40879D+05


 This problem is unconstrained.



At iterate    1    f=  3.16061D+06    |proj g|=  1.91143D+05

At iterate    2    f=  2.97408D+06    |proj g|=  6.89959D+04

At iterate    3    f=  2.90986D+06    |proj g|=  2.44654D+04

At iterate    4    f=  2.83905D+06    |proj g|=  8.48063D+04

At iterate    5    f=  2.75774D+06    |proj g|=  1.08713D+05

At iterate    6    f=  2.67352D+06    |proj g|=  1.10293D+05

At iterate    7    f=  2.60150D+06    |proj g|=  7.62782D+04

At iterate    8    f=  2.50187D+06    |proj g|=  6.80727D+04

At iterate    9    f=  2.43458D+06    |proj g|=  5.65368D+04

At iterate   10    f=  2.33964D+06    |proj g|=  5.32987D+04

At iterate   11    f=  2.22401D+06    |proj g|=  3.68666D+04

At iterate   12    f=  2.20837D+06    |proj g|=  2.91728D+04

At iterate   13    f=  2.19205D+06    |proj g|=  1.17427D+04

At iterate   14    f=  2.17896D+06    |proj g|=  2.63887D+04

At iterate   15    f=  2.16257D+06    |proj g|=  6.13592D+04

At iterate   16    f=  2.15412D+06    |proj g|=  1.06479D+05

At iter

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  9.7min finished


              precision    recall  f1-score   support

     country       0.51      0.12      0.19     12677
         pop       0.62      0.86      0.72    215359
         rap       0.86      0.85      0.85    149486
          rb       0.45      0.09      0.15     23802
        rock       0.58      0.25      0.35     95954

    accuracy                           0.68    497278
   macro avg       0.60      0.43      0.45    497278
weighted avg       0.67      0.68      0.65    497278
               precision    recall  f1-score   support

     country       0.50      0.12      0.19     12678
         pop       0.62      0.86      0.72    215358
         rap       0.86      0.85      0.85    149487
          rb       0.44      0.08      0.14     23802
        rock       0.58      0.25      0.35     95953

    accuracy                           0.68    497278
   macro avg       0.60      0.43      0.45    497278
weighted avg       0.67      0.68      0.65    497278



### Lemmatized

In [13]:
# Rebind `df` to the lemmatized dataset.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle('./pkl_files/lemmatized.pkl')

In [5]:
# Preview the first rows to confirm the `lemmatized_lyrics` column is present.
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_word_count,lemmatized_lyrics
0,Killa Cam,rap,Cam'ron,2004,"[killa, cam, killa, cam, cam, killa, cam, kill...",1,762,"[killa, cam, killa, cam, cam, killa, cam, kill..."
1,Can I Live,rap,JAY-Z,1996,"[yeah, hah, yeah, rocafella, invite, somethin,...",3,548,"[yeah, hah, yeah, rocafella, invite, somethin,..."
2,Forgive Me Father,rap,Fabolous,2003,"[maybe, cause, im, eatin, bastards, fiend, gru...",4,574,"[maybe, cause, im, eatin, bastard, fiend, grub..."
3,Down and Out,rap,Cam'ron,2004,"[ugh, killa, baby, kanye, 1970s, heron, flow, ...",5,760,"[ugh, killa, baby, kanye, 1970s, heron, flow, ..."
4,Fly In,rap,Lil Wayne,2005,"[ask, young, boy, gon, second, time, around, g...",6,432,"[ask, young, boy, gon, second, time, around, g..."


In [10]:
# Multinomial Naive Bayes on bag-of-words counts of the lemmatized lyrics.
bow_report = nb_classification(df['lemmatized_lyrics'], df['tag'], vectorizer='bow')
print(bow_report)


              precision    recall  f1-score   support

     country       0.33      0.17      0.23     25477
         pop       0.65      0.49      0.56    430965
         rap       0.83      0.79      0.81    298959
          rb       0.24      0.34      0.28     47343
        rock       0.40      0.62      0.48    191812

    accuracy                           0.59    994556
   macro avg       0.49      0.48      0.47    994556
weighted avg       0.63      0.59      0.60    994556



In [11]:
# Same Naive Bayes pipeline on the lemmatized lyrics, with TF-IDF weights instead of raw counts.
tfidf_report = nb_classification(df['lemmatized_lyrics'], df['tag'], vectorizer='tfidf')
print(tfidf_report)

              precision    recall  f1-score   support

     country       0.00      0.00      0.00     25477
         pop       0.58      0.91      0.71    430965
         rap       0.80      0.83      0.82    298959
          rb       0.13      0.00      0.00     47343
        rock       0.70      0.05      0.09    191812

    accuracy                           0.65    994556
   macro avg       0.44      0.36      0.32    994556
weighted avg       0.63      0.65      0.57    994556



In [None]:
# Run a garbage-collection pass before the logistic-regression fit (added to keep
# the kernel from dying for lack of RAM). Only objects that are no longer
# referenced can be reclaimed, so `df` and other live variables stay in memory.
gc.collect()

In [14]:
# Logistic regression on TF-IDF features of the lemmatized lyrics.
lr_test_report, lr_val_report = lr_classification(df['lemmatized_lyrics'], df['tag'])
# Print each report under its own heading. Passing both to a single print() joins
# them with a space, which shifts the second table's header and leaves the two
# splits unlabelled.
print("Test set", lr_test_report, "Validation set", lr_val_report, sep="\n")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 30 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =     11392120     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.73491D+06    |proj g|=  5.40879D+05


 This problem is unconstrained.



At iterate    1    f=  3.16197D+06    |proj g|=  1.91447D+05

At iterate    2    f=  2.97711D+06    |proj g|=  6.92459D+04

At iterate    3    f=  2.91609D+06    |proj g|=  2.40106D+04

At iterate    4    f=  2.85199D+06    |proj g|=  8.25182D+04

At iterate    5    f=  2.77838D+06    |proj g|=  1.06562D+05

At iterate    6    f=  2.69420D+06    |proj g|=  9.53209D+04

At iterate    7    f=  2.62222D+06    |proj g|=  7.14326D+04

At iterate    8    f=  2.52159D+06    |proj g|=  5.72221D+04

At iterate    9    f=  2.43793D+06    |proj g|=  5.66389D+04

At iterate   10    f=  2.28859D+06    |proj g|=  4.63428D+04

At iterate   11    f=  2.21480D+06    |proj g|=  2.80145D+04

At iterate   12    f=  2.20159D+06    |proj g|=  3.86000D+04

At iterate   13    f=  2.18470D+06    |proj g|=  1.63934D+04

At iterate   14    f=  2.16593D+06    |proj g|=  3.52005D+04

At iterate   15    f=  2.14154D+06    |proj g|=  4.92013D+04

At iterate   16    f=  2.13477D+06    |proj g|=  8.09980D+04

At iter

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  9.6min finished


              precision    recall  f1-score   support

     country       0.51      0.13      0.21     12677
         pop       0.62      0.85      0.72    215359
         rap       0.86      0.85      0.85    149486
          rb       0.46      0.09      0.15     23802
        rock       0.57      0.27      0.37     95954

    accuracy                           0.68    497278
   macro avg       0.61      0.44      0.46    497278
weighted avg       0.67      0.68      0.65    497278
               precision    recall  f1-score   support

     country       0.49      0.13      0.21     12678
         pop       0.62      0.85      0.72    215358
         rap       0.86      0.85      0.85    149487
          rb       0.45      0.09      0.15     23802
        rock       0.57      0.27      0.36     95953

    accuracy                           0.68    497278
   macro avg       0.60      0.44      0.46    497278
weighted avg       0.67      0.68      0.65    497278

