In [2]:
! pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable


In [3]:
pip install -U imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
import fasttext
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import NaiveBayesClassifier
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

import gc

In [5]:
def nb_classification(train, label, vectorizer='bow', n_top_features=10, oversample=False):
    """Train a Multinomial Naive Bayes text classifier and report test metrics.

    The documents are joined into whitespace-separated strings, split 70/30
    into train/test sets (fixed seed, no stratification), vectorized, and
    used to fit a ``MultinomialNB`` model. For every class the features with
    the highest ``feature_log_prob_`` are printed; note that these are the
    most *likely* tokens for the class, not necessarily the most
    discriminative ones.

    Parameters
    ----------
    train : pandas.Series
        One entry per document, each an iterable of string tokens
        (``' '.join`` is applied to every entry).
    label : array-like
        Class label of each document, aligned with ``train``.
    vectorizer : {'bow', 'tfidf'}, default 'bow'
        ``'bow'`` uses ``CountVectorizer``; ``'tfidf'`` uses ``TfidfVectorizer``.
    n_top_features : int, default 10
        Number of top features printed per class; ``0`` prints none.
    oversample : bool, default False
        If true, the vectorized training set is rebalanced with
        ``RandomOverSampler`` before fitting. The test set is never resampled.

    Returns
    -------
    str
        The text produced by ``classification_report`` for the test set.

    Raises
    ------
    ValueError
        If ``vectorizer`` is not a supported name or ``n_top_features`` is
        negative.
    """
    # Validate up front: previously an unknown name left `vec` unassigned and
    # only failed (with UnboundLocalError) after the costly join and split.
    if vectorizer not in ('bow', 'tfidf'):
        raise ValueError(
            f"Unknown vectorizer {vectorizer!r}; expected 'bow' or 'tfidf'."
        )
    if n_top_features < 0:
        raise ValueError("n_top_features must be non-negative.")

    train = train.apply(' '.join)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.3, random_state=42)

    vec = CountVectorizer() if vectorizer == 'bow' else TfidfVectorizer()

    # The vocabulary is learned from the training split only.
    X_train_vec = vec.fit_transform(X_train)
    X_test_vec = vec.transform(X_test)

    # Initialize the MultinomialNB
    nb = MultinomialNB()

    # Perform random oversampling if enabled (training data only)
    if oversample:
        oversampler = RandomOverSampler(random_state=42)
        X_train_vec, y_train = oversampler.fit_resample(X_train_vec, y_train)

    # Fit the model to the training data
    nb.fit(X_train_vec, y_train)

    # Print the highest-probability features of each class
    feature_names = vec.get_feature_names_out()
    for i, class_label in enumerate(nb.classes_):
        print(f"\nClass: {class_label}")
        # Reverse first, then truncate: the former `[-n:][::-1]` slice returned
        # every feature when n_top_features was 0 (because -0 == 0).
        top_features_idx = nb.feature_log_prob_[i].argsort()[::-1][:n_top_features]
        top_features = [feature_names[idx] for idx in top_features_idx]
        print(", ".join(top_features))

    # Predict the labels of the test set
    y_pred = nb.predict(X_test_vec)

    # Generate classification report
    report = classification_report(y_test, y_pred)
    return report


In [6]:
def lr_classification(train, label, vectorizer_method='tfidf', oversample=False, n_top_features=10,
                      max_iter=100):
    """Train a multinomial logistic-regression text classifier and report metrics.

    The documents are joined into whitespace-separated strings and split with
    stratification into 70% train, 15% test and 15% validation (fixed seed).
    The vectorizer is fitted on the training split only. For every class the
    features with the largest coefficients are printed.

    Parameters
    ----------
    train : pandas.Series
        One entry per document, each an iterable of string tokens
        (``' '.join`` is applied to every entry).
    label : array-like
        Class label of each document, aligned with ``train``; also used to
        stratify the splits.
    vectorizer_method : {'tfidf', 'bow'}, default 'tfidf'
        ``'tfidf'`` uses ``TfidfVectorizer``; ``'bow'`` uses
        ``CountVectorizer``. The historical misspelling ``'tdidf'`` is still
        accepted as an alias of ``'tfidf'``.
    oversample : bool, default False
        If true, the vectorized training set is rebalanced with
        ``RandomOverSampler`` before fitting. Test and validation sets are
        never resampled.
    n_top_features : int, default 10
        Number of top features printed per class; ``0`` prints none.
    max_iter : int, default 100
        Iteration cap forwarded to ``LogisticRegression``. Raise it when the
        solver reports that the iteration limit was reached.

    Returns
    -------
    tuple of (str, str)
        ``(test_report, val_report)`` as produced by ``classification_report``.

    Raises
    ------
    ValueError
        If ``vectorizer_method`` is not a supported name or ``n_top_features``
        is negative.
    """
    # The branch used to test only the misspelling 'tdidf', so the documented
    # default 'tfidf' left `vectorizer` unassigned and the call crashed.
    # Both spellings are accepted so existing callers keep working.
    if vectorizer_method not in ('tfidf', 'tdidf', 'bow'):
        raise ValueError(
            f"Unknown vectorizer_method {vectorizer_method!r}; expected 'tfidf' or 'bow'."
        )
    if n_top_features < 0:
        raise ValueError("n_top_features must be non-negative.")

    train = train.apply(' '.join)

    # Split into train and a temporary hold-out set, stratified by label
    X_train, X_temp, y_train, y_temp = train_test_split(train, label, test_size=0.3, random_state=42, stratify=label)

    # Split the hold-out set into 50% test and 50% validation (15% of the total data each)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

    # Initialize vectorizer (TF-IDF or bag of words)
    vectorizer = CountVectorizer() if vectorizer_method == 'bow' else TfidfVectorizer()

    # Vectorize data; the vocabulary is learned from the training split only
    X_train_vectors = vectorizer.fit_transform(X_train)
    X_val_vectors = vectorizer.transform(X_val)
    X_test_vectors = vectorizer.transform(X_test)

    # Perform oversampling (training data only)
    if oversample:
        oversampler = RandomOverSampler(random_state=42)
        X_train_vectors, y_train = oversampler.fit_resample(X_train_vectors, y_train)

    # Initialize the Multinomial LR
    lr_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', n_jobs=-1, max_iter=max_iter)

    # Fit the model to the training data
    lr_clf.fit(X_train_vectors, y_train)

    # Get the feature names from the vectorizer
    feature_names = vectorizer.get_feature_names_out()

    # Print the features with the largest coefficient for each class.
    # `class_label` avoids shadowing the `label` parameter.
    for i, class_label in enumerate(lr_clf.classes_):
        print(f"\nClass: {class_label}")
        # Reverse first, then truncate: the former `[-n:][::-1]` slice returned
        # every feature when n_top_features was 0 (because -0 == 0).
        top_features_idx = lr_clf.coef_[i].argsort()[::-1][:n_top_features]
        top_features = [feature_names[idx] for idx in top_features_idx]
        print(", ".join(top_features))

    # Predict the labels of the test and validation sets
    y_test_pred = lr_clf.predict(X_test_vectors)
    y_val_pred = lr_clf.predict(X_val_vectors)

    # Generate classification reports
    test_report = classification_report(y_test, y_test_pred)
    val_report = classification_report(y_val, y_val_pred)

    return test_report, val_report


### Tokenized

In [8]:
df = pd.read_pickle('./pkl_files/tokenized.pkl')

In [9]:
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_word_count
0,Killa Cam,rap,Cam'ron,2004,"[killa, cam, killa, cam, cam, killa, cam, kill...",1,762
1,Can I Live,rap,JAY-Z,1996,"[yeah, hah, yeah, rocafella, invite, somethin,...",3,548
2,Forgive Me Father,rap,Fabolous,2003,"[maybe, cause, im, eatin, bastards, fiend, gru...",4,574
3,Down and Out,rap,Cam'ron,2004,"[ugh, killa, baby, kanye, 1970s, heron, flow, ...",5,760
4,Fly In,rap,Lil Wayne,2005,"[ask, young, boy, gon, second, time, around, g...",6,432


In [35]:
%%time
# Naive Bayes & Bag of Words
bow_nb_report = nb_classification(df['lyrics'], df['tag'], vectorizer='bow')
print(bow_nb_report)


Class: country
im, love, like, dont, know, got, na, oh, one, time

Class: pop
im, love, know, dont, like, oh, na, got, go, time

Class: rap
im, like, got, know, get, yeah, dont, shit, aint, na

Class: rb
love, im, know, yeah, dont, like, got, baby, na, oh

Class: rock
im, know, dont, like, time, love, na, never, oh, got
              precision    recall  f1-score   support

     country       0.33      0.18      0.23     25477
         pop       0.65      0.49      0.56    430965
         rap       0.83      0.79      0.81    298959
          rb       0.24      0.35      0.29     47343
        rock       0.40      0.62      0.48    191812

    accuracy                           0.59    994556
   macro avg       0.49      0.49      0.48    994556
weighted avg       0.63      0.59      0.60    994556

CPU times: user 5min 58s, sys: 19.5 s, total: 6min 17s
Wall time: 6min 17s


In [37]:
%%time
# Naive Bayes & TFIDF
tfidf_nb_report = nb_classification(df['lyrics'], df['tag'], vectorizer='tfidf')
print(tfidf_nb_report)


Class: country
love, im, dont, know, like, oh, got, na, one, youre

Class: pop
love, im, dont, know, oh, na, like, youre, time, go

Class: rap
im, like, got, yeah, get, dont, shit, bitch, know, nigga

Class: rb
love, baby, yeah, know, im, oh, dont, na, got, like

Class: rock
im, dont, know, time, love, youre, never, oh, like, see
              precision    recall  f1-score   support

     country       0.00      0.00      0.00     25477
         pop       0.58      0.91      0.71    430965
         rap       0.80      0.83      0.82    298959
          rb       0.13      0.00      0.00     47343
        rock       0.69      0.05      0.09    191812

    accuracy                           0.65    994556
   macro avg       0.44      0.36      0.32    994556
weighted avg       0.63      0.65      0.57    994556

CPU times: user 6min 3s, sys: 20.6 s, total: 6min 24s
Wall time: 6min 24s


In [49]:
%%time
# Random Oversampling with Naive Bayes & Bag of Words
ros_bow_nb_report = nb_classification(df['lyrics'], df['tag'], vectorizer='bow', oversample=True)
print(ros_bow_nb_report)

(5023785, 2303387)

Class: country
im, love, like, dont, know, got, na, oh, one, time

Class: pop
im, love, know, dont, like, oh, na, got, go, time

Class: rap
im, like, got, know, get, yeah, dont, shit, aint, na

Class: rb
love, know, im, yeah, dont, got, like, baby, na, oh

Class: rock
im, know, dont, like, time, love, na, never, oh, got
              precision    recall  f1-score   support

     country       0.14      0.67      0.23     25477
         pop       0.70      0.18      0.29    430965
         rap       0.86      0.75      0.80    298959
          rb       0.17      0.59      0.26     47343
        rock       0.38      0.65      0.48    191812

    accuracy                           0.48    994556
   macro avg       0.45      0.57      0.41    994556
weighted avg       0.65      0.48      0.48    994556

CPU times: user 6min 18s, sys: 24.7 s, total: 6min 43s
Wall time: 6min 43s


In [None]:
# Collect RAM from garbage to prevent kernel from dying
gc.collect()

In [26]:
%%time
# Logistic regression & Bag of Words
bow_lr_test_report, bow_lr_val_report = lr_classification(df['lyrics'], df['tag'],
                                                    vectorizer_method='bow',
                                                    n_top_features=20)
print(bow_lr_test_report, bow_lr_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
lonesome, truck, porch, whiskey, tennessee, guitar, texas, memory, cowboy, folks, tailgate, hed, heartaches, heartache, bar, instrumental, headed, darlin, creek, boots

Class: pop
rap, fk, fcking, fck, chuckle, pre, niggas, spoken, fking, endless, nigga, evry, sung, mcs, cos, disease, fkin, refrain, mic, noone

Class: rap
rapping, snippet, lyrics, rapper, rappers, rap, intro, raps, fam, rhymes, bars, spitting, hook, depression, opps, pen, bro, niggas, bruh, booth

Class: rb
pre, tryna, outro, trynna, niggas, 2x, hook, imma, nigga, shawty, intro, focused, bitches, crib, hella, finna, henny, vibe, x2, stress

Class: rock
thе, endless, fz, disease, punk, guitar, teeth, failure, decay, tongues, filth, collapse, drag, fucking, despair, apathy, destroy, desperate, crawling, noose
              precision    recall  f1-score   support

     country       0.47      0.12      0.19     12677
         pop       0.61      0.86      0.71    215359
         rap       0.87      0.82   

In [27]:
%%time
# Logistic regression & TF-IDF
tfidf_lr_test_report, tfidf_lr_val_report = lr_classification(df['lyrics'], df['tag'],
                                                    vectorizer_method='tdidf',
                                                    n_top_features=20)
print(tfidf_lr_test_report, tfidf_lr_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
country, whiskey, old, beer, truck, town, id, well, lonesome, cowboy, bar, memory, tennessee, little, ol, texas, wind, kiss, aint, arms

Class: pop
repeat, pre, fcking, cos, fck, spoken, dey, yo, fk, well, fking, flesh, club, endless, noone, sea, burning, rules, suddenly, niggas

Class: rap
rap, hook, li, bro, rhymes, gang, rappers, bitch, tryna, bars, rapper, lil, homie, rapping, mic, lyrics, ayy, dude, yo, flow

Class: rb
tryna, hook, ima, nigga, imma, pre, vibe, bout, shit, x2, shawty, ain, niggas, yea, feelings, mmm, bae, babe, baby, yo

Class: rock
thе, fucking, well, band, sick, death, teeth, dead, void, guitar, punk, blood, drag, goddamn, endless, disease, everyone, machine, anyway, mouth
              precision    recall  f1-score   support

     country       0.52      0.11      0.18     12677
         pop       0.62      0.85      0.72    215359
         rap       0.86      0.85      0.85    149486
          rb       0.47      0.08      0.14     23802
        

In [28]:
%%time
# Random Oversampling with Logistic regression & TF-IDF
ros_tfidf_lr_test_report, ros_tfidf_lr_val_report = lr_classification(df['lyrics'], df['tag'],
                                                    oversample=True,
                                                    vectorizer_method='tdidf',
                                                    n_top_features=20)
print(ros_tfidf_lr_test_report, ros_tfidf_lr_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
truck, whiskey, country, beer, cowboy, bar, ol, tennessee, old, boots, porch, heartaches, texas, instrumental, lonesome, tailgate, county, guitar, honky, town

Class: pop
cos, repeat, spoken, yo, pre, bitch, fcking, endless, club, noone, flesh, fck, dey, nae, machine, sea, youve, tv, colour, ga

Class: rap
rap, tryna, hook, nigga, niggas, rappers, rhymes, li, shit, yo, mic, bro, imma, flow, homie, rapper, bars, lyrics, rapping, lil

Class: rb
tryna, hook, pre, niggas, imma, nigga, ima, vibe, yo, shit, outro, shawty, funky, bae, energy, yall, x2, bout, type, funk

Class: rock
disease, fucking, band, void, teeth, punk, well, drag, destroy, sick, machine, decay, scream, crawling, metal, endless, thе, goddamn, guitar, lungs
              precision    recall  f1-score   support

     country       0.15      0.69      0.24     12677
         pop       0.69      0.32      0.44    215359
         rap       0.89      0.79      0.83    149486
          rb       0.19      0.59    

### Stemmed

In [6]:
df = pd.read_pickle('./pkl_files/stemmed.pkl')

In [7]:
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_word_count,stemmed_lyrics
0,Killa Cam,rap,Cam'ron,2004,"[killa, cam, killa, cam, cam, killa, cam, kill...",1,762,"[killa, cam, killa, cam, cam, killa, cam, kill..."
1,Can I Live,rap,JAY-Z,1996,"[yeah, hah, yeah, rocafella, invite, somethin,...",3,548,"[yeah, hah, yeah, rocafella, invit, somethin, ..."
2,Forgive Me Father,rap,Fabolous,2003,"[maybe, cause, im, eatin, bastards, fiend, gru...",4,574,"[mayb, caus, im, eatin, bastard, fiend, grub, ..."
3,Down and Out,rap,Cam'ron,2004,"[ugh, killa, baby, kanye, 1970s, heron, flow, ...",5,760,"[ugh, killa, babi, kany, 1970s, heron, flow, h..."
4,Fly In,rap,Lil Wayne,2005,"[ask, young, boy, gon, second, time, around, g...",6,432,"[ask, young, boy, gon, second, time, around, g..."


In [53]:
%%time
# Naive Bayes & Bag of Words
bow_nb_report = nb_classification(df['stemmed_lyrics'], df['tag'], vectorizer='bow')
print(bow_nb_report)



Class: country
im, love, like, know, dont, got, na, go, get, time

Class: pop
im, love, know, dont, like, oh, na, go, got, get

Class: rap
im, like, got, get, know, nigga, yeah, dont, fuck, bitch

Class: rb
love, know, im, yeah, dont, like, got, babi, na, oh

Class: rock
im, know, dont, like, love, time, go, one, come, feel
              precision    recall  f1-score   support

     country       0.32      0.18      0.23     25477
         pop       0.65      0.48      0.55    430965
         rap       0.83      0.79      0.81    298959
          rb       0.24      0.35      0.28     47343
        rock       0.39      0.63      0.48    191812

    accuracy                           0.59    994556
   macro avg       0.48      0.49      0.47    994556
weighted avg       0.63      0.59      0.59    994556

CPU times: user 5min 28s, sys: 16.2 s, total: 5min 44s
Wall time: 5min 44s


In [54]:
%%time
# Naive Bayes & TFIDF
tfidf_nb_report = nb_classification(df['stemmed_lyrics'], df['tag'], vectorizer='tfidf')
print(tfidf_nb_report)



Class: country
love, im, know, dont, like, oh, time, got, go, one

Class: pop
love, im, know, dont, oh, like, na, go, your, time

Class: rap
im, nigga, like, got, fuck, bitch, yeah, get, shit, dont

Class: rb
love, babi, yeah, know, im, dont, oh, na, got, like

Class: rock
im, know, dont, time, love, feel, your, never, go, one
              precision    recall  f1-score   support

     country       0.00      0.00      0.00     25477
         pop       0.58      0.91      0.71    430965
         rap       0.80      0.83      0.81    298959
          rb       0.14      0.00      0.00     47343
        rock       0.70      0.05      0.09    191812

    accuracy                           0.65    994556
   macro avg       0.44      0.36      0.32    994556
weighted avg       0.63      0.65      0.57    994556

CPU times: user 5min 38s, sys: 22.1 s, total: 6min
Wall time: 6min


In [57]:
%%time
# Random Oversampling with Naive Bayes & Bag of Words
ros_bow_nb_report = nb_classification(df['stemmed_lyrics'], df['tag'], vectorizer='bow', oversample=True)
print(ros_bow_nb_report)


Class: country
im, love, like, know, dont, got, na, go, get, time

Class: pop
im, love, know, dont, like, oh, na, go, got, get

Class: rap
im, like, got, get, know, nigga, yeah, dont, fuck, bitch

Class: rb
love, know, im, yeah, dont, like, got, babi, na, oh

Class: rock
im, know, dont, like, time, love, go, one, come, feel
              precision    recall  f1-score   support

     country       0.13      0.67      0.22     25477
         pop       0.70      0.17      0.27    430965
         rap       0.85      0.75      0.80    298959
          rb       0.17      0.58      0.26     47343
        rock       0.38      0.65      0.48    191812

    accuracy                           0.47    994556
   macro avg       0.45      0.57      0.41    994556
weighted avg       0.65      0.47      0.47    994556

CPU times: user 6min 1s, sys: 25.4 s, total: 6min 26s
Wall time: 6min 26s


In [None]:
# Collect RAM from garbage to prevent kernel from dying
gc.collect()

In [59]:
%%time
# Logistic regression & Bag of Words
bow_lr_test_report, bow_lr_val_report = lr_classification(df['stemmed_lyrics'], df['tag'],
                                                    vectorizer_method='bow',
                                                    n_top_features=20)
print(bow_lr_test_report, bow_lr_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
lonesom, porch, whiskey, guitar, heartach, truck, tennesse, texa, hed, tailgat, cowboy, creek, bar, instrument, folk, darlin, outlaw, nashvill, counti, pine

Class: pop
fk, fcking, fck, chuckl, pre, rap, spoken, nigga, fking, evri, endless, mcs, sung, cos, refrain, fkin, sigh, diseas, total, feat

Class: rap
snippet, rapper, rap, lyric, intro, fam, opp, bro, diss, rhyme, xan, pen, mixtap, bruh, depress, tryna, shoutout, blunt, booth, glock

Class: rb
pre, outro, tryna, trynna, 2x, nigga, imma, reminisc, intro, shawti, stress, hook, homi, crib, x2, hella, finna, text, henni, mmmm

Class: rock
thе, endless, fz, diseas, punk, guitar, collaps, drag, failur, crawl, teeth, desper, total, destroy, apathi, filth, decay, despair, goddamn, wither
              precision    recall  f1-score   support

     country       0.47      0.11      0.17     12677
         pop       0.61      0.87      0.71    215359
         rap       0.87      0.82      0.84    149486
          rb       0

In [60]:
%%time
# Logistic regression & TF-IDF
tfidf_lr_test_report, tfidf_lr_val_report = lr_classification(df['stemmed_lyrics'], df['tag'],
                                                    vectorizer_method='tdidf',
                                                    n_top_features=20)
print(tfidf_lr_test_report, tfidf_lr_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
countri, whiskey, old, truck, beer, town, cowboy, id, well, lonesom, heartach, littl, tennesse, kiss, there, aint, ol, wind, memori, texa

Class: pop
nigga, cos, repeat, pre, fcking, yo, fck, spoken, dey, bitch, flesh, club, fk, machin, colour, endless, rule, fking, hai, sea

Class: rap
rap, rapper, rhyme, li, homi, bro, gang, lyric, lil, hook, fuck, ayi, nigga, tryna, dude, bitch, mic, stress, intro, fam

Class: rb
tryna, hook, nigga, ima, vibe, bout, imma, ain, shit, shawti, pre, x2, babi, mmm, yea, bodi, bae, babe, yo, aye

Class: rock
thе, fuck, well, drag, punk, guitar, sick, goddamn, dead, blood, teeth, void, death, crawl, mouth, scream, machin, endless, diseas, kill
              precision    recall  f1-score   support

     country       0.51      0.12      0.19     12677
         pop       0.62      0.86      0.72    215359
         rap       0.86      0.85      0.85    149486
          rb       0.45      0.09      0.15     23802
        rock       0.58      0.

In [8]:
%%time
# Random Oversampling with Logistic regression & TF-IDF
ros_tfidf_lr_test_report, ros_tfidf_lr_val_report = lr_classification(df['stemmed_lyrics'], df['tag'],
                                                    vectorizer_method='tdidf',
                                                    oversample=True,
                                                    n_top_features=20)
print(ros_tfidf_lr_test_report, ros_tfidf_lr_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
truck, whiskey, beer, cowboy, countri, bar, ol, porch, tennesse, instrument, boot, texa, memri, tailgat, guitar, nashvill, counti, creek, heartach, redneck

Class: pop
nigga, cos, repeat, bitch, spoken, yo, colour, club, pre, fcking, machin, bore, fck, endless, innoc, flesh, dey, noon, nae, sigh

Class: rap
rap, rapper, tryna, rhyme, nigga, hook, li, homi, lyric, imma, mic, bro, yo, lil, shit, spit, ima, bitch, dude, ayi

Class: rb
tryna, nigga, hook, pre, imma, ima, vibe, yo, outro, stress, funki, shawti, yall, energi, bae, focus, funk, type, shit, groov

Class: rock
punk, drag, diseas, fuck, guitar, teeth, void, bore, thе, goddamn, crawl, failur, destroy, decay, machin, choke, collaps, band, sick, metal
              precision    recall  f1-score   support

     country       0.14      0.70      0.24     12677
         pop       0.69      0.32      0.43    215359
         rap       0.89      0.79      0.83    149486
          rb       0.18      0.60      0.28     2380

### Lemmatized

In [7]:
# Collect RAM from garbage to prevent kernel from dying
gc.collect()

0

In [8]:
df = pd.read_pickle('./pkl_files/lemmatized.pkl')

In [9]:
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_word_count,lemmatized_lyrics
0,Killa Cam,rap,Cam'ron,2004,"[killa, cam, killa, cam, cam, killa, cam, kill...",1,762,"[killa, cam, killa, cam, cam, killa, cam, kill..."
1,Can I Live,rap,JAY-Z,1996,"[yeah, hah, yeah, rocafella, invite, somethin,...",3,548,"[yeah, hah, yeah, rocafella, invite, somethin,..."
2,Forgive Me Father,rap,Fabolous,2003,"[maybe, cause, im, eatin, bastards, fiend, gru...",4,574,"[maybe, cause, im, eatin, bastard, fiend, grub..."
3,Down and Out,rap,Cam'ron,2004,"[ugh, killa, baby, kanye, 1970s, heron, flow, ...",5,760,"[ugh, killa, baby, kanye, 1970s, heron, flow, ..."
4,Fly In,rap,Lil Wayne,2005,"[ask, young, boy, gon, second, time, around, g...",6,432,"[ask, young, boy, gon, second, time, around, g..."


In [10]:
%%time
# Naive Bayes & Bag of Words
bow_nb_report = nb_classification(df['lemmatized_lyrics'], df['tag'], vectorizer='bow')
print(bow_nb_report)



Class: country
im, love, like, know, dont, got, na, time, go, one

Class: pop
im, know, love, dont, like, oh, na, got, go, time

Class: rap
im, like, got, know, nigga, get, yeah, dont, bitch, shit

Class: rb
love, know, im, yeah, dont, like, got, baby, na, oh

Class: rock
im, know, dont, like, time, love, one, go, na, never
              precision    recall  f1-score   support

     country       0.33      0.17      0.23     25477
         pop       0.65      0.49      0.56    430965
         rap       0.83      0.79      0.81    298959
          rb       0.24      0.34      0.28     47343
        rock       0.40      0.62      0.48    191812

    accuracy                           0.59    994556
   macro avg       0.49      0.48      0.47    994556
weighted avg       0.63      0.59      0.60    994556

CPU times: user 5min 37s, sys: 23.6 s, total: 6min 1s
Wall time: 6min


In [11]:
%%time
# Naive Bayes & TFIDF
tfidf_nb_report = nb_classification(df['lemmatized_lyrics'], df['tag'], vectorizer='tfidf')
print(tfidf_nb_report)


Class: country
love, im, know, dont, like, oh, time, got, one, na

Class: pop
love, im, know, dont, oh, like, na, youre, time, go

Class: rap
im, nigga, like, got, bitch, yeah, get, shit, dont, know

Class: rb
love, baby, yeah, know, im, dont, oh, na, got, like

Class: rock
im, know, dont, time, love, youre, never, one, oh, like
              precision    recall  f1-score   support

     country       0.00      0.00      0.00     25477
         pop       0.58      0.91      0.71    430965
         rap       0.80      0.83      0.82    298959
          rb       0.13      0.00      0.00     47343
        rock       0.70      0.05      0.09    191812

    accuracy                           0.65    994556
   macro avg       0.44      0.36      0.32    994556
weighted avg       0.63      0.65      0.57    994556

CPU times: user 5min 42s, sys: 32.5 s, total: 6min 14s
Wall time: 6min 14s


In [12]:
%%time
# Random Oversampling with Naive Bayes & Bag of Words
ros_bow_nb_report = nb_classification(df['lemmatized_lyrics'], df['tag'], vectorizer='bow', oversample=True)
print(ros_bow_nb_report)


Class: country
im, love, like, know, dont, got, na, time, go, one

Class: pop
im, know, love, dont, like, oh, na, got, go, time

Class: rap
im, like, got, know, nigga, get, yeah, dont, bitch, shit

Class: rb
love, know, im, yeah, dont, like, got, baby, na, oh

Class: rock
im, know, dont, like, time, love, one, go, na, never
              precision    recall  f1-score   support

     country       0.13      0.67      0.22     25477
         pop       0.70      0.18      0.28    430965
         rap       0.86      0.75      0.80    298959
          rb       0.17      0.59      0.26     47343
        rock       0.38      0.65      0.48    191812

    accuracy                           0.47    994556
   macro avg       0.45      0.57      0.41    994556
weighted avg       0.65      0.47      0.47    994556

CPU times: user 6min 1s, sys: 32.6 s, total: 6min 34s
Wall time: 6min 34s


In [13]:
# Collect RAM from garbage to prevent kernel from dying
gc.collect()

0

In [14]:
%%time
# Logistic regression & Bag of Words
bow_lr_test_report, bow_lr_val_report = lr_classification(df['lemmatized_lyrics'], df['tag'],
                                                    vectorizer_method='bow',
                                                    n_top_features=20)
print(bow_lr_test_report, bow_lr_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
lonesome, porch, whiskey, guitar, heartache, truck, tennessee, texas, tailgate, hed, instrumental, cowboy, creek, headed, folk, outlaw, nashville, darlin, bar, jukebox

Class: pop
fcking, fk, rap, chuckle, fck, pre, fking, spoken, nigga, evry, endless, mc, sung, fkin, disease, refrain, co, feat, noone, yourll

Class: rap
snippet, rapping, rapper, lyric, rap, intro, fam, spitting, depression, opps, hook, booth, bro, bruh, pen, glock, shoutout, tryna, rhyme, mixtape

Class: rb
pre, outro, tryna, trynna, 2x, hook, nigga, imma, intro, shawty, focused, crib, finna, hella, henny, stress, hoe, mmmm, x2, tho

Class: rock
thе, endless, fz, disease, guitar, punk, failure, decay, drag, collapse, teeth, filth, apathy, despair, fucking, destroy, desperate, noose, crawling, dragged
              precision    recall  f1-score   support

     country       0.47      0.11      0.18     12677
         pop       0.61      0.87      0.71    215359
         rap       0.87      0.82      0.8

In [15]:
%%time
# Logistic regression & TF-IDF
tfidf_lr_test_report, tfidf_lr_val_report = lr_classification(df['lemmatized_lyrics'], df['tag'],
                                                    vectorizer_method='tdidf',
                                                    n_top_features=20)
print(tfidf_lr_test_report, tfidf_lr_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
whiskey, country, truck, beer, cowboy, old, town, id, heartache, lonesome, tennessee, wind, ol, well, bar, texas, kiss, memory, boot, guitar

Class: pop
nigga, fcking, repeat, pre, fck, yo, co, dey, spoken, fk, club, machine, flesh, fking, well, endless, hai, noone, tv, colour

Class: rap
rap, rapper, rhyme, nigga, li, hook, bro, gang, lil, bitch, tryna, homie, rapping, dude, lyric, mic, ayy, intro, fam, flow

Class: rb
tryna, hook, ima, nigga, imma, pre, vibe, shit, x2, shawty, bout, ain, mmm, yea, yo, bae, joy, aye, body, energy

Class: rock
thе, fucking, well, punk, guitar, sick, drag, goddamn, death, blood, teeth, dead, machine, void, endless, disease, mouth, anyway, everyone, scream
              precision    recall  f1-score   support

     country       0.51      0.13      0.21     12677
         pop       0.62      0.85      0.72    215359
         rap       0.86      0.85      0.85    149486
          rb       0.46      0.09      0.15     23802
        rock    

In [16]:
%%time
# Random Oversampling with Logistic regression & TDIDF
ros_tfidf_lr_test_report, ros_tfidf_lr_val_report = lr_classification(df['lyrics'], df['tag'],
                                                    oversample=True,
                                                    vectorizer_method='tdidf',
                                                    n_top_features=20)
print(ros_tfidf_lr_test_report, ros_tfidf_lr_val_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Class: country
truck, whiskey, country, beer, cowboy, bar, ol, tennessee, old, boots, porch, heartaches, texas, instrumental, lonesome, tailgate, county, guitar, honky, town

Class: pop
cos, repeat, spoken, yo, pre, bitch, fcking, endless, club, noone, flesh, fck, dey, nae, machine, sea, youve, tv, colour, ga

Class: rap
rap, tryna, hook, nigga, niggas, rappers, rhymes, li, shit, yo, mic, bro, imma, flow, homie, rapper, bars, lyrics, rapping, lil

Class: rb
tryna, hook, pre, niggas, imma, nigga, ima, vibe, yo, shit, outro, shawty, funky, bae, energy, yall, x2, bout, type, funk

Class: rock
disease, fucking, band, void, teeth, punk, well, drag, destroy, sick, machine, decay, scream, crawling, metal, endless, thе, goddamn, guitar, lungs
              precision    recall  f1-score   support

     country       0.15      0.69      0.24     12677
         pop       0.69      0.32      0.44    215359
         rap       0.89      0.79      0.83    149486
          rb       0.19      0.59    