In [86]:
#ok lets do this haha

## Part 1: Preliminaries

In [87]:
#import necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk as nltk
from nltk.corpus import stopwords
import string

## Part 2: Reading a text-based dataset into pandas

In [88]:
# Load the CSV file
data = pd.read_csv('processed_lyrics.csv')

In [89]:
# Extract the column names
column_names = data.columns.tolist()

# Print the column names
print("Column Names:", column_names)

Column Names: ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language', 'lyric_length']


In [90]:
data.tag.value_counts()

tag
pop        4971
rap        1275
rock        678
rb          273
misc         67
country      28
Name: count, dtype: int64

In [91]:
data['label_num'] = data.tag.map({'country':0, 'misc':1, 'pop':2, 'rap':3, 'rb':4, 'rock':5})
data.drop('views', axis=1, inplace=True)
data.drop('language_cld3', axis=1, inplace=True)
data.drop('language_ft', axis=1, inplace=True)

In [92]:
data.head(10)

Unnamed: 0,title,tag,artist,year,features,lyrics,id,language,lyric_length,label_num
0,All star,rap,Enchant boyz,2012,{Lil_john},nang unang bibira mabomba bara mag ihaw gamit ...,64812,fil,112,3
1,Balak ni Syke,rap,Gloc-9,2012,{},alak balak lasing kasalukuyan ngunit malaman a...,85710,fil,158,3
2,Apatnapungbara,rap,Gloc-9,2012,"{""Ian Tayao""}",hook ian tayao akoy tutula mahaba nako umupo p...,85711,fil,205,3
3,Silup,rap,Gloc-9,2012,"{""Denise Barcena""}",hook denise mamang pulis pwede ba akong huming...,85713,fil,221,3
4,By Repablikan Syndicate Siobal D,rap,Pagbigyan,2012,{Military-g},pagbiyan mo puso umibig katulafd mo limutin yo...,88704,fil,29,3
5,New Life Song Part Two,rap,R1 one 6 souljhaz,2012,{},choros dios buhay ikay pasasalamtan awitin nil...,102239,fil,40,3
6,New Life Song Part Two,rap,R.1 One Six Souljhaz,2012,{Van.rey},choros dios buhay ikay pasasalamtan awitin nil...,102242,fil,40,3
7,Luha,rap,Repabablikan productions,2013,{Repablikan},lyrics of luha – repablikan magpaparaya mahal ...,178228,fil,224,3
8,NutriJingle,rap,Kamikzee,2013,{},kinukumpleto mo araw tuwing hinahain mo pagkai...,198397,fil,45,3
9,Yeah,rock,Kamikazee,2013,{},1 kinukumpleto mo araw tuwing hinahain mo pagk...,198400,fil,47,5


In [93]:
# Assume the lyrics are in a column named 'lyrics' and the genres are in a column named 'tag'
lyrics = data['lyrics']
genre = data['tag']

In [94]:
# Extracting the vocabulary 
from collections import Counter

# Apply basic splitting to each of the lyrics
split_lyrics = data['lyrics'].apply(lambda x: x.split())

# Ensure each list contains only unique words
each_unique_words = split_lyrics.apply(lambda x: list(set(x)))

# Combine all lists into one list
all_words_list = [word for sublist in each_unique_words for word in sublist]

# Extract overall unique words list and label as vocab
vocab = unique_words = set(all_words_list)

# Number of unique words
num_elements = len(vocab)

In [95]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(lyrics, genre, test_size=0.2, stratify=genre, random_state=42)

## Part 3: Vectorizing our dataset

In [96]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the training data
vectorizer.fit(X_train)


In [97]:
# examine the fitted vocabulary
vectorizer.get_feature_names_out()

array(['01', '02', '03', ..., '雖然我很愛你', '𝘉𝘦𝘢𝘵𝘴', '𝘊𝘩𝘢𝘸'], dtype=object)

In [98]:
# fit and transform training data into a 'document-term matrix'
X_train_dtm = vectorizer.fit_transform(X_train)

In [99]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(X_train_dtm.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,01,02,03,04,05,06,07,08,09,0917,...,我心中易有親愛,我沒辦法告訴你,我真的没办法,振り回されたって,新しい歌,的爱你,誰にだって,雖然我很愛你,𝘉𝘦𝘢𝘵𝘴,𝘊𝘩𝘢𝘸
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5829,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5830,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# transform testing data into a document-term matrix (using existing vocabulary)
X_test_dtm = vectorizer.transform(X_test)
X_test_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [101]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(X_test_dtm.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,01,02,03,04,05,06,07,08,09,0917,...,我心中易有親愛,我沒辦法告訴你,我真的没办法,振り回されたって,新しい歌,的爱你,誰にだって,雖然我很愛你,𝘉𝘦𝘢𝘵𝘴,𝘊𝘩𝘢𝘸
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Part 4: Building and evaluating a model

In [102]:
# Initialize the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train_dtm, y_train)


In [103]:
# Make predictions on the test data
y_pred_class = nb_classifier.predict(X_test_dtm)


In [104]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7546264564770391

In [105]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[  1,   0,   4,   0,   0,   0],
       [  0,   0,  12,   0,   0,   1],
       [  1,   1, 906,  72,   5,  10],
       [  0,   0,  69, 186,   0,   0],
       [  0,   0,  37,  17,   1,   0],
       [  0,   0, 112,  16,   1,   7]], dtype=int64)

In [106]:
# Print the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_class))

              precision    recall  f1-score   support

     country       0.50      0.20      0.29         5
        misc       0.00      0.00      0.00        13
         pop       0.79      0.91      0.85       995
         rap       0.64      0.73      0.68       255
          rb       0.14      0.02      0.03        55
        rock       0.39      0.05      0.09       136

    accuracy                           0.75      1459
   macro avg       0.41      0.32      0.32      1459
weighted avg       0.70      0.75      0.71      1459



In [107]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb_classifier.predict_proba(X_test_dtm)
y_pred_prob

array([[3.55714641e-028, 1.03047461e-019, 1.00000000e+000,
        2.60238620e-015, 4.18258848e-015, 8.33535641e-011],
       [7.65522738e-067, 1.87216478e-054, 1.00000000e+000,
        4.00563460e-012, 6.29697085e-031, 1.59373699e-015],
       [4.62173923e-089, 5.60543852e-070, 1.00000000e+000,
        4.19873173e-028, 2.71531197e-037, 8.31586079e-023],
       ...,
       [1.16877810e-070, 1.28854214e-057, 1.00000000e+000,
        8.93453430e-027, 1.05961654e-034, 1.27428459e-024],
       [9.39981892e-136, 1.09660626e-100, 1.00000000e+000,
        2.21509082e-032, 1.02068813e-038, 2.03880030e-035],
       [2.29407640e-224, 7.18466947e-204, 1.00000000e+000,
        1.88644397e-051, 4.72815781e-043, 1.36883530e-042]])

In [108]:
# calculate AUC
auc_scores = metrics.roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
auc_scores

0.7106328307495057

## Part 5: Examining a model for further insight

In [109]:
# store the vocabulary of X_train
X_train_tokens = vectorizer.get_feature_names_out()
len(X_train_tokens)

45370

In [110]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

['01' '02' '03' '04' '05' '06' '07' '08' '09' '0917' '10' '100' '1008'
 '1020' '1096' '11' '11x' '12' '1234' '12gauge' '12x' '13' '13th' '13x'
 '14' '14344' '15' '15x' '16' '168' '16x' '17' '18' '1861' '187' '1896'
 '19' '1976' '1982' '1997' '1998' '1999' '1fritz' '1igalaw' '1loonie' '1s'
 '1shortone' '1st' '1x' '20']


In [111]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

['zido' 'zild' 'zimzalabim' 'zio' 'zjay' 'zo' 'zoids' 'zoilo' 'zombie'
 'zone' 'zonin' 'zoo' 'zoom' 'zoomusulong' 'zoren' 'zsa' 'zuriel' 'zzp'
 '½t' '½y' 'ísipin' 'óo' 'ýo' 'ýyong' 'ʼko' 'еdad' 'еven' 'いつか誰かとまた恋に落ちても'
 'いつもあなただけの場所があるから' 'うたえるまで' 'すべてを忘れて' 'そんなことあるよ' 'そんなの無駄なこと' 'たくさんの出来事に'
 'だからって塞ぎ込む' 'キミらしくいてよ' 'ㅗㅏㅣoh' '今だけは' '今はまだ悲しい' '但只希望的愛' '我心中易有親愛'
 '我沒辦法告訴你' '我真的没办法' '振り回されたって' '新しい歌' '的爱你' '誰にだって' '雖然我很愛你' '𝘉𝘦𝘢𝘵𝘴'
 '𝘊𝘩𝘢𝘸']


In [112]:
# Naive Bayes counts the number of times each token appears in each class
nb_classifier.feature_count_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 1., 3., ..., 0., 1., 4.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [113]:
# rows represent classes, columns represent tokens
nb_classifier.feature_count_.shape

(6, 45370)

 country       0.00      0.00      0.00         4
        misc       0.00      0.00      0.00        15
         pop       0.80      0.89      0.84       997
         rap       0.59      0.70      0.64       266
          rb       0.31      0.07      0.11        58
        rock    

In [114]:
country_token_count = nb_classifier.feature_count_[0, :]
misc_token_count = nb_classifier.feature_count_[1, :]
pop_token_count = nb_classifier.feature_count_[2, :]
rap_token_count = nb_classifier.feature_count_[3, :]
rb_token_count = nb_classifier.feature_count_[4, :]
rock_token_count = nb_classifier.feature_count_[5, :]
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'country':country_token_count, 'misc':misc_token_count, 'pop':pop_token_count, 'rap':rap_token_count, 'rb':rb_token_count, 'rock':rock_token_count}).set_index('token')
tokens.head()

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,3.0,0.0,0.0
4,0.0,0.0,0.0,2.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0


In [115]:
# examine 5 random DataFrame rows
tokens.sample(5, random_state=427)

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
sampo,0.0,0.0,0.0,1.0,0.0,0.0
lubak,0.0,0.0,2.0,3.0,1.0,0.0
umusap,0.0,0.0,0.0,1.0,0.0,0.0
waley,0.0,0.0,3.0,0.0,0.0,0.0
wooohhhh,0.0,0.0,4.0,0.0,0.0,0.0


In [116]:
# add 1 to tag counts to avoid dividing by 0 (1 point)
tokens[['country', 'misc', 'pop', 'rap', 'rb', 'rock']] = tokens[['country', 'misc', 'pop', 'rap', 'rb', 'rock']].apply(lambda x: x + 1)

In [117]:
# convert the tag counts into frequencies
tokens['country'] = tokens['country'] / nb_classifier.class_count_[0]
tokens['misc'] = tokens['misc'] / nb_classifier.class_count_[1]
tokens['pop'] = tokens['pop'] / nb_classifier.class_count_[2]
tokens['rap'] = tokens['rap'] / nb_classifier.class_count_[3]
tokens['rb'] = tokens['rb'] / nb_classifier.class_count_[4]
tokens['rock'] = tokens['rock'] / nb_classifier.class_count_[5]

tokens.sample(5, random_state=427)

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
sampo,0.043478,0.018519,0.000252,0.001961,0.004587,0.001845
lubak,0.043478,0.018519,0.000755,0.003922,0.009174,0.001845
umusap,0.043478,0.018519,0.000252,0.001961,0.004587,0.001845
waley,0.043478,0.018519,0.001006,0.00098,0.004587,0.001845
wooohhhh,0.043478,0.018519,0.001258,0.00098,0.004587,0.001845


In [118]:
# Assuming you have a DataFrame called tokens with columns: 'token', 'genre1', 'genre2', ..., 'genre6'

# Calculate the ratio of each genre to all other genres for each token
for genre in ['country', 'misc', 'pop', 'rap', 'rb', 'rock']:
    other_genres = [col for col in tokens.columns if col != genre]
    tokens[f'{genre}_ratio'] = tokens[genre] / tokens[other_genres].sum(axis=1)

# Sample 5 rows from the DataFrame
sampled_tokens = tokens.sample(5, random_state=427)

# Print the sampled tokens with the calculated ratios
print(sampled_tokens)


           country      misc       pop       rap        rb      rock  \
token                                                                  
sampo     0.043478  0.018519  0.000252  0.001961  0.004587  0.001845   
lubak     0.043478  0.018519  0.000755  0.003922  0.009174  0.001845   
umusap    0.043478  0.018519  0.000252  0.001961  0.004587  0.001845   
waley     0.043478  0.018519  0.001006  0.000980  0.004587  0.001845   
wooohhhh  0.043478  0.018519  0.001258  0.000980  0.004587  0.001845   

          country_ratio  misc_ratio  pop_ratio  rap_ratio  rb_ratio  \
token                                                                 
sampo          1.600644    0.011205   0.000150   0.001167  0.002732   
lubak          1.270776    0.013924   0.000554   0.002886  0.006762   
umusap         1.600644    0.011205   0.000150   0.001167  0.002732   
waley          1.614065    0.011116   0.000594   0.000578  0.002711   
wooohhhh       1.599134    0.011215   0.000749   0.000583  0.002734  

In [119]:
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('pop_ratio', ascending=False)

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock,country_ratio,misc_ratio,pop_ratio,rap_ratio,rb_ratio,rock_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
la,0.043478,0.018519,0.427817,0.083333,0.591743,0.204797,0.032784,0.013381,0.432998,0.047201,0.453684,0.095479
pasko,0.043478,0.018519,0.144366,0.020588,0.004587,0.009225,0.220383,0.041838,0.402563,0.023265,0.004963,0.009978
pagibig,0.391304,0.555556,0.794769,0.285294,0.357798,0.435424,0.161107,0.229029,0.329027,0.087675,0.109446,0.131907
piling,0.043478,0.074074,0.170272,0.082353,0.128440,0.110701,0.076838,0.121020,0.267342,0.083003,0.124811,0.094485
kailan,0.130435,0.148148,0.310111,0.200980,0.298165,0.184502,0.114225,0.119627,0.259272,0.128464,0.186848,0.097297
...,...,...,...,...,...,...,...,...,...,...,...,...
yododolehiyo,0.260870,0.018519,0.000252,0.000980,0.004587,0.001845,9.963473,0.001810,0.000025,0.000096,0.000448,0.000180
selim,0.347826,0.018519,0.000252,0.000980,0.004587,0.001845,13.284630,0.001358,0.000018,0.000072,0.000336,0.000135
ulipon,0.434783,0.018519,0.000252,0.000980,0.004587,0.001845,16.605788,0.001086,0.000015,0.000057,0.000269,0.000108
pamahiin,0.608696,0.018519,0.000252,0.002941,0.004587,0.001845,21.628379,0.000832,0.000011,0.000132,0.000206,0.000083


## Part 6: Tuning the vectorizer

In [120]:
# show default parameters for CountVectorizer
vectorizer

In [121]:
metrics.accuracy_score(y_test, y_pred_class)

0.7546264564770391

In [122]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Define the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

# Define the parameters to search
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # Test unigrams and bigrams
    'vect__max_df': [0.5, 0.75, 1.0],       # Test different maximum document frequencies
    'vect__min_df': [1, 2, 5],               # Test different minimum document frequencies
    'vect__stop_words': [None, 'english'],    # Test with and without stopwords
    'clf__alpha': [0.1, 0.5, 1.0]            # Test different alpha values for Laplace smoothing
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Print the best accuracy found
print("Best Accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'clf__alpha': 0.5, 'vect__max_df': 1.0, 'vect__min_df': 1, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}
Best Accuracy: 0.7531284127103112


In [123]:
# Extract the best parameters found by grid search
best_params = grid_search.best_params_
best_params

{'clf__alpha': 0.5,
 'vect__max_df': 1.0,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': 'english'}

In [125]:
# Initialize CountVectorizer with the best parameters
vectorizer = CountVectorizer(ngram_range=best_params['vect__ngram_range'],
                             max_df=best_params['vect__max_df'],
                             min_df=best_params['vect__min_df'],
                             stop_words=best_params['vect__stop_words'])

# Fit the vectorizer to the entire training data
X_train_vectorized = vectorizer.fit_transform(X_train)

# Initialize and train the Multinomial Naive Bayes classifier
clf = MultinomialNB(alpha=best_params["clf__alpha"])
clf.fit(X_train_vectorized, y_train)

# Predict on the test data
X_test_vectorized = vectorizer.transform(X_test)
y_pred_class = clf.predict(X_test_vectorized)

# Calculate accuracy
accuracy = metrics.accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy)

Accuracy: 0.750514050719671
