In [43]:
#ok lets do this haha

## Part 1: Preliminaries

In [44]:
#import necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk as nltk
from nltk.corpus import stopwords
import string

## Part 2: Reading a text-based dataset into pandas

In [45]:
# Load the CSV file
data = pd.read_csv('processed_lyrics.csv')

In [46]:
# Extract the column names
column_names = data.columns.tolist()

# Print the column names
print("Column Names:", column_names)

Column Names: ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language', 'lyric_length']


In [47]:
data.tag.value_counts()

tag
pop        4970
rap        1275
rock        678
rb          273
misc         67
country      28
Name: count, dtype: int64

In [48]:
data['label_num'] = data.tag.map({'country':0, 'misc':1, 'pop':2, 'rap':3, 'rb':4, 'rock':5})
data.drop('views', axis=1, inplace=True)
data.drop('language_cld3', axis=1, inplace=True)
data.drop('language_ft', axis=1, inplace=True)

In [49]:
data.head(10)

Unnamed: 0,title,tag,artist,year,features,lyrics,id,language,lyric_length,label_num
0,All star,rap,Enchant boyz,2012,{Lil_john},nang unang bibira mabomba bara mag ihaw gamit ...,64812,fil,112,3
1,Balak ni Syke,rap,Gloc-9,2012,{},alak balak lasing kasalukuyan ngunit malaman a...,85710,fil,158,3
2,Apatnapungbara,rap,Gloc-9,2012,"{""Ian Tayao""}",hook ian tayao akoy tutula mahaba nako umupo p...,85711,fil,205,3
3,Silup,rap,Gloc-9,2012,"{""Denise Barcena""}",hook denise mamang pulis pwede ba akong huming...,85713,fil,221,3
4,By Repablikan Syndicate Siobal D,rap,Pagbigyan,2012,{Military-g},pagbiyan mo puso umibig katulafd mo limutin yo...,88704,fil,29,3
5,New Life Song Part Two,rap,R1 one 6 souljhaz,2012,{},choros dios buhay ikay pasasalamtan awitin nil...,102239,fil,40,3
6,New Life Song Part Two,rap,R.1 One Six Souljhaz,2012,{Van.rey},choros dios buhay ikay pasasalamtan awitin nil...,102242,fil,40,3
7,Luha,rap,Repabablikan productions,2013,{Repablikan},lyrics of luha – repablikan magpaparaya mahal ...,178228,fil,224,3
8,NutriJingle,rap,Kamikzee,2013,{},kinukumpleto mo araw tuwing hinahain mo pagkai...,198397,fil,45,3
9,Yeah,rock,Kamikazee,2013,{},1 kinukumpleto mo araw tuwing hinahain mo pagk...,198400,fil,47,5


In [50]:
# Assume the lyrics are in a column named 'lyrics' and the genres are in a column named 'tag'
lyrics = data['lyrics']
genre = data['tag']

In [51]:
# Extracting the vocabulary 
from collections import Counter

# Apply basic splitting to each of the lyrics
split_lyrics = data['lyrics'].apply(lambda x: x.split())

# Ensure each list contains only unique words
each_unique_words = split_lyrics.apply(lambda x: list(set(x)))

# Combine all lists into one list
all_words_list = [word for sublist in each_unique_words for word in sublist]

# Extract overall unique words list and label as vocab
vocab = unique_words = set(all_words_list)

# Number of unique words
num_elements = len(vocab)

In [52]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(lyrics, genre, test_size=0.2, stratify=genre, random_state=42)

## Part 3: Vectorizing our dataset

In [53]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the training data
vectorizer.fit(X_train)


In [54]:
# examine the fitted vocabulary
vectorizer.get_feature_names_out()

array(['01', '02', '03', ..., '雖然我很愛你', '𝘉𝘦𝘢𝘵𝘴', '𝘊𝘩𝘢𝘸'], dtype=object)

In [55]:
# fit and transform training data into a 'document-term matrix'
X_train_dtm = vectorizer.fit_transform(X_train)

In [56]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(X_train_dtm.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,01,02,03,04,05,06,07,08,09,0917,...,今はまだ悲しい,但只希望的愛,我心中易有親愛,我沒辦法告訴你,我真的没办法,新しい歌,的爱你,雖然我很愛你,𝘉𝘦𝘢𝘵𝘴,𝘊𝘩𝘢𝘸
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5827,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5829,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5830,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# transform testing data into a document-term matrix (using existing vocabulary)
X_test_dtm = vectorizer.transform(X_test)
X_test_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [58]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(X_test_dtm.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,01,02,03,04,05,06,07,08,09,0917,...,今はまだ悲しい,但只希望的愛,我心中易有親愛,我沒辦法告訴你,我真的没办法,新しい歌,的爱你,雖然我很愛你,𝘉𝘦𝘢𝘵𝘴,𝘊𝘩𝘢𝘸
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Part 4: Building and evaluating a model

In [59]:
# Initialize the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train_dtm, y_train)


In [60]:
# Make predictions on the test data
y_pred_class = nb_classifier.predict(X_test_dtm)


In [61]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.746401644962303

In [62]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[  0,   0,   6,   0,   0,   0],
       [  0,   0,  12,   0,   0,   1],
       [  0,   1, 896,  82,   5,  10],
       [  0,   0,  68, 187,   0,   0],
       [  0,   0,  38,  17,   0,   0],
       [  0,   0, 112,  17,   1,   6]], dtype=int64)

In [63]:
# Print the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_class))

              precision    recall  f1-score   support

     country       0.00      0.00      0.00         6
        misc       0.00      0.00      0.00        13
         pop       0.79      0.90      0.84       994
         rap       0.62      0.73      0.67       255
          rb       0.00      0.00      0.00        55
        rock       0.35      0.04      0.08       136

    accuracy                           0.75      1459
   macro avg       0.29      0.28      0.27      1459
weighted avg       0.68      0.75      0.70      1459



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [64]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb_classifier.predict_proba(X_test_dtm)
y_pred_prob

array([[2.28738530e-028, 8.72989658e-020, 1.00000000e+000,
        1.28645413e-014, 5.59059333e-015, 1.37575938e-010],
       [3.58259061e-068, 1.39288935e-054, 1.00000000e+000,
        1.79169598e-011, 7.48908160e-031, 2.35074929e-015],
       [5.20582301e-089, 5.26456452e-070, 1.00000000e+000,
        4.12618914e-028, 2.60133115e-037, 8.01948558e-023],
       ...,
       [1.97378812e-071, 4.23362334e-058, 1.00000000e+000,
        3.03543199e-027, 3.53290222e-035, 4.26937524e-025],
       [5.32249382e-096, 4.71805080e-074, 9.42975046e-001,
        5.70249540e-002, 2.45829884e-016, 2.56648658e-020],
       [2.04057257e-223, 8.43714049e-203, 1.00000000e+000,
        2.44297657e-050, 5.79559988e-042, 1.70192318e-041]])

In [65]:
# calculate AUC
auc_scores = metrics.roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
auc_scores

0.7148977674935058

## Part 5: Examining a model for further insight

In [66]:
# store the vocabulary of X_train
X_train_tokens = vectorizer.get_feature_names_out()
len(X_train_tokens)

45402

In [67]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

['01' '02' '03' '04' '05' '06' '07' '08' '09' '0917' '10' '100' '1008'
 '1020' '1096' '11' '11x' '12' '1234' '12gauge' '12x' '13' '13th' '13x'
 '14' '14344' '15' '15x' '16' '168' '16x' '17' '18' '1861' '187' '1896'
 '19' '1976' '1982' '1997' '1998' '1999' '1fritz' '1igalaw' '1loonie' '1s'
 '1shortone' '1st' '1x' '20']


In [68]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

['zephanie' 'zeppelin' 'zero' 'zesto' 'zeus' 'zev' 'zhayt' 'zhen' 'zia'
 'zido' 'zild' 'zimzalabim' 'zio' 'zjay' 'zo' 'zoids' 'zoilo' 'zombie'
 'zone' 'zonin' 'zoo' 'zoom' 'zoomusulong' 'zoren' 'zsa' 'zuriel' 'zzp'
 '½t' '½y' 'ísipin' 'óo' 'ýo' 'ýyong' 'ʼko' 'еdad' 'еven' 'いつか誰かとまた恋に落ちても'
 'いつもあなただけの場所があるから' 'うたえるまで' 'ㅗㅏㅣoh' '今はまだ悲しい' '但只希望的愛' '我心中易有親愛'
 '我沒辦法告訴你' '我真的没办法' '新しい歌' '的爱你' '雖然我很愛你' '𝘉𝘦𝘢𝘵𝘴' '𝘊𝘩𝘢𝘸']


In [69]:
# Naive Bayes counts the number of times each token appears in each class
nb_classifier.feature_count_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 1., 3., ..., 0., 1., 4.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [70]:
# rows represent classes, columns represent tokens
nb_classifier.feature_count_.shape

(6, 45402)

 country       0.00      0.00      0.00         4
        misc       0.00      0.00      0.00        15
         pop       0.80      0.89      0.84       997
         rap       0.59      0.70      0.64       266
          rb       0.31      0.07      0.11        58
        rock    

In [71]:
country_token_count = nb_classifier.feature_count_[0, :]
misc_token_count = nb_classifier.feature_count_[1, :]
pop_token_count = nb_classifier.feature_count_[2, :]
rap_token_count = nb_classifier.feature_count_[3, :]
rb_token_count = nb_classifier.feature_count_[4, :]
rock_token_count = nb_classifier.feature_count_[5, :]
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'country':country_token_count, 'misc':misc_token_count, 'pop':pop_token_count, 'rap':rap_token_count, 'rb':rb_token_count, 'rock':rock_token_count}).set_index('token')
tokens.head()

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,3.0,0.0,0.0
4,0.0,0.0,0.0,2.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0


In [72]:
# examine 5 random DataFrame rows
tokens.sample(5, random_state=427)

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dalangin,2.0,0.0,164.0,47.0,11.0,13.0
obispo,0.0,0.0,0.0,1.0,0.0,0.0
kalilimutan,0.0,0.0,21.0,6.0,0.0,0.0
takbong,0.0,0.0,0.0,4.0,0.0,0.0
kabuwanang,0.0,0.0,0.0,1.0,0.0,0.0


In [73]:
# add 1 to tag counts to avoid dividing by 0 (1 point)
tokens[['country', 'misc', 'pop', 'rap', 'rb', 'rock']] = tokens[['country', 'misc', 'pop', 'rap', 'rb', 'rock']].apply(lambda x: x + 1)

In [74]:
# convert the tag counts into frequencies
tokens['country'] = tokens['country'] / nb_classifier.class_count_[0]
tokens['misc'] = tokens['misc'] / nb_classifier.class_count_[1]
tokens['pop'] = tokens['pop'] / nb_classifier.class_count_[2]
tokens['rap'] = tokens['rap'] / nb_classifier.class_count_[3]
tokens['rb'] = tokens['rb'] / nb_classifier.class_count_[4]
tokens['rock'] = tokens['rock'] / nb_classifier.class_count_[5]

tokens.sample(5, random_state=427)

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dalangin,0.136364,0.018519,0.041499,0.047059,0.055046,0.02583
obispo,0.045455,0.018519,0.000252,0.001961,0.004587,0.001845
kalilimutan,0.045455,0.018519,0.005533,0.006863,0.004587,0.001845
takbong,0.045455,0.018519,0.000252,0.004902,0.004587,0.001845
kabuwanang,0.045455,0.018519,0.000252,0.001961,0.004587,0.001845


In [75]:
# Assuming you have a DataFrame called tokens with columns: 'token', 'genre1', 'genre2', ..., 'genre6'

# Calculate the ratio of each genre to all other genres for each token
for genre in ['country', 'misc', 'pop', 'rap', 'rb', 'rock']:
    other_genres = [col for col in tokens.columns if col != genre]
    tokens[f'{genre}_ratio'] = tokens[genre] / tokens[other_genres].sum(axis=1)

# Sample 5 rows from the DataFrame
sampled_tokens = tokens.sample(5, random_state=427)

# Print the sampled tokens with the calculated ratios
print(sampled_tokens)


              country      misc       pop       rap        rb      rock  \
token                                                                     
dalangin     0.136364  0.018519  0.041499  0.047059  0.055046  0.025830   
obispo       0.045455  0.018519  0.000252  0.001961  0.004587  0.001845   
kalilimutan  0.045455  0.018519  0.005533  0.006863  0.004587  0.001845   
takbong      0.045455  0.018519  0.000252  0.004902  0.004587  0.001845   
kabuwanang   0.045455  0.018519  0.000252  0.001961  0.004587  0.001845   

             country_ratio  misc_ratio  pop_ratio  rap_ratio  rb_ratio  \
token                                                                    
dalangin          0.725522    0.017956   0.040436   0.044346  0.050154   
obispo            1.673400    0.010720   0.000143   0.001117  0.002616   
kalilimutan       1.217099    0.014452   0.004228   0.005232  0.003477   
takbong           1.509909    0.011818   0.000157   0.003078  0.002874   
kabuwanang        1.673400    

In [76]:
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('pop_ratio', ascending=False)

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock,country_ratio,misc_ratio,pop_ratio,rap_ratio,rb_ratio,rock_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
pasko,0.045455,0.018519,0.151911,0.020588,0.004587,0.009225,0.221913,0.040819,0.420685,0.022547,0.004820,0.009692
la,0.045455,0.018519,0.412726,0.083333,0.591743,0.204797,0.034669,0.013490,0.416053,0.047963,0.463384,0.096269
pagibig,0.409091,0.555556,0.781690,0.285294,0.357798,0.435424,0.169342,0.227814,0.320323,0.087593,0.109347,0.131793
piling,0.045455,0.074074,0.168008,0.082353,0.128440,0.110701,0.080654,0.120326,0.261694,0.083239,0.125124,0.094668
yo,0.090909,0.314815,0.426811,0.330392,0.435780,0.315498,0.049860,0.190884,0.246977,0.159492,0.205011,0.128726
...,...,...,...,...,...,...,...,...,...,...,...,...
yododolehiyo,0.272727,0.018519,0.000252,0.000980,0.004587,0.001845,10.416358,0.001731,0.000023,0.000091,0.000428,0.000172
adedele,0.272727,0.018519,0.000252,0.000980,0.004587,0.001845,10.416358,0.001731,0.000023,0.000091,0.000428,0.000172
pamahiin,0.636364,0.018519,0.000503,0.002941,0.004587,0.001845,22.411205,0.000803,0.000022,0.000127,0.000199,0.000080
ulipon,0.454545,0.018519,0.000252,0.000980,0.004587,0.001845,17.360597,0.001039,0.000014,0.000055,0.000257,0.000103


## Part 6: Tuning the vectorizer

In [77]:
# show default parameters for CountVectorizer
vectorizer

In [78]:
# include 1-grams and 2-grams
vectorizer = CountVectorizer(ngram_range=(1, 2))

In [79]:
# ignore terms that appear in more than 50% of the documents
vectorizer = CountVectorizer(max_df=0.5)

In [80]:
# only keep terms that appear in at least 2 documents
vectorizer = CountVectorizer(min_df=2)

In [81]:
metrics.accuracy_score(y_test, y_pred_class)

0.746401644962303

In [82]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Define the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

# Define the parameters to search
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # Test unigrams and bigrams
    'vect__max_df': [0.5, 0.75, 1.0],       # Test different maximum document frequencies
    'vect__min_df': [1, 2, 5]               # Test different minimum document frequencies
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Print the best accuracy found
print("Best Accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'vect__max_df': 0.5, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}
Best Accuracy: 0.7469130358736024


In [83]:
# Extract the best parameters found by grid search
best_params = grid_search.best_params_
best_params

{'vect__max_df': 0.5, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}

In [84]:
# Initialize CountVectorizer with the best parameters
vectorizer = CountVectorizer(ngram_range=best_params['vect__ngram_range'],
                             max_df=best_params['vect__max_df'],
                             min_df=best_params['vect__min_df'])

# Fit the vectorizer to the entire training data
X_train_vectorized = vectorizer.fit_transform(X_train)

# Initialize and train the Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vectorized, y_train)

# Predict on the test data
X_test_vectorized = vectorizer.transform(X_test)
y_pred_class = clf.predict(X_test_vectorized)

# Calculate accuracy
accuracy = metrics.accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy)

Accuracy: 0.7457162440027416
