In [1]:
#ok lets do this haha

## Part 1: Preliminaries

In [2]:
#import necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk as nltk
from nltk.corpus import stopwords
import string

## Part 2: Reading a text-based dataset into pandas

In [3]:
# Load the CSV file
data = pd.read_csv('processed_lyrics.csv')

In [4]:
# Extract the column names
column_names = data.columns.tolist()

# Print the column names
print("Column Names:", column_names)

Column Names: ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language', 'lyric_length']


In [5]:
data.tag.value_counts()

tag
pop        4970
rap        1275
rock        678
rb          273
misc         67
country      28
Name: count, dtype: int64

In [6]:
data['label_num'] = data.tag.map({'country':0, 'misc':1, 'pop':2, 'rap':3, 'rb':4, 'rock':5})
data.drop('views', axis=1, inplace=True)
data.drop('language_cld3', axis=1, inplace=True)
data.drop('language_ft', axis=1, inplace=True)

In [7]:
data.head(10)

Unnamed: 0,title,tag,artist,year,features,lyrics,id,language,lyric_length,label_num
0,All star,rap,Enchant boyz,2012,{Lil_john},nang unang bibira mabomba bara mag ihaw gamit ...,64812,fil,112,3
1,Balak ni Syke,rap,Gloc-9,2012,{},alak balak lasing kasalukuyan ngunit malaman a...,85710,fil,158,3
2,Apatnapungbara,rap,Gloc-9,2012,"{""Ian Tayao""}",hook ian tayao akoy tutula mahaba nako umupo p...,85711,fil,205,3
3,Silup,rap,Gloc-9,2012,"{""Denise Barcena""}",hook denise mamang pulis pwede ba akong huming...,85713,fil,221,3
4,By Repablikan Syndicate Siobal D,rap,Pagbigyan,2012,{Military-g},pagbiyan mo puso umibig katulafd mo limutin yo...,88704,fil,29,3
5,New Life Song Part Two,rap,R1 one 6 souljhaz,2012,{},choros dios buhay ikay pasasalamtan awitin nil...,102239,fil,40,3
6,New Life Song Part Two,rap,R.1 One Six Souljhaz,2012,{Van.rey},choros dios buhay ikay pasasalamtan awitin nil...,102242,fil,40,3
7,Luha,rap,Repabablikan productions,2013,{Repablikan},lyrics of luha – repablikan magpaparaya mahal ...,178228,fil,224,3
8,NutriJingle,rap,Kamikzee,2013,{},kinukumpleto mo araw tuwing hinahain mo pagkai...,198397,fil,45,3
9,Yeah,rock,Kamikazee,2013,{},1 kinukumpleto mo araw tuwing hinahain mo pagk...,198400,fil,47,5


In [8]:
# Assume the lyrics are in a column named 'lyrics' and the genres are in a column named 'tag'
lyrics = data['lyrics']
genre = data['tag']

In [9]:
# Extracting the vocabulary 
from collections import Counter

# Apply basic splitting to each of the lyrics
split_lyrics = data['lyrics'].apply(lambda x: x.split())

# Ensure each list contains only unique words
each_unique_words = split_lyrics.apply(lambda x: list(set(x)))

# Combine all lists into one list
all_words_list = [word for sublist in each_unique_words for word in sublist]

# Extract overall unique words list and label as vocab
vocab = unique_words = set(all_words_list)

# Number of unique words
num_elements = len(vocab)

In [10]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(lyrics, genre, test_size=0.2, random_state=42)

## Part 3: Vectorizing our dataset

In [11]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the training data
vectorizer.fit(X_train)


In [12]:
# examine the fitted vocabulary
vectorizer.get_feature_names_out()

array(['01', '02', '03', ..., '今はまだ悲しい', '新しい歌', '𝘊𝘩𝘢𝘸'], dtype=object)

In [13]:
# fit and transform training data into a 'document-term matrix'
X_train_dtm = vectorizer.fit_transform(X_train)

In [14]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(X_train_dtm.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,01,02,03,04,05,06,07,08,09,0917,...,еto,еven,いつか誰かとまた恋に落ちても,いつもあなただけの場所があるから,うたえるまで,こんにちは,ㅗㅏㅣoh,今はまだ悲しい,新しい歌,𝘊𝘩𝘢𝘸
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5827,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5829,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5830,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# transform testing data into a document-term matrix (using existing vocabulary)
X_test_dtm = vectorizer.transform(X_test)
X_test_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(X_test_dtm.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,01,02,03,04,05,06,07,08,09,0917,...,еto,еven,いつか誰かとまた恋に落ちても,いつもあなただけの場所があるから,うたえるまで,こんにちは,ㅗㅏㅣoh,今はまだ悲しい,新しい歌,𝘊𝘩𝘢𝘸
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Part 4: Building and evaluating a model

In [17]:
# Initialize the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train_dtm, y_train)


In [18]:
# Make predictions on the test data
y_pred_class = nb_classifier.predict(X_test_dtm)


In [19]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7402330363262508

In [20]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[  0,   0,   4,   0,   0,   0],
       [  0,   0,  14,   1,   0,   0],
       [  0,   2, 889,  92,   6,   8],
       [  0,   0,  76, 185,   3,   2],
       [  0,   0,  34,  20,   4,   0],
       [  0,   1, 100,  16,   0,   2]], dtype=int64)

In [21]:
# Print the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_class))

              precision    recall  f1-score   support

     country       0.00      0.00      0.00         4
        misc       0.00      0.00      0.00        15
         pop       0.80      0.89      0.84       997
         rap       0.59      0.70      0.64       266
          rb       0.31      0.07      0.11        58
        rock       0.17      0.02      0.03       119

    accuracy                           0.74      1459
   macro avg       0.31      0.28      0.27      1459
weighted avg       0.68      0.74      0.70      1459



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb_classifier.predict_proba(X_test_dtm)
y_pred_prob

array([[4.82677938e-122, 1.07810722e-094, 1.00000000e+000,
        4.57253354e-024, 6.13276588e-037, 2.53807255e-020],
       [1.79153426e-215, 6.05678934e-194, 4.29578606e-026,
        1.00000000e+000, 3.36181487e-084, 4.77669756e-065],
       [7.46446115e-056, 6.17293734e-037, 1.00000000e+000,
        3.80290837e-023, 2.81335911e-039, 2.15808094e-018],
       ...,
       [3.01707529e-130, 6.97835078e-104, 1.50472436e-013,
        1.00000000e+000, 3.03697733e-050, 6.94623822e-027],
       [2.10029222e-181, 4.66683441e-129, 1.00000000e+000,
        5.40056942e-039, 4.07663130e-058, 1.06565621e-052],
       [2.63724687e-112, 4.15880493e-093, 1.00000000e+000,
        1.21998256e-020, 1.01161311e-039, 1.22902283e-026]])

In [23]:
# calculate AUC
auc_scores = metrics.roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
auc_scores

0.7087233125883134

## Part 5: Examining a model for further insight

In [24]:
# store the vocabulary of X_train
X_train_tokens = vectorizer.get_feature_names_out()
len(X_train_tokens)

45301

In [25]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

['01' '02' '03' '04' '05' '06' '07' '08' '09' '0917' '10' '100' '1020'
 '104' '1096' '10end' '11' '1187' '11x' '12' '1234' '12x' '13' '13th'
 '13x' '14' '143' '14344' '15' '15x' '16' '168' '16x' '17' '18' '187' '19'
 '1976' '1982' '1997' '1999' '1fritz' '1igalaw' '1loonie' '1s' '1shortone'
 '1st' '1strap' '1x' '20']


In [26]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

['zamboanggenyo' 'zapote' 'zargon' 'zean' 'zebra' 'zel' 'zelijah'
 'zephanie' 'zero' 'zesto' 'zeus' 'zev' 'zhayt' 'zhen' 'zia' 'zido' 'zild'
 'zimzalabim' 'zio' 'zip' 'zjay' 'zo' 'zoids' 'zombie' 'zone' 'zonin'
 'zoo' 'zoom' 'zoomusulong' 'zoren' 'zsa' 'zuriel' 'zzp' 'äôy' 'ísipin'
 'óo' 'ýo' 'ýyong' 'ʼko' 'еdad' 'еto' 'еven' 'いつか誰かとまた恋に落ちても'
 'いつもあなただけの場所があるから' 'うたえるまで' 'こんにちは' 'ㅗㅏㅣoh' '今はまだ悲しい' '新しい歌' '𝘊𝘩𝘢𝘸']


In [27]:
# Naive Bayes counts the number of times each token appears in each class
nb_classifier.feature_count_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 3., ..., 3., 3., 4.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# rows represent classes, columns represent tokens
nb_classifier.feature_count_.shape

(6, 45301)

 country       0.00      0.00      0.00         4
        misc       0.00      0.00      0.00        15
         pop       0.80      0.89      0.84       997
         rap       0.59      0.70      0.64       266
          rb       0.31      0.07      0.11        58
        rock    

In [29]:
country_token_count = nb_classifier.feature_count_[0, :]
misc_token_count = nb_classifier.feature_count_[1, :]
pop_token_count = nb_classifier.feature_count_[2, :]
rap_token_count = nb_classifier.feature_count_[3, :]
rb_token_count = nb_classifier.feature_count_[4, :]
rock_token_count = nb_classifier.feature_count_[5, :]
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'country':country_token_count, 'misc':misc_token_count, 'pop':pop_token_count, 'rap':rap_token_count, 'rb':rb_token_count, 'rock':rock_token_count}).set_index('token')
tokens.head()

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,3.0,0.0,0.0
4,0.0,0.0,0.0,2.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0


In [30]:
# examine 5 random DataFrame rows
tokens.sample(5, random_state=427)

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
greenbelt,0.0,0.0,1.0,0.0,0.0,0.0
napapahinto,0.0,0.0,1.0,0.0,0.0,0.0
band,0.0,0.0,4.0,2.0,0.0,0.0
nagpapalakpakan,0.0,0.0,4.0,0.0,0.0,0.0
pinambili,0.0,0.0,0.0,0.0,0.0,1.0


In [31]:
# add 1 to tag counts to avoid dividing by 0 (1 point)
tokens[['country', 'misc', 'pop', 'rap', 'rb', 'rock']] = tokens[['country', 'misc', 'pop', 'rap', 'rb', 'rock']].apply(lambda x: x + 1)

In [32]:
# convert the tag counts into frequencies
tokens['country'] = tokens['country'] / nb_classifier.class_count_[0]
tokens['misc'] = tokens['misc'] / nb_classifier.class_count_[1]
tokens['pop'] = tokens['pop'] / nb_classifier.class_count_[2]
tokens['rap'] = tokens['rap'] / nb_classifier.class_count_[3]
tokens['rb'] = tokens['rb'] / nb_classifier.class_count_[4]
tokens['rock'] = tokens['rock'] / nb_classifier.class_count_[5]

tokens.sample(5, random_state=427)

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
greenbelt,0.041667,0.019231,0.000503,0.000991,0.004651,0.001789
napapahinto,0.041667,0.019231,0.000503,0.000991,0.004651,0.001789
band,0.041667,0.019231,0.001258,0.002973,0.004651,0.001789
nagpapalakpakan,0.041667,0.019231,0.001258,0.000991,0.004651,0.001789
pinambili,0.041667,0.019231,0.000252,0.000991,0.004651,0.003578


In [33]:
# Assuming you have a DataFrame called tokens with columns: 'token', 'genre1', 'genre2', ..., 'genre6'

# Calculate the ratio of each genre to all other genres for each token
for genre in ['country', 'misc', 'pop', 'rap', 'rb', 'rock']:
    other_genres = [col for col in tokens.columns if col != genre]
    tokens[f'{genre}_ratio'] = tokens[genre] / tokens[other_genres].sum(axis=1)

# Sample 5 rows from the DataFrame
sampled_tokens = tokens.sample(5, random_state=427)

# Print the sampled tokens with the calculated ratios
print(sampled_tokens)


                  country      misc       pop       rap        rb      rock  \
token                                                                         
greenbelt        0.041667  0.019231  0.000503  0.000991  0.004651  0.001789   
napapahinto      0.041667  0.019231  0.000503  0.000991  0.004651  0.001789   
band             0.041667  0.019231  0.001258  0.002973  0.004651  0.001789   
nagpapalakpakan  0.041667  0.019231  0.001258  0.000991  0.004651  0.001789   
pinambili        0.041667  0.019231  0.000252  0.000991  0.004651  0.003578   

                 country_ratio  misc_ratio  pop_ratio  rap_ratio  rb_ratio  \
token                                                                        
greenbelt             1.533818    0.012145   0.000312   0.000614  0.002887   
napapahinto           1.533818    0.012145   0.000312   0.000614  0.002887   
band                  1.393414    0.013302   0.000852   0.002014  0.003150   
nagpapalakpakan       1.492337    0.012466   0.000800   

In [34]:
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('pop_ratio', ascending=False)

Unnamed: 0_level_0,country,misc,pop,rap,rb,rock,country_ratio,misc_ratio,pop_ratio,rap_ratio,rb_ratio,rock_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
la,0.041667,0.019231,0.419079,0.155600,0.641860,0.257603,0.027901,0.012458,0.362425,0.087306,0.464017,0.115437
pasko,0.041667,0.057692,0.154040,0.023786,0.004651,0.008945,0.167260,0.144106,0.343758,0.025795,0.004810,0.009244
piling,0.041667,0.076923,0.177448,0.073340,0.130233,0.103757,0.074180,0.128072,0.282483,0.072273,0.126422,0.087703
kapiling,0.041667,0.019231,0.099421,0.042616,0.065116,0.046512,0.152683,0.042924,0.242048,0.060057,0.087152,0.054533
koy,0.208333,0.346154,0.377297,0.266601,0.190698,0.134168,0.158438,0.259187,0.241303,0.139175,0.089502,0.058931
...,...,...,...,...,...,...,...,...,...,...,...,...
pamahiin,0.583333,0.019231,0.000503,0.003964,0.004651,0.001789,19.355050,0.000964,0.000025,0.000199,0.000233,0.000090
inayran,0.333333,0.019231,0.000252,0.000991,0.004651,0.001789,12.385303,0.001511,0.000020,0.000078,0.000365,0.000140
selim,0.458333,0.019231,0.000252,0.000991,0.004651,0.008945,13.452990,0.001381,0.000018,0.000071,0.000334,0.000642
tantananan,0.375000,0.019231,0.000252,0.000991,0.004651,0.001789,13.933466,0.001343,0.000018,0.000069,0.000325,0.000125


## Part 6: Tuning the vectorizer

In [35]:
# show default parameters for CountVectorizer
vectorizer

In [36]:
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))

In [37]:
# ignore terms that appear in more than 50% of the documents
vect = CountVectorizer(max_df=0.5)

In [38]:
# only keep terms that appear in at least 2 documents
vect = CountVectorizer(min_df=2)

In [39]:
metrics.accuracy_score(y_test, y_pred_class)

0.7402330363262508

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Define the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

# Define the parameters to search
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # Test unigrams and bigrams
    'vect__max_df': [0.5, 0.75, 1.0],       # Test different maximum document frequencies
    'vect__min_df': [1, 2, 5]               # Test different minimum document frequencies
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Print the best accuracy found
print("Best Accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'vect__max_df': 1.0, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}
Best Accuracy: 0.7517160742605764


In [48]:
# Extract the best parameters found by grid search
best_params = grid_search.best_params_
best_params

{'vect__max_df': 1.0, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}

In [49]:
# Initialize CountVectorizer with the best parameters
vectorizer = CountVectorizer(ngram_range=best_params['vect__ngram_range'],
                             max_df=best_params['vect__max_df'],
                             min_df=best_params['vect__min_df'])

# Fit the vectorizer to the entire training data
X_train_vectorized = vectorizer.fit_transform(X_train)

# Initialize and train the Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vectorized, y_train)

# Predict on the test data
X_test_vectorized = vectorizer.transform(X_test)
y_pred_class = clf.predict(X_test_vectorized)

# Calculate accuracy
accuracy = metrics.accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy)

Accuracy: 0.7402330363262508
