In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string

import nltk
from nltk.corpus import stopwords

## TRAINING WORD2VEC MODEL ON CORPUS

In [2]:
df = pd.read_pickle('/work/NLP_Project/word2vec_tokenized.pkl')

In [6]:
from sklearn.model_selection import train_test_split

# Get the size of the subsample as a fraction of the whole data
subsample_fraction = 10000 / len(df)  # Replace 10000 with your desired subsample size

_, df_subsample = train_test_split(df, test_size=subsample_fraction, stratify=df['tag'], random_state=42)

# Reset the index for convenience
df_subsample.reset_index(drop=True, inplace=True)

df = df_subsample

ValueError: test_size=10.0 should be either positive and smaller than the number of samples 10000 or a float in the (0, 1) range

In [7]:
df

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_word_count,tokenized_lyrics
0,The Bastard Son of Satan Jesus Christ,rock,Nefarium,2010,uproot my faith and i will kiss your brow naza...,872923,196,"[uproot, my, faith, and, i, will, kiss, your, ..."
1,Spooky,rap,E-40,2011,hard drugs you knuckle head and thugs fake id...,419287,462,"[hard, drugs, you, knuckle, head, and, thugs, ..."
2,Sweet Hours,pop,Beth Rowley,2008,hours please be kind to be today as i quickly ...,1590836,250,"[hours, please, be, kind, to, be, today, as, i..."
3,I Know but I Dont Know,rock,Blondie,1978,hey you know oh i dont know i know but i dont...,193412,168,"[hey, you, know, oh, i, dont, know, i, know, b..."
4,Maradona,rap,Fox,2016,golaaaaazooo golaaaaazooo diegoooool marado...,2871527,339,"[golaaaaazooo, golaaaaazooo, diegoooool, marad..."
...,...,...,...,...,...,...,...,...
9995,Awesome Is The Lord Most High,rock,Chris Tomlin,2006,great are you lord mighty in strength you are...,446254,109,"[great, are, you, lord, mighty, in, strength, ..."
9996,If I Catch You,pop,Michel Tel,2012,ow wow this way you going to kill me oh if i ...,207197,76,"[ow, wow, this, way, you, going, to, kill, me,..."
9997,Chains Off,rap,Reezy Mw,2019,intro reezy x excellent man fuck what they al...,5157418,482,"[intro, reezy, x, excellent, man, fuck, what, ..."
9998,Flowers n Perfume,pop,Wstdyth,2021,wakin in a dream think i remember this land sh...,6628215,295,"[wakin, in, a, dream, think, i, remember, this..."


In [5]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(df['tokenized_lyrics'].to_list(), min_count=5, workers=31, window=5)


In [6]:
w2v_model.save("original_w2v.model")

## LSTM

In [8]:
from gensim.models import Word2Vec

w2v_model = Word2Vec.load("/work/NLP_Project/GenreFromLyricsShared/Word2VecModels/original_w2v.model")

In [9]:
# STEP 1 - TOKENIZE WORDS TO INDICES

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

# tokenizer needs a list of texts - df column is a Series - pass list of lists 
tokenizer.fit_on_texts(df['tokenized_lyrics'].tolist())




2023-05-24 20:45:18.295920: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
#STEP 2 - TRANSFORM TEXT TO SEQUENCES - TOKENIZER CONVERTS LYRICS INTO SEQUENCE OF INTEGERS

sequences = tokenizer.texts_to_sequences(df['tokenized_lyrics'].tolist())


In [11]:
#STEP 3 - CALCULATE SEQUENCE LENGTH - 75TH PERCENTILE AS TOO LARGE TO TAKE MORE - 374 WORDS

#calculate 75th percentile seq length
lengths = [len(sequence) for sequence in sequences]
max_sequence_length = int(np.percentile(lengths, 75))

In [13]:
max_sequence_length

374

In [14]:
#STEP 4 - TRUNCATE OR PAD LYRICS TO THE 374TH INTEGER

from tensorflow.keras.preprocessing.sequence import pad_sequences
sequences = pad_sequences(sequences, maxlen=max_sequence_length)


In [15]:
len(sequences)

10000

In [16]:
#STEP 5 - CREATING AN 'EMBEDDING MATRIX' - NUMWORDS * EMBEDDING DIMENSION - EACH ROW REPRESENTS A WORDS EMBEDDING VECTOR.
# ITERATE OVER EACH WORD IN THE TOKENIZER VOCAB (ALL WORDS FROM THE TOKENIZED LYRICS COLUMN) - IF EXISTS, INCLUDE IN EMBEDDING MATRIX

#columns in embedding matrix - same size as word2vec vector
embedding_dim = 100  

#zero matrix 
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))

#iterate through through tokenizer vocab - if word is in the word2vec model vocab, find vector and add it to matrix at same index
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv.key_to_index:
        embedding_vector = w2v_model.wv[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [17]:
embedding_matrix.shape

(65063, 100)

In [60]:
# STEP 6 - BUILD MODEL, WHICH INCLUDES EMBEDDING LAYER. WE HAVE PRE-TRAINED OUR EMBEDDINGS WITH THE WORD2VEC EMBEDDINGS SO IT DOES NOT TRAIN / LEARN FROM THE DATA IN A STANDARD WAY.
# WE NEED THIS EMBEDDING LAYER AS CAN'T FEED RAW WORDS INTO NN - TOO SPARSE. 
# INPUT DIM = VOCAB SIZE
# OUTPUT DIM = VECTOR SPACE SIZE IN WHICH WORDS ARE EMBEDDED - WE CHOSE 100 IN WORD2VEC
# WEIGHTS = EMBEDDING MATRIX CHOSEN. THE ITH ROW IS THE PRE-TRAINED VECTOR THE WORD OF INDEX I
# INPUT LENGTH = THE MAX LENGTH WE FEED IN - WE TRUNCATED/PADDED TO 374
# DROPOUT - HELPS TO PREVENT OVERTFITTING BY ADDING NOISE TO OUTPUTS - GENERALIZES BETTER
# RECURRENT DROPOUT - APPLIED TO RECURRENT INPUTS - RANDOMLY SETS FRACTION OF INPUT UNITS TO 0 AT EACH UPDATE

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_sequence_length,
                    trainable=False))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(set(df['tag'])), activation='softmax'))


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 374, 100)          6506300   
                                                                 
 lstm_8 (LSTM)               (None, 128)               117248    
                                                                 
 dense_13 (Dense)            (None, 5)                 645       
                                                                 
Total params: 6,624,193
Trainable params: 117,893
Non-trainable params: 6,506,300
_________________________________________________________________


In [63]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Stacked LSTM
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False))
model.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))  # Return sequences for stacked LSTMs
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(set(df['tag'])), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 374, 100)          6506300   
                                                                 
 lstm_9 (LSTM)               (None, 374, 128)          117248    
                                                                 
 lstm_10 (LSTM)              (None, 128)               131584    
                                                                 
 dense_14 (Dense)            (None, 5)                 645       
                                                                 
Total params: 6,755,777
Trainable params: 249,477
Non-trainable params: 6,506,300
_________________________________________________________________


In [68]:
from keras.layers import Bidirectional

# Bidirectional LSTM
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                                  output_dim=embedding_dim,
                                  weights=[embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=False))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(len(set(df['tag'])), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 374, 100)          6506300   
                                                                 
 bidirectional (Bidirectiona  (None, 256)              234496    
 l)                                                              
                                                                 
 dense_15 (Dense)            (None, 5)                 1285      
                                                                 
Total params: 6,742,081
Trainable params: 235,781
Non-trainable params: 6,506,300
_________________________________________________________________


In [80]:
from keras.layers import Bidirectional
from keras.layers import Dropout
from keras.regularizers import l2

# Bidirectional LSTM
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                                  output_dim=embedding_dim,
                                  weights=[embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=False))
model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)))
model.add(Dense(len(set(df['tag'])), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


model.summary()




Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 374, 100)          6506300   
                                                                 
 bidirectional_3 (Bidirectio  (None, 256)              234496    
 nal)                                                            
                                                                 
 dense_18 (Dense)            (None, 5)                 1285      
                                                                 
Total params: 6,742,081
Trainable params: 235,781
Non-trainable params: 6,506,300
_________________________________________________________________


In [47]:
# STEP 6 - BUILD MODEL, WHICH INCLUDES EMBEDDING LAYER. WE HAVE PRE-TRAINED OUR EMBEDDINGS WITH THE WORD2VEC EMBEDDINGS SO IT DOES NOT TRAIN / LEARN FROM THE DATA IN A STANDARD WAY.
# WE NEED THIS EMBEDDING LAYER AS CAN'T FEED RAW WORDS INTO NN - TOO SPARSE. 
# INPUT DIM = VOCAB SIZE
# OUTPUT DIM = VECTOR SPACE SIZE IN WHICH WORDS ARE EMBEDDED - WE CHOSE 100 IN WORD2VEC
# WEIGHTS = EMBEDDING MATRIX CHOSEN. THE ITH ROW IS THE PRE-TRAINED VECTOR THE WORD OF INDEX I
# INPUT LENGTH = THE MAX LENGTH WE FEED IN - WE TRUNCATED/PADDED TO 374
# DROPOUT - HELPS TO PREVENT OVERTFITTING BY ADDING NOISE TO OUTPUTS - GENERALIZES BETTER
# RECURRENT DROPOUT - APPLIED TO RECURRENT INPUTS - RANDOMLY SETS FRACTION OF INPUT UNITS TO 0 AT EACH UPDATE

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_sequence_length,
                    trainable=False))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(len(set(df['tag'])), activation='softmax'))


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 374, 100)          6506300   
                                                                 
 lstm_6 (LSTM)               (None, 100)               80400     
                                                                 
 dense_6 (Dense)             (None, 1024)              103424    
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense_7 (Dense)             (None, 1024)              1049600   
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0         
                                                                 
 dense_8 (Dense)             (None, 5)                

In [57]:
# STEP 6 - BUILD MODEL, WHICH INCLUDES EMBEDDING LAYER. WE HAVE PRE-TRAINED OUR EMBEDDINGS WITH THE WORD2VEC EMBEDDINGS SO IT DOES NOT TRAIN / LEARN FROM THE DATA IN A STANDARD WAY.
# WE NEED THIS EMBEDDING LAYER AS CAN'T FEED RAW WORDS INTO NN - TOO SPARSE. 
# INPUT DIM = VOCAB SIZE
# OUTPUT DIM = VECTOR SPACE SIZE IN WHICH WORDS ARE EMBEDDED - WE CHOSE 100 IN WORD2VEC
# WEIGHTS = EMBEDDING MATRIX CHOSEN. THE ITH ROW IS THE PRE-TRAINED VECTOR THE WORD OF INDEX I
# INPUT LENGTH = THE MAX LENGTH WE FEED IN - WE TRUNCATED/PADDED TO 374
# DROPOUT - HELPS TO PREVENT OVERTFITTING BY ADDING NOISE TO OUTPUTS - GENERALIZES BETTER
# RECURRENT DROPOUT - APPLIED TO RECURRENT INPUTS - RANDOMLY SETS FRACTION OF INPUT UNITS TO 0 AT EACH UPDATE

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_sequence_length,
                    trainable=False))
model.add(LSTM(64, dropout=0.2))
model.add(Dense(len(set(df['tag'])), activation='softmax'))


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 374, 100)          6506300   
                                                                 
 lstm_7 (LSTM)               (None, 64)                42240     
                                                                 
 dense_12 (Dense)            (None, 5)                 325       
                                                                 
Total params: 6,548,865
Trainable params: 42,565
Non-trainable params: 6,506,300
_________________________________________________________________


2023-05-24 21:42:05.268337: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-24 21:42:05.269583: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-24 21:42:05.271288: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [81]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

#genres converted to integer labels and then into one-hot format for categorical cross entropy 
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['tag'])
categorical_labels = to_categorical(integer_encoded)

X_temp, X_test, y_temp, y_test = train_test_split(sequences, categorical_labels, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


from sklearn.utils import class_weight
from numpy import argmax

# convert one-hot encoded y_train back to label encoded
y_train_labels = argmax(y_train, axis=1)

# calculate class weights
classes = np.unique(y_train_labels)
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train_labels)
class_weights_dict = dict(enumerate(class_weights))

# train the model with class weights
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32, class_weight=class_weights_dict)




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
 21/110 [====>.........................] - ETA: 33s - loss: 0.4408 - accuracy: 0.7411

KeyboardInterrupt: 

In [43]:
class_weights_dict

{0: 7.6923076923076925,
 1: 0.45691906005221933,
 2: 0.6802721088435374,
 3: 4.093567251461988,
 4: 1.03397341211226}

In [70]:
from sklearn.metrics import classification_report
from numpy import argmax

# Predict class probabilities on the test set
y_prob = model.predict(X_test)

# Convert probabilities to class labels
y_pred = argmax(y_prob, axis=1)

# Convert one-hot encoded y_test to class labels
y_true = argmax(y_test, axis=1)

# Print classification report
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))


              precision    recall  f1-score   support

     country       0.08      0.05      0.06        82
         pop       0.59      0.60      0.60      1298
         rap       0.81      0.80      0.80       894
          rb       0.11      0.13      0.12       123
        rock       0.38      0.37      0.38       603

    accuracy                           0.58      3000
   macro avg       0.39      0.39      0.39      3000
weighted avg       0.58      0.58      0.58      3000



In [76]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

#genres converted to integer labels and then into one-hot format for categorical cross entropy 
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['tag'])
categorical_labels = to_categorical(integer_encoded)

X_temp, X_test, y_temp, y_test = train_test_split(sequences, categorical_labels, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32)


Epoch 1/5
 13/110 [==>...........................] - ETA: 35s - loss: 1.0809 - accuracy: 0.6442

KeyboardInterrupt: 

In [75]:
from sklearn.metrics import classification_report
from numpy import argmax

# Predict class probabilities on the test set
y_prob = model.predict(X_test)

# Convert probabilities to class labels
y_pred = argmax(y_prob, axis=1)

# Convert one-hot encoded y_test to class labels
y_true = argmax(y_test, axis=1)

# Print classification report
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))


              precision    recall  f1-score   support

     country       0.00      0.00      0.00        82
         pop       0.58      0.79      0.67      1298
         rap       0.75      0.83      0.79       894
          rb       0.00      0.00      0.00       123
        rock       0.48      0.19      0.27       603

    accuracy                           0.63      3000
   macro avg       0.36      0.36      0.35      3000
weighted avg       0.57      0.63      0.58      3000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
class_weights

array([7.69230769, 0.45691906, 0.68027211, 4.09356725, 1.03397341])

In [None]:
labels = LabelEncoder().fit_transform(df['tag'])
sample_weights = dict(zip(labels, class_weights))