In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string

import nltk
from nltk.corpus import stopwords

## TRAINING WORD2VEC MODEL ON CORPUS

In [2]:
# Load the pre-tokenized lyrics corpus.
# NOTE(review): this is an absolute, machine-specific path, and read_pickle
# executes whatever the pickle contains - only load files you produced yourself.
corpus_pickle_path = '/work/NLP_Project/word2vec_tokenized.pkl'
df = pd.read_pickle(corpus_pickle_path)

In [3]:
# Preview the corpus: one row per song with title, tag (genre), artist, year,
# lyrics, id, lyrics_word_count and the tokenized_lyrics token list.
df

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_word_count,tokenized_lyrics
0,Killa Cam,rap,Cam'ron,2004,killa cam killa cam cam killa cam killa cam k...,1,762,"[killa, cam, killa, cam, cam, killa, cam, kill..."
1,Can I Live,rap,JAY-Z,1996,yeah hah yeah rocafella we invite you to so...,3,548,"[yeah, hah, yeah, rocafella, we, invite, you, ..."
2,Forgive Me Father,rap,Fabolous,2003,maybe cause im eatin and these bastards fiend ...,4,574,"[maybe, cause, im, eatin, and, these, bastards..."
3,Down and Out,rap,Cam'ron,2004,ugh killa baby kanye this that 1970s heron ...,5,760,"[ugh, killa, baby, kanye, this, that, 1970s, h..."
4,Fly In,rap,Lil Wayne,2005,so they ask me young boy what you gon do the ...,6,432,"[so, they, ask, me, young, boy, what, you, gon..."
...,...,...,...,...,...,...,...,...
5913399,Everything Is Alright Now,pop,Chuck Bernard,2013,everything is alright now oh yes baby everythi...,7882838,63,"[everything, is, alright, now, oh, yes, baby, ..."
5913401,White Lies,pop,ElementD,2019,half truth and half you didnt we say were thr...,7882840,171,"[half, truth, and, half, you, didnt, we, say, ..."
5913403,Ocean,pop,Effemar,2022,dance for me now keeping yourself moving your...,7882842,166,"[dance, for, me, now, keeping, yourself, movin..."
5913406,Raise Our Hands,pop,"Culture Code, Pag & Mylo",2016,here our purpose feels alive we are more than...,7882845,184,"[here, our, purpose, feels, alive, we, are, mo..."


In [5]:
from gensim.models import Word2Vec

# Train Word2Vec on the corpus, one "sentence" per song (its token list).
# vector_size is spelled out (100 is the gensim 4 default) because the LSTM
# section builds its embedding matrix with the same width.
w2v_model = Word2Vec(
    sentences=df['tokenized_lyrics'].to_list(),
    vector_size=100,
    window=5,       # context words considered on each side of the target
    min_count=5,    # words seen fewer than 5 times get no vector
    workers=31,     # training threads
)


In [6]:
# Persist the trained model so the LSTM section can reload it without retraining.
w2v_model_path = "original_w2v.model"
w2v_model.save(w2v_model_path)

## LSTM

In [4]:
from gensim.models import Word2Vec

# Reload the Word2Vec model saved by the training section.
w2v_model_path = "original_w2v.model"
w2v_model = Word2Vec.load(w2v_model_path)

In [5]:
# STEP 1 - TOKENIZE WORDS TO INDICES

from keras.preprocessing.text import Tokenizer

# fit_on_texts needs a plain list of texts rather than a Series; every lyric is
# already a list of tokens, so a list of lists is passed.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(df['tokenized_lyrics']))




2023-05-23 20:00:15.234629: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# STEP 2 - TRANSFORM TEXT TO SEQUENCES
# The fitted tokenizer maps every lyric (token list) to a list of integer word indices.

sequences = tokenizer.texts_to_sequences(list(df['tokenized_lyrics']))


In [7]:
# STEP 3 - CALCULATE SEQUENCE LENGTH
# Use the 75th percentile of lyric lengths as the cut-off (374 words on this
# corpus); keeping more than that was judged too large.

# length, in tokens, of every lyric
lengths = list(map(len, sequences))

# np.percentile returns a float; truncate it to a whole number of tokens
max_sequence_length = int(np.percentile(lengths, 75))

In [8]:
# Show the chosen cut-off length (75th percentile of lyric lengths).
max_sequence_length

374

In [9]:
# STEP 4 - TRUNCATE OR PAD EVERY LYRIC TO max_sequence_length (374) INTEGERS

from tensorflow.keras.preprocessing.sequence import pad_sequences

# The pad_sequences defaults are written out so the behaviour is visible:
#   padding='pre'    -> shorter lyrics get zeros added at the front
#   truncating='pre' -> longer lyrics lose their leading tokens, i.e. the LAST
#                       max_sequence_length tokens are the ones kept
# `sequences` is rebound from a list of lists to a single 2-D integer array.
sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='pre',
    truncating='pre',
)


In [11]:
# Number of padded sequences - one per lyric passed to the tokenizer.
len(sequences)

3315185

In [12]:
# STEP 5 - CREATE THE EMBEDDING MATRIX - (VOCAB SIZE + 1) x EMBEDDING DIMENSION
# Row i holds the Word2Vec vector of the word whose tokenizer index is i.
# Rows stay all-zero for index 0 (never assigned to a word by the tokenizer, used
# for padding) and for every tokenizer word missing from the Word2Vec vocabulary.

# Take the vector width from the loaded model instead of hard-coding 100, so the
# matrix can never silently disagree with the vectors copied into it.
embedding_dim = w2v_model.wv.vector_size

# +1 because tokenizer indices start at 1
vocab_size = len(tokenizer.word_index) + 1

# float32 matches the dtype of the gensim vectors and of Keras layer weights;
# the float64 default would double the memory of this very large matrix for
# no gain in precision.
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)

# Copy the pre-trained vector of every tokenizer word that Word2Vec knows about.
# The membership test already guarantees that a vector exists, so no further
# None check is needed.
w2v_vocab = w2v_model.wv.key_to_index
for word, i in tokenizer.word_index.items():
    if word in w2v_vocab:
        embedding_matrix[i] = w2v_model.wv[word]


In [14]:
# Sanity check: (len(tokenizer.word_index) + 1, embedding_dim).
embedding_matrix.shape

(3032678, 100)

In [17]:
# STEP 6 - BUILD THE MODEL: FROZEN EMBEDDING LAYER -> LSTM -> SOFTMAX OVER GENRES
# Raw words cannot be fed to a neural network, so the embedding layer maps every
# word index to a dense vector. The layer is initialised with the pre-trained
# Word2Vec vectors and frozen (trainable=False), so it is not updated in training.
#   input_dim         = vocabulary size (+1 for the unused index 0)
#   output_dim        = width of the embedding vectors (embedding_dim)
#   weights           = embedding matrix; row i is the pre-trained vector of word index i
#   input_length      = length every lyric was truncated/padded to (max_sequence_length)
#   dropout           = fraction of the LSTM input units dropped, to reduce overfitting
#   recurrent_dropout = fraction of the recurrent-state units dropped
# NOTE(review): padding zeros are not masked (mask_zero is left at its default),
# so the LSTM also steps over the padded positions - confirm this is intended.

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# one output unit per distinct genre tag
num_genres = len(set(df['tag']))

model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_sequence_length,
        trainable=False,
    ),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_genres, activation='softmax'),
])

# categorical cross-entropy expects one-hot encoded genre labels
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 374, 100)          303267800 
                                                                 
 lstm_1 (LSTM)               (None, 64)                42240     
                                                                 
 dense_1 (Dense)             (None, 5)                 325       
                                                                 
Total params: 303,310,365
Trainable params: 42,565
Non-trainable params: 303,267,800
_________________________________________________________________


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Genre tags -> integer labels -> one-hot rows, as required by categorical cross-entropy.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['tag'])
categorical_labels = to_categorical(integer_encoded)

# First split: hold out 30% of the songs as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    sequences,
    categorical_labels,
    test_size=0.3,
    random_state=42,
)

# Second split: divide the remaining 70% equally into train and validation.
# NOTE(review): this gives 35% train / 35% validation / 30% test, and neither
# split is stratified by genre - confirm these proportions are intended.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Train for 5 epochs, reporting validation loss/accuracy after each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=32,
)


Epoch 1/5