In [1]:
import pandas as pd
import nltk
import tensorflow as tf

In [27]:
# Standard library
from itertools import chain

# NLTK: tokenizers, lemmatizer and the stop-word corpus reader
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# gensim: embedding model plus word2vec/GloVe format helpers
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# TensorBoard helpers
from tensorboard import summary as summary_lib
from tensorboard.plugins import projector

# Fetch the NLTK data packages used by the tokenizers, the lemmatizer and the
# stop-word list (same packages, same order as before).
for nltk_package in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_package)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Creating Custom Word Embeddings (Word2Vec) for lyrical data representation

I will be using the corpus of the entire lyrics set, not just the ones that I have randomly chosen for my training and testing sets.

In [3]:
# Load the full lyrics dataset (the entire lyrics set, not only the sampled
# training/testing songs).
full_lyrics_csv = '/content/drive/MyDrive/Colab Notebooks/DATA340_NLP_FINAL/train.csv'
corpus_data = pd.read_csv(full_lyrics_csv)

# Report the row count, then preview the first rows.
print(corpus_data.shape[0])
corpus_data.head()

290183


Unnamed: 0,Artist,Song,Genre,Language,Lyrics
0,12 stones,world so cold,Rock,en,"It starts with pain, followed by hate\nFueled ..."
1,12 stones,broken,Rock,en,Freedom!\nAlone again again alone\nPatiently w...
2,12 stones,3 leaf loser,Rock,en,"Biting the hand that feeds you, lying to the v..."
3,12 stones,anthem for the underdog,Rock,en,You say you know just who I am\nBut you can't ...
4,12 stones,adrenaline,Rock,en,My heart is beating faster can't control these...


First, I must process the data in order to get one singular text file for the corpus. In the context of this project, the corpus is the collection of songs, and the documents are the lyrics of each individual song.

Since the songs need to be separated by new line characters, but the song lyrics as is contain new line characters that preserve the structure of the songs, I must clean the Lyrics data and replace all new line characters with whitespace.

I will only choose the English songs, but will include songs of all genres.

In [4]:
# Keep only the rows whose Language value is exactly 'en'; every other row
# (including rows where Language is missing) is excluded.
is_english = corpus_data['Language'] == 'en'
corpus_data = corpus_data.loc[is_english].copy()

# Report how many rows remain.
print(corpus_data.shape[0])

250197


In [5]:
# Replace the newline characters inside each song with spaces so that every song
# sits on a single line; '\n' is used later as the separator between songs.
corpus_data['Lyrics'] = corpus_data['Lyrics'].str.replace('\n', ' ')

In [6]:
# Preview the first rows to confirm the lyrics no longer contain '\n'.
corpus_data.head()

Unnamed: 0,Artist,Song,Genre,Language,Lyrics
0,12 stones,world so cold,Rock,en,"It starts with pain, followed by hate Fueled b..."
1,12 stones,broken,Rock,en,Freedom! Alone again again alone Patiently wai...
2,12 stones,3 leaf loser,Rock,en,"Biting the hand that feeds you, lying to the v..."
3,12 stones,anthem for the underdog,Rock,en,You say you know just who I am But you can't i...
4,12 stones,adrenaline,Rock,en,My heart is beating faster can't control these...


Next, I will combine all of the lyrics and join them by \n, and put them into a txt file.

In [7]:
file_path = '/content/drive/MyDrive/Colab Notebooks/DATA340_NLP_FINAL/lyrics_corpus.txt'

# Skip rows with missing lyrics: `astype(str)` would otherwise turn NaN into the
# literal text 'nan' and write it into the corpus as if it were a song.
values = corpus_data['Lyrics'].dropna().astype(str).values

# Write one song per line, with a trailing newline after the last song.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the lyrics do
# not depend on the platform's default encoding.
with open(file_path, 'w', encoding='utf-8') as file:
    file.write('\n'.join(values) + '\n')

Confirming that the file was written correctly.

In [8]:
file_path = '/content/drive/MyDrive/Colab Notebooks/DATA340_NLP_FINAL/lyrics_corpus.txt'

# Sanity check: read the first ten lines of the corpus file back, stripping
# surrounding whitespace from each.
lines = []
with open(file_path, 'r') as file:
    for _ in range(10):
        lines.append(file.readline().strip())

# Show each stripped line on its own row.
print('\n'.join(lines))

It starts with pain, followed by hate Fueled by the endless questions no one can answer A stain covers your heart and tears you apart Just like a sleeping cancer I don't believe men are born to be killers I don't believe the world can be saved How did you get here and when did it start? An innocent child with a thorn in his heart What kind of world do we live in? Where love is divided by hate Loosing control of our feelings We all must be dreaming this life away In a world so cold Are you sane, where's the shame? A moment of time passes by you cannot rewind Who's to blame and where did it start? Is there a cure for your sickness Have you no heart? I don't believe men are born to be killers I don't believe the world can't be saved How did you get here and when did it start? An innocent child with a thorn in his heart What kind of world do we live in? Where love is divided by hate Selling our soul for no reason We all must be dreaming this life away In a world so cold, In a world so cold

Next, I must tokenize the corpus in order to make the embedding vectors.

I will use the NLTK package and lemmatization, taking the unique words from the corpus that are not stop words.

In [9]:
file_path = '/content/drive/MyDrive/Colab Notebooks/DATA340_NLP_FINAL/lyrics_corpus.txt'

# Load the entire corpus file into one in-memory string for tokenization.
with open(file_path, 'r') as corpus_file:
    corpus_contents = corpus_file.read()

In [10]:
# Split the whole corpus string into NLTK word tokens and report how many there are.
nltk_tokens = word_tokenize(corpus_contents)
token_count = len(nltk_tokens)
print(token_count)

70969510


In [11]:
# Lemmatize every token with the WordNet lemmatizer (no part-of-speech argument
# is supplied), producing exactly one lemma per token.
nltk_lemmatizer = WordNetLemmatizer()
lemmatize_token = nltk_lemmatizer.lemmatize
nltk_lemmas = list(map(lemmatize_token, nltk_tokens))

# The lemma count should match the token count printed above.
print(len(nltk_lemmas))

70969510


In [12]:
# Collapse the lemma list to its distinct entries (going through a set, so the
# resulting order is arbitrary), then show the vocabulary size.
corpus_vocabulary = [*set(nltk_lemmas)]
len(corpus_vocabulary)

286078

In [13]:
# Build the English stop-word set once, outside the comprehension, so the
# stop-word list is not re-read and re-hashed for every vocabulary entry.
english_stopwords = set(stopwords.words('english'))

# Keep the vocabulary entries whose lowercase form is not a stop word.
corpus_vocab = [word for word in corpus_vocabulary if word.lower() not in english_stopwords]
print(len(corpus_vocab))

285566


#### Tokenize/Preprocess the cleaned and separated training data to create custom embeddings

In [14]:
# Load the cleaned training split from its CSV file.
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DATA340_NLP_FINAL/train_cleaned_data.csv')

In [15]:
# Display the training frame (pandas shows a truncated view of large frames).
train_data

Unnamed: 0.1,Unnamed: 0,Artist,Song,Genre,Language,Lyrics
0,29062,keane,maybe i can change,Rock,en,"Now, will it ever be the same again?\nAnd it w..."
1,171666,grouplove,ways to go,Rock,en,I didn't ask for this\nYou give me heart attac...
2,116911,pink,living in my world,Rock,en,So many rights you have\nLeave your girl\nBut ...
3,54524,sonic youth,contre le sexisme,Rock,en,"describe the touch, no itâ€™s the thought\nitâ..."
4,194015,the birthday massacre,weekend,Rock,en,"Lights out, boys sleepwalking on the weekend\n..."
...,...,...,...,...,...,...
18895,225434,steve,zealous core,R&B,en,"Well, where's the rhythm I've been searching f..."
18896,203223,smokie norful,jesus is love,R&B,en,Father\nHelp Your children\nAnd don't let them...
18897,257528,jacob banks,prosecco,R&B,en,you treat me like obama\ni barely know you\nyo...
18898,273294,antighost,skys,R&B,en,And I'm drowning alone in this body of water S...


I need to create a corpus array that has each of the songs in it.

In [16]:
# small version with 5 iterations for testing purposes
#training_corpus = []
#for each_song in range(0, 5):
  #training_corpus.append(train_data['Lyrics'][each_song])

In [17]:
#training_corpus

In [18]:
# Collect every song's lyrics into a plain Python list, one entry per row and in
# row order. Taking the column as a whole, rather than looking each row up by the
# labels 0..len-1, does not depend on the frame having a default integer index.
training_corpus = train_data['Lyrics'].tolist()
print('Corpus has been built.')

Corpus has been built.


Now, I must tokenize my training corpus.

In [19]:
# Tokenize each song in two levels to preserve song structure: split it into
# sentences first, then split the lowercased text of each sentence into words.
# Result: one list per song, holding one token list per sentence.
tokenized_train_corpus = [
    [word_tokenize(sentence.lower()) for sentence in sent_tokenize(song)]
    for song in training_corpus
]

In [23]:
# Preview only the first song's tokenized sentences; displaying the whole
# tokenized corpus produces an output too large for the notebook to render.
tokenized_train_corpus[:1]

Output hidden; open in https://colab.research.google.com to view.

### Training the Word2Vec model for embeddings using the tokenized sentences of my training set (the model builds its own vocabulary from these sentences; the full-dataset vocabulary extracted above is not passed to it).

Create the embedding model (gensim Word2Vec) from the sentences tokenized in the steps above.

In [28]:
# Drop the per-song grouping: build one flat list whose items are the tokenized
# sentences of all songs, in their original order.
flattened_corpus = [
    sentence_tokens
    for song_sentences in tokenized_train_corpus
    for sentence_tokens in song_sentences
]

In [76]:
# define and train Word2Vec model
# Each item of `flattened_corpus` is one tokenized sentence. 100-dimensional
# vectors, context window of 10, and min_count=1 so words seen only once are kept.
# NOTE(review): per the gensim docs sg=1 selects skip-gram — confirm that is intended.
# NOTE(review): no `seed` is passed and workers=4, so the vectors may differ
# between runs — confirm whether reproducibility matters here.
custom_model = Word2Vec(flattened_corpus, vector_size=100, window=10, min_count=1, workers=4, sg=1)

# Export only the word vectors in word2vec format. Despite the `.model`
# extension this is not a pickled gensim model, so it cannot be read back with
# `Word2Vec.load`.
custom_model.wv.save_word2vec_format('custom_model_lyrics.model')

In [78]:
# The vectors were exported with `save_word2vec_format`, which writes the
# word2vec vector format rather than a pickled gensim model. Reading that file
# with `Word2Vec.load` raises UnpicklingError, so it is loaded as KeyedVectors.
custom_model_test = KeyedVectors.load_word2vec_format('custom_model_lyrics.model')

# Sanity check: the five nearest neighbours of 'weather'. `custom_model_test`
# is itself a KeyedVectors object, so it is queried directly (there is no `.wv`).
similar_words = custom_model_test.most_similar('weather', topn=5)
similar_words

Visualizing the custom embeddings

In [79]:
# Convert the exported word2vec-format vectors into a tensor TSV and a metadata
# TSV written under the /tmp/my_model_prefix prefix.
# NOTE(review): presumably these are meant for the TensorBoard Embedding Projector — confirm.
!python -m gensim.scripts.word2vec2tensor -i /content/custom_model_lyrics.model -o /tmp/my_model_prefix

2023-11-25 22:24:51,702 - word2vec2tensor - INFO - running /usr/local/lib/python3.10/dist-packages/gensim/scripts/word2vec2tensor.py -i /content/custom_model_lyrics.model -o /tmp/my_model_prefix
2023-11-25 22:24:51,702 - keyedvectors - INFO - loading projection weights from /content/custom_model_lyrics.model
2023-11-25 22:24:55,554 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (64052, 100) matrix of type float32 from /content/custom_model_lyrics.model', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-11-25T22:24:55.552212', 'gensim': '4.3.2', 'python': '3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]', 'platform': 'Linux-5.15.120+-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}
2023-11-25 22:24:59,548 - word2vec2tensor - INFO - 2D tensor file saved to /tmp/my_model_prefix_tensor.tsv
2023-11-25 22:24:59,548 - word2vec2tensor - INFO - Tensor metadata file saved to /tmp/my_model_prefix_metadata.tsv
2023-11-25 22:24:59,550 - word2vec2tensor - INFO - finis