In [1]:
import pandas as pd
import nltk
import tensorflow as tf

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')

from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

from itertools import chain

from tensorboard import summary as summary_lib
from tensorboard.plugins import projector


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Creating A Word2Vec Representation for cleaned training set

In [3]:
corpus_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DATA340_NLP_FINAL/train_full_cleaned_data.csv')

In [4]:
full_corpus = []
for each_song in range(0, len(corpus_data)):
  full_corpus.append(corpus_data['Lyrics'][each_song])
print('Corpus has been built.')

Corpus has been built.


In [5]:
tokenized_full_corpus = []
for song in full_corpus:
    lyrics = sent_tokenize(song)
    tokenized_lyrics = [word_tokenize(lyric.lower()) for lyric in lyrics]
    tokenized_full_corpus.append(tokenized_lyrics)

TypeError: ignored

In [None]:
flattened_full_corpus = list(chain.from_iterable(tokenized_full_corpus))

In [None]:
# define and train Word2Vec model
custom_model = Word2Vec(flattened_full_corpus, vector_size=100, window=15, min_count=1, workers=4, sg=1)

custom_model.wv.save_word2vec_format('custom_model_train_lyrics.model')

In [None]:
!python -m gensim.scripts.word2vec2tensor -i /content/custom_model_train_lyrics.model -o /tmp/my_model_train_prefix

### Creating A Word-2-Vec representation for all words in the lyrics corpus

I will be using the corpus of the entire lyrics set, not just the ones that I have randomly chosen for my training and testing sets.

In [None]:
corpus_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DATA340_NLP_FINAL/train.csv')
print(len(corpus_data))
corpus_data.head()

290183


Unnamed: 0,Artist,Song,Genre,Language,Lyrics
0,12 stones,world so cold,Rock,en,"It starts with pain, followed by hate\nFueled ..."
1,12 stones,broken,Rock,en,Freedom!\nAlone again again alone\nPatiently w...
2,12 stones,3 leaf loser,Rock,en,"Biting the hand that feeds you, lying to the v..."
3,12 stones,anthem for the underdog,Rock,en,You say you know just who I am\nBut you can't ...
4,12 stones,adrenaline,Rock,en,My heart is beating faster can't control these...


I will only choose the english songs, but will include songs of all genres.

In [None]:
corpus_data = corpus_data.drop(corpus_data[corpus_data['Language'] != 'en'].index).reset_index()
print(len(corpus_data))

250197


Next, I must tokenize the corpus in order to make the embeddings vectors.

I will use NLTK package and lemmatization, taking the unique words from the corpus that are not stop words.

#### Tokenize/Preprocess the cleaned corpus dataframe to create custom embeddings

I need to create a corpus array that has each of the songs in it.

In [None]:
full_corpus = []
for each_song in range(0, len(corpus_data)):
  full_corpus.append(corpus_data['Lyrics'][each_song])
print('Corpus has been built.')

Corpus has been built.


Now, I must tokenize my training corpus. Each document (song in this context) must be split into sentences and then into words, to preserve the structure of the song to allow for the model to discover any potential meaning in the song structure.

In [None]:
tokenized_full_corpus = []
for song in full_corpus:
    lyrics = sent_tokenize(song)
    tokenized_lyrics = [word_tokenize(lyric.lower()) for lyric in lyrics]
    tokenized_full_corpus.append(tokenized_lyrics)

### Training the word2vec model for embeddings using the entire corpus. I will then be able to use the tensorboard projector to take a high-level look at the word associations.

In [None]:
flattened_full_corpus = list(chain.from_iterable(tokenized_full_corpus))

In [None]:
# define and train Word2Vec model
custom_model = Word2Vec(flattened_full_corpus, vector_size=100, window=15, min_count=1, workers=4, sg=1)

custom_model.wv.save_word2vec_format('custom_model_full_lyrics.model')

KeyboardInterrupt: ignored

visualizing the custom embeddings

In [None]:
!python -m gensim.scripts.word2vec2tensor -i /content/custom_model_full_lyrics.model -o /tmp/my_model_full_prefix

2023-11-25 22:24:51,702 - word2vec2tensor - INFO - running /usr/local/lib/python3.10/dist-packages/gensim/scripts/word2vec2tensor.py -i /content/custom_model_lyrics.model -o /tmp/my_model_prefix
2023-11-25 22:24:51,702 - keyedvectors - INFO - loading projection weights from /content/custom_model_lyrics.model
2023-11-25 22:24:55,554 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (64052, 100) matrix of type float32 from /content/custom_model_lyrics.model', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-11-25T22:24:55.552212', 'gensim': '4.3.2', 'python': '3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]', 'platform': 'Linux-5.15.120+-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}
2023-11-25 22:24:59,548 - word2vec2tensor - INFO - 2D tensor file saved to /tmp/my_model_prefix_tensor.tsv
2023-11-25 22:24:59,548 - word2vec2tensor - INFO - Tensor metadata file saved to /tmp/my_model_prefix_metadata.tsv
2023-11-25 22:24:59,550 - word2vec2tensor - INFO - finis