In [2]:
from pathlib import Path
import pandas as pd
import nltk
import math
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter
import re
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Szelestey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Szelestey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Textual description of the dataset and its characteristics

I used a dataset (https://www.kaggle.com/datasets/lavagod/radiohead?select=radiohead.csv) containing the lyrics of the songs of the band Radiohead to analyze the semantic meaning of the songs from the perspective of data science. My key interest is to find correlations between the song lyrics and the song albums features.
The dataset contains the following features for each song:
- valence: The valence score of the song (from 0 to 1 how positive or negative the song is)
- duration_ms: The duration of the song in mili-seconds
- lyrics: The lyrics of the song
- album_name: The name of the album the song belongs to
- album_release_year: The year the album was released
- album_img: The image url of the album cover
- pct_sad: Gloom index of the song
- word_count: The number of words in the lyrics
- lyrical_density: Lyric Density is about how close together or far apart the words are over a given tempo. (? source: https://gypsy-folklores.tripod.com/id29.html)

I want to find correlations between the lyrics words' frequencies, distributions and embeddings so I also need to preprocess the lyrics and create a new dataframe with the lyrics words' frequencies, distributions and embeddings. For that I will first tokenize the lyrics.


# Reading dataset and creating a dataframe with the tokenized lyrics


In [12]:
df = pd.read_csv(Path('data/radiohead.csv'), encoding='latin1')
df.head(2)

Unnamed: 0,track_name,valence,duration_ms,lyrics,album_name,album_release_year,album_img,pct_sad,word_count,lyrical_density,gloom_index
0,You,0.305,208667,you are the sun and moon and stars are you and...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0,19,0.091054,50.39
1,Creep,0.096,238640,when you were here before couldn't look you in...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0784,51,0.213711,22.6


In [13]:
df.dtypes

track_name             object
valence               float64
duration_ms             int64
lyrics                 object
album_name             object
album_release_year      int64
album_img              object
pct_sad               float64
word_count              int64
lyrical_density       float64
gloom_index           float64
dtype: object

In [14]:
df.shape


(101, 11)

In [15]:
df[df.duplicated()]


Unnamed: 0,track_name,valence,duration_ms,lyrics,album_name,album_release_year,album_img,pct_sad,word_count,lyrical_density,gloom_index


In [16]:
df.count() 

track_name            101
valence               101
duration_ms           101
lyrics                 98
album_name            101
album_release_year    101
album_img             101
pct_sad               101
word_count            101
lyrical_density       101
gloom_index           101
dtype: int64

In [17]:
# we have 3 words with no lyrics, this make them unusable for the analysis so I will drop them
df_lyrics = df.dropna()
df_lyrics.shape


(98, 11)

# Tokenizing the lyrics


Tokenizing the lyrics and removing stop words

In [18]:
# Removing whitespace
ws_removed = [re.sub(r'\s+', ' ', lyric) for lyric in list(df_lyrics['lyrics'])]

# Tokenizing
tokens = [word_tokenize(lyric) for lyric in ws_removed]

# Stopword removal
stopwords = nltk.corpus.stopwords.words('english')
tokens_no_sw = [[word for word in lyric if word not in stopwords] for lyric in tokens]

# Filtering out words with special characters
tokens_cleaned = [[word for word in lyric if re.match(r'^[a-zA-Z]+$', word)] for lyric in tokens_no_sw]

tokens_cleaned[1][:20]


['could',
 'look',
 'eye',
 'like',
 'angel',
 'skin',
 'makes',
 'cry',
 'float',
 'like',
 'feather',
 'beautiful',
 'world',
 'chorus',
 'wish',
 'special',
 'fuckin',
 'special',
 'creep',
 'weirdo']

Filtering Low-Information words

In [19]:
# Calculate TF-IDF scores
# This helps identify words that are particularly significant to specific documents rather than just commonly occurring words in general.
num_docs = len(tokens_cleaned)
tfidf_scores = {}
words = {word for doc in tokens_cleaned for word in doc}

# Calculate document frequency (number of documents containing the word)
doc_freq = {word: sum(1 for doc in tokens_cleaned if word in doc) for word in words}

for doc in tokens_cleaned:
    word_counts = Counter(doc)
    # Calculate term frequency (number of times the word appears in the document)
    tf = {word: word_counts[word] / len(doc) for word in word_counts}
    # Calculate inverse document frequency (log of total docs / doc frequency)
    idf = {word: math.log(num_docs / doc_freq[word]) for word in tf}
    # TF-IDF = term frequency * inverse document frequency
    tfidf = {word: tf[word] * idf[word] for word in tf}
    tfidf_scores.update(tfidf)

# Convert dictionary to list of tuples and sort
sorted_tfidf = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)

# 10 words with the highest TF-IDF scores
sorted_tfidf[:10]

[('judge', 1.309990708191592),
 ('everyone', 1.297273432703542),
 ('slow', 1.2226579943121525),
 ('ripcord', 1.1960784726966711),
 ('arms', 1.1146891167222073),
 ('haunt', 1.1115072675565023),
 ('prove', 1.10378846708736),
 ('case', 0.9841384661888941),
 ('bulletproof', 0.8674262797484867),
 ('uptight', 0.8490680516056615)]

In [21]:
# Set TF-IDF threshold, I chose to filter out words with TF-IDF score lower than mean * 0.3
tfidf_threshold = np.mean(list(tfidf_scores.values())) * 0.3

# Filter out low information words based on TF-IDF scores
tokens_relevant = [[word for word in doc if tfidf_scores[word] > tfidf_threshold] for doc in tokens_cleaned]
tokens_irrelevant = [[word for word in doc if tfidf_scores[word] <= tfidf_threshold] for doc in tokens_cleaned]

tokens_irrelevant[1][:]

['could', 'like', 'like', 'chorus', 'hurts', 'around', 'chorus']

In [22]:
df_w_tokens = df_lyrics.copy()
df_w_tokens['tokens'] = tokens
df_w_tokens.head(1)

Unnamed: 0,track_name,valence,duration_ms,lyrics,album_name,album_release_year,album_img,pct_sad,word_count,lyrical_density,gloom_index,tokens
0,You,0.305,208667,you are the sun and moon and stars are you and...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0,19,0.091054,50.39,"[you, are, the, sun, and, moon, and, stars, ar..."


# Read embeddings for tokens then reduce dimension

I want to analise the word choices of the songs based on their semantic meaning and turn them into more interpretable data. For that I will turn the tokens into embeddings and reduce their dimensionality. These steps might not be the most rational and efficient way of classifying songs and are more of a experimantational approach.

For the embeddings I will use the  model, which is not considered a state-of-the-art embedding model but is a light-weight and efficient one which is easy to use and does the job.

In [24]:
# You don't have to execute this section I will include the calculated embeddings in a seperate file
if pretrained_model_path is None:
    pretrained_model_path = Path('data/GoogleNews-vectors-negative300.bin.gz') 
    word_vectors = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)


In [32]:
# We have vectors with dimensionality of 300
word_vectors['angle'].shape

(300,)

In [40]:
vector_store = { word: word_vectors[word] for word in words if word in word_vectors}

In [42]:
# 300 words weren't present in the pretrained embedding set
print(len(words))
print(len(vector_store.keys()))
print(words - vector_store.keys()) # ezekért tényleg nem kár, mondjuk a grey az felettébb érdekes, hogy nem szerepel

1782
1755
{'ooooohh', 'efil', 'skwerking', 'hooooooo', 'touchall', 'mmmhm', 'favours', 'lundy', 'liffey', 'godrich', 'gnihtyreve', 'belisha', 'grey', 'mephistopheles', 'backdrifting', 'headshrinkers', 'colours', 'favourite', 'fastnet', 'aeroplane', 'stepford', 'ohhhhhhhhhhh', 'backdrifters', 'aaaaaaaaah', 'ansaphone', 'mould', 'hammerheaded'}


In [45]:
# Convert vector_store dictionary to DataFrame
embeddings_df = pd.DataFrame.from_dict(vector_store, orient='index')

# Save to CSV file
embeddings_df.to_csv(Path('data/embeddings.csv'))
