In [1]:
from pathlib import Path
import pandas as pd
import nltk
import math
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import re
from dotenv import load_dotenv
from gensim.models import KeyedVectors
import plotly.graph_objects as go
import plotly.express as px
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
load_dotenv()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aszelestey/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aszelestey/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aszelestey/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/aszelestey/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


False

# Textual description of the dataset and its characteristics

I used a dataset (https://www.kaggle.com/datasets/lavagod/radiohead?select=radiohead.csv) containing the lyrics of the songs of the band Radiohead to analyze the semantic meaning of the songs from the perspective of data science. My key interest is to find correlations between the song lyrics and the song albums features.
The dataset contains the following features for each song:
- valence: The valence score of the song (from 0 to 1 how positive or negative the song is)
- duration_ms: The duration of the song in mili-seconds
- lyrics: The lyrics of the song
- album_name: The name of the album the song belongs to
- album_release_year: The year the album was released
- album_img: The image url of the album cover
- pct_sad: Gloom index of the song
- word_count: The number of words in the lyrics
- lyrical_density: Lyric Density is about how close together or far apart the words are over a given tempo. (? source: https://gypsy-folklores.tripod.com/id29.html)

I want to find correlations between the lyrics words' frequencies, distributions and embeddings so I also need to preprocess the lyrics and create a new dataframe with the lyrics words' frequencies, distributions and embeddings. For that I will first tokenize the lyrics.


# Reading dataset and creating a dataframe with the tokenized lyrics


In [2]:
df = pd.read_csv(Path('data/radiohead.csv'), encoding='latin1')
df.head(2)

Unnamed: 0,track_name,valence,duration_ms,lyrics,album_name,album_release_year,album_img,pct_sad,word_count,lyrical_density,gloom_index
0,You,0.305,208667,you are the sun and moon and stars are you and...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0,19,0.091054,50.39
1,Creep,0.096,238640,when you were here before couldn't look you in...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0784,51,0.213711,22.6


In [3]:
df.dtypes

track_name             object
valence               float64
duration_ms             int64
lyrics                 object
album_name             object
album_release_year      int64
album_img              object
pct_sad               float64
word_count              int64
lyrical_density       float64
gloom_index           float64
dtype: object

In [4]:
df.shape


(101, 11)

In [5]:
df[df.duplicated()]


Unnamed: 0,track_name,valence,duration_ms,lyrics,album_name,album_release_year,album_img,pct_sad,word_count,lyrical_density,gloom_index


In [6]:
df.count() 

track_name            101
valence               101
duration_ms           101
lyrics                 98
album_name            101
album_release_year    101
album_img             101
pct_sad               101
word_count            101
lyrical_density       101
gloom_index           101
dtype: int64

In [7]:
# we have 3 words with no lyrics, this make them unusable for the analysis so I will drop them
df_lyrics = df.dropna()
df_lyrics.shape


(98, 11)

# Tokenizing the lyrics


Tokenizing the lyrics and removing stop words

In [8]:
# Removing whitespace
ws_removed = [re.sub(r'\s+', ' ', lyric) for lyric in list(df_lyrics['lyrics'])]

# Tokenizing
tokens = [word_tokenize(lyric) for lyric in ws_removed]

# Stopword removal
stopwords = nltk.corpus.stopwords.words('english')
tokens_no_sw = [[word for word in lyric if word not in stopwords] for lyric in tokens]

# Filtering out words with special characters
tokens_cleaned = [[word for word in lyric if re.match(r'^[a-zA-Z]+$', word)] for lyric in tokens_no_sw]

tokens_cleaned[1][:]


['could',
 'look',
 'eye',
 'like',
 'angel',
 'skin',
 'makes',
 'cry',
 'float',
 'like',
 'feather',
 'beautiful',
 'world',
 'chorus',
 'wish',
 'special',
 'fuckin',
 'special',
 'creep',
 'weirdo',
 'hell',
 'belong',
 'care',
 'hurts',
 'want',
 'control',
 'want',
 'perfect',
 'body',
 'want',
 'perfect',
 'soul',
 'want',
 'notice',
 'around',
 'chorus',
 'fuckin',
 'special',
 'wish',
 'special',
 'creep',
 'weirdo',
 'hell',
 'belong',
 'running',
 'running',
 'run',
 'run',
 'run',
 'run',
 'whatever',
 'makes',
 'happy',
 'whatever',
 'want',
 'fuckin',
 'special',
 'wish',
 'special',
 'creep',
 'weirdo',
 'hell',
 'belong',
 'belong']

I want to reduce the number of irrelevant tokens in the manner of stastistics to the minimal number, but don't want to lose too much information, because I want to predict song based on their word choices I can exclude words that hold low information in the context of the whole corpus. For that goal I will use the TF-IDF weight metric, which is a popular method that orders a score to each word of a document based on the following metric:

$$

\text{TF-IDF}(t, d) = \text{TF}(t, d) \times \text{IDF}(t)
\\[0.5cm]
\\
\text{TF}(t, d) = \frac{\text{Number of times term } t \text{ appears in document } d}{\text{Total number of terms in document } d}
\\[0.5cm]
\\
\text{IDF}(t) = \log \left( \frac{N}{1 + \text{DF}(t)} \right)
\\[0.5cm]
\\
\text{DF}(t) = \text{Number of documents containing the term } t
\\[0.5cm]
\\
where:
\begin{align*}
t & \text{ is the term,} \\
d & \text{ is the document, and} \\
N & \text{ is the total number of documents.}
\end{align*}
$$

In summary this score orders low numbers to words that are not so frequent in the document, but frequent in the whole corpus.

In [9]:
# Calculate TF-IDF scores
# This helps identify words that are particularly significant to specific documents rather than just commonly occurring words in general.
num_docs = len(tokens_cleaned)
tfidf_scores = {}
words = {word for doc in tokens_cleaned for word in doc}

# Calculate document frequency (number of documents containing the word)
doc_freq = {word: sum(1 for doc in tokens_cleaned if word in doc) for word in words}

for doc in tokens_cleaned:
    word_counts = Counter(doc)
    # Calculate term frequency (number of times the word appears in the document)
    tf = {word: word_counts[word] / len(doc) for word in word_counts}
    # Calculate inverse document frequency (log of total docs / doc frequency)
    idf = {word: math.log(num_docs / (1 + doc_freq[word])) for word in tf}
    # TF-IDF = term frequency * inverse document frequency
    tfidf = {word: tf[word] * idf[word] for word in tf}
    tfidf_scores.update(tfidf)

# Convert dictionary to list of tuples and sort
sorted_tfidf = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)

# 10 words with the highest and lowest TF-IDF scores
print(sorted_tfidf[:10])
print(sorted_tfidf[-10:])

[('everyone', 1.1621183966674873), ('judge', 1.111948656603036), ('slow', 1.0378187461628336), ('arms', 1.0369269700521038), ('ripcord', 1.0152574690723373), ('haunt', 0.943471587420758), ('prove', 0.9369197013970026), ('case', 0.8816070595408526), ('bulletproof', 0.736290326669578), ('hurt', 0.7269711630797003)]
[('singing', 0.020152342138742556), ('walking', 0.017199592868418913), ('could', 0.016833484343408627), ('help', 0.016145711037239982), ('gon', 0.015511150396256746), ('oh', 0.014894048431791287), ('always', 0.013192961766916337), ('never', 0.012423788188754297), ('let', 0.012423788188754297), ('na', 0.012423788188754297)]


In [10]:
# Set TF-IDF threshold, I chose to filter out words with TF-IDF score lower than mean * 0.3
tfidf_threshold = np.mean(list(tfidf_scores.values())) * 0.3

# Filter out low information words based on TF-IDF scores
tokens_relevant = [{word for word in doc if tfidf_scores[word] > tfidf_threshold} for doc in tokens_cleaned]
tokens_irrelevant = [{word for word in doc if tfidf_scores[word] <= tfidf_threshold} for doc in tokens_cleaned]

for i, song_tokens in enumerate(tokens_irrelevant[:5]):
    print(f"Irrelevant tokens for song {df_lyrics.loc[i, 'track_name']}")
    print(tokens_irrelevant[i])
    print(f"Relevant tokens:")
    print(tokens_relevant[i], '\n')

Irrelevant tokens for song You
{'try', 'see', 'things', 'could', 'aaaaaaaaah', 'never'}
Relevant tokens:
{'world', 'end', 'ha', 'sun', 'believe', 'soon', 'going', 'everything', 'hooooooo', 'caught', 'moon', 'away', 'working', 'run', 'stars', 'like', 'fire', 'drowning', 'chaotic'} 

Irrelevant tokens for song Creep
{'chorus', 'could'}
Relevant tokens:
{'fuckin', 'hurts', 'cry', 'feather', 'world', 'creep', 'body', 'special', 'perfect', 'around', 'belong', 'angel', 'running', 'hell', 'makes', 'soul', 'weirdo', 'want', 'look', 'beautiful', 'happy', 'run', 'notice', 'skin', 'control', 'float', 'wish', 'like', 'eye', 'whatever', 'care'} 

Irrelevant tokens for song How Do You?
{'friends', 'always'}
Relevant tokens:
{'respect', 'dangerous', 'powerful', 'twisted', 'freak', 'bitter', 'bigot', 'belong', 'anyway', 'weep', 'knows', 'forget', 'mother', 'steals', 'loved', 'us', 'baby', 'listen', 'stupid', 'turned', 'cheats', 'like', 'lives', 'show', 'daddy', 'wants', 'bullies'} 

Irrelevant tokens 

In [11]:
df_w_tokens = df_lyrics.copy()
df_w_tokens['tokens'] = tokens
df_w_tokens.head(1)

Unnamed: 0,track_name,valence,duration_ms,lyrics,album_name,album_release_year,album_img,pct_sad,word_count,lyrical_density,gloom_index,tokens
0,You,0.305,208667,you are the sun and moon and stars are you and...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0,19,0.091054,50.39,"[you, are, the, sun, and, moon, and, stars, ar..."


In [12]:
# Store modified DataFrame
with open(Path('data/radiohead_extended.csv'), 'wb') as f:
    df_w_tokens.to_csv(f)

# Counting words based on their part of speech, and calculating distribution

I had thought about, what might be relevant aspects to analyze for my purpose before analyzing its embeddings and I though it could be interesting to check the distribution of parts of speech. My theory is that some albums might have more dynamic lyrics tehrefore contain more verbs and some might be more sentimental and might contain more adverbs.

For this goal I use a the pos_tag function from nltk which classifies tokens

In [12]:
noun_freqs = []
verb_freqs = []
adv_freqs = []

for i, song in enumerate(df_w_tokens['tokens']):
    categorizations = pd.DataFrame(pos_tag(song), columns=['word', 'pos'])
    count = len(categorizations)
    noun_freqs.append(categorizations[categorizations['pos'].str.startswith('NN')].size / count)
    verb_freqs.append(categorizations[categorizations['pos'].str.startswith('VB')].size / count)
    adv_freqs.append(categorizations[(categorizations['pos'].str.startswith('JJ')) |
                                     (categorizations['pos'].str.startswith('RB'))].size / count)
    
df_w_tokens['noun_freq'] = noun_freqs
df_w_tokens['verb_freq'] = verb_freqs
df_w_tokens['adverb_freq'] = adv_freqs

df_w_tokens.head()

Unnamed: 0,track_name,valence,duration_ms,lyrics,album_name,album_release_year,album_img,pct_sad,word_count,lyrical_density,gloom_index,tokens,noun_freq,verb_freq,adverb_freq
0,You,0.305,208667,you are the sun and moon and stars are you and...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0,19,0.091054,50.39,"[you, are, the, sun, and, moon, and, stars, ar...",0.390805,0.45977,0.137931
1,Creep,0.096,238640,when you were here before couldn't look you in...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0784,51,0.213711,22.6,"[when, you, were, here, before, could, n't, lo...",0.356021,0.534031,0.52356
2,How Do You?,0.264,132173,he's bitter and twisted he knows what he wants...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0952,21,0.158883,36.56,"[he, 's, bitter, and, twisted, he, knows, what...",0.115385,0.576923,0.134615
3,Stop Whispering,0.279,325627,and the wise man said i don't want to hear you...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0435,46,0.141266,43.48,"[and, the, wise, man, said, i, do, n't, want, ...",0.368794,0.624113,0.312057
4,Thinking About You,0.419,161533,been thinking about you your records are here ...,Pablo Honey,1993,https://i.scdn.co/image/e17011b2aa33289dfa6c08...,0.0,39,0.241437,60.8,"[been, thinking, about, you, your, records, ar...",0.251208,0.608696,0.241546


# Read embeddings of tokens for later analysis

I want to analise the word choices of the songs based on their semantic meaning and turn them into more interpretable data. For that I will turn the tokens into embeddings and reduce their dimensionality. These steps might not be the most rational and efficient way of classifying songs and are more of a experimantational approach.

For the embeddings I will use the  model, which is not considered a state-of-the-art embedding model but is a light-weight and efficient one which is easy to use and does the job.

In [13]:
all_token = {token for tokes in tokens_cleaned for token in tokens}

TypeError: unhashable type: 'list'

In [140]:
# You don't have to execute this section I will include the calculated embeddings in a seperate file
if Path('data/GoogleNews-vectors-negative300.bin.gz') is None:
    pretrained_model_path = Path('data/GoogleNews-vectors-negative300.bin.gz') 
    word_vectors = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)


In [141]:
# We have vectors with dimensionality of 300
word_vectors.loc['angle'].shape

(300,)

In [142]:
vector_store = { word: word_vectors[word] for word in words if word in word_vectors}

In [145]:
# 300 words weren't present in the pretrained embedding set
print(len(words))
print(len(vector_store.keys()))
print(words - vector_store.keys()) # These few words are not a big deal although there are some strange one (for example grey?)

1782
0


In [45]:
# Convert vector_store dictionary to DataFrame
embeddings_df = pd.DataFrame.from_dict(vector_store, orient='index')

# Save to CSV file
embeddings_df.to_csv(Path('data/embeddings.csv'))


In [33]:
with open(Path('data/embeddings.csv'), 'rb') as f:
    word_vectors = pd.read_csv(f, index_col=0)

# Analyzing the fetures of the dataset

In [56]:
# Average word count by year and album
word_no_by_album = df.groupby('album_release_year')['word_count'].mean()

fig = px.bar(word_no_by_album)
fig.show()

In [62]:
# Words by song
fig = px.box(df.sort_values(by='album_release_year'), x='album_release_year', y='word_count')
fig.show()

In [63]:
# Gloom index of songs
fig = px.box(df, x='album_release_year', y='gloom_index')
fig.show()

In [90]:
# Songs by unique tokens
df_w_tokens['unique_tokens_count'] = df_w_tokens['tokens'].apply(lambda x: len(set(x)))
y = df_w_tokens.groupby('album_release_year')['unique_tokens_count'].mean()

fig = go.Figure(
    data=[go.Bar(x=df_w_tokens['album_release_year'].drop_duplicates(), y=y)],
    layout=go.Layout(height=600, width=800)
)
fig.show()

In [96]:
fig = px.box(df_w_tokens, x='album_release_year', y='unique_tokens_count',
             hover_data=['track_name', 'gloom_index'])
fig.show()

In [105]:
fig = go.Figure()
years = df_w_tokens['album_release_year'].drop_duplicates()
album_names = df_w_tokens['album_name'].drop_duplicates()

fig.add_trace(go.Bar(
    y=df_w_tokens.groupby('album_release_year')['word_count'].mean(),
    x=years,
    customdata=album_names,
    name='Mean of word counts',
    hovertemplate='%{y}<br>Album name: %{customdata}<br>Year: %{x}'
))

fig.add_trace(go.Bar(
    y = df_w_tokens.groupby('album_release_year')['unique_tokens_count'].mean(),
    x = years,
    customdata=album_names,
    name='Mean of unique cleaned tokens',
    hovertemplate='%{y}<br>Album name: %{customdata}<br>Year: %{x}'
))

fig.update_layout(
    title='Mean of unique words and total words for each song in each year.',
    barmode='group'
)

fig.show()

In [19]:
fig = go.Figure()
years = df_w_tokens['album_release_year'].drop_duplicates()
album_names = df_w_tokens['album_name'].drop_duplicates()
word_counts = df_w_tokens.groupby('album_release_year')['word_count']

fig.add_trace(go.Scatter(
    y=df_w_tokens.groupby('album_release_year')['noun_freq'].mean(),
    x=years,
    customdata=album_names,
    name='Mean of frequencies of nouns',
    hovertemplate='%{y}<br>Album name: %{customdata}<br>Year: %{x}'
))

fig.add_trace(go.Scatter(
    y = df_w_tokens.groupby('album_release_year')['verb_freq'].mean(),
    x = years,
    customdata=album_names,
    name='Mean of frequences of verbs',
    hovertemplate='%{y}<br>Album name: %{customdata}<br>Year: %{x}'
))

fig.add_trace(go.Scatter(
    y = df_w_tokens.groupby('album_release_year')['adverb_freq'].mean(),
    x = years,
    customdata=album_names,
    name='Mean of frequences of adjectives / adverbs',
    hovertemplate='%{y}<br>Album name: %{customdata}<br>Year: %{x}'
))

fig.update_layout(
    title='Mean of the main part of speech frequences in each album.',
    barmode='group'
)

fig.show()  

In [28]:
c = df_w_tokens.select_dtypes(include=['float64', 'int64']).corr()
px.imshow(c, text_auto=True)

In [29]:
# Define bins for the decades
bins = [1990, 2000, 2010, 2020]
labels = ['90s', '00s', '10s']
df_w_tokens['decade'] = pd.cut(df_w_tokens['album_release_year'], bins=bins, labels=labels)

# Dummy encoding to measure correlations
dummy = pd.get_dummies(df_w_tokens['decade'])
df_encoded = pd.concat([df_w_tokens, dummy], axis=1)