# RETRIVING SONG LYRICS

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

song_list_df = pd.read_csv('song_list.csv')

In [2]:
def process_lyrics_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    songs = content.split('Title: ')[1:]
    lyrics_list = []
    documents = []  #list for MongoDB documents

    for song in songs:
        lines = song.split('\n')
        title = lines[0]
        artist = lines[1].replace('Artist: ', '')
        lyrics = '\n'.join(lines[3:])

        lyrics_list.append(lyrics)

        documents.append({'title': title, 'artist': artist, 'lyrics': lyrics})

        # Optional: Write to a new file
        with open('processed_lyrics.txt', 'a', encoding='utf-8') as out_file:
            out_file.write(f"Title: {title}\nArtist: {artist}\nLyrics:\n{lyrics}\n\n")

    return lyrics_list, documents


# Process the file and get the list of lyrics and the list of documents for MongoDB
lyrics_list, documents = process_lyrics_file('song_lyrics.txt')
print(lyrics_list)

["We were good, we were gold\nKinda dream that can't be sold\nWe were right 'til we weren't\nBuilt a home and watched it burn\n\nMm, I didn't wanna leave you\nI didn't wanna lie\nStarted to cry, but then remembered I\n\nI can buy myself flowers\nWrite my name in the sand\nTalk to myself for hours\nSay things you don't understand\nI can take myself dancing\nAnd I can hold my own hand\nYeah, I can love me better than you can\n\nCan love me better\nI can love me better, baby\nCan love me better\nI can love me better, baby\n\nPaint my nails cherry red\nMatch the roses that you left\nNo remorse, no regret\nI forgive every word you said\n\nOoh, I didn't wanna leave you, baby\nI didn't wanna fight\nStarted to cry, but then remembered I\n\nI can buy myself flowers\nWrite my name in the sand\nTalk to myself for hours, yeah\nSay things you don't understand\nI can take myself dancing, yeah\nI can hold my own hand\nYeah, I can love me better than you can\n\nCan love me better\nI can love me better

#Translate songs

In [3]:
from googletrans import Translator, LANGUAGES

translator = Translator()


def translate_to_english(text):
    # Detect the language of the text
    detected_language = translator.detect(text).lang

    # If the text is already in English, return as is
    if detected_language == 'en':
        return text

    # Translate the text to English
    translated_text = translator.translate(text, dest='en').text
    return translated_text


# Translate non-English lyrics to English
translated_lyrics = [translate_to_english(lyric) for lyric in lyrics_list]

# Result
print(translated_lyrics[4])


Armed link and pen weight - she dances alone
Compa, what do you think that morra?
The one who is dancing alone I like pa \ 'me
Bella, she knows that she is good
That everyone is looking at her
I approach and throw a whole verb
We take drinks without peros, only temptation

I told
"I'm going to conquer your family, which in a few days you will be mine"
Told me
I'm very crazy but he likes it
That no vato like me acts

And there you go, mikha
And pure double P, old
Thus Nomas, Compa Pedro
Pure armed link
Pa \ 'the plebit
Cha-Chau!

I am not a vato that has Varo
But speaking of the heart, I fulfill everything
He grabbed me catchy from his hand
My company did not even believe it, that when I passed it was me

Her body
I swear for God who was so perfect
Your belt as a model
His eyes
From the beginning they fell in love
She liked her and I like it


In [4]:
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Set of English stopwords
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eljas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eljas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eljas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\eljas\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def preprocess_and_remove_stopwords(text):
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Convert to lowercase
    text = text.lower()

    # Tokenize the text
    words = word_tokenize(text)

    # Remove punctuation, stopwords and apply lemmatization
    filtered_words = [
        lemmatizer.lemmatize(word) for word in words
        if word not in stop_words and word.isalpha()
    ]

    return filtered_words


In [6]:
from collections import Counter

# Flatten the list of lists and preprocess each lyric
all_words = []
for lyric in translated_lyrics:
    all_words.extend(preprocess_and_remove_stopwords(lyric))

# Count the frequency of each word
word_freq = Counter(all_words)

# Get the most common words
most_common_words = word_freq.most_common()

# Display the most common words
print(most_common_words)


[('oh', 163), ('know', 150), ('baby', 143), ('love', 134), ('na', 132), ('like', 115), ('yeah', 106), ('night', 102), ('good', 92), ('wan', 89), ('see', 83), ('want', 77), ('one', 74), ('time', 67), ('take', 60), ('got', 60), ('get', 60), ('tell', 58), ('let', 57), ('said', 56), ('go', 53), ('feel', 53), ('never', 52), ('ai', 49), ('ca', 48), ('gon', 47), ('make', 47), ('last', 45), ('way', 44), ('back', 43), ('look', 42), ('say', 41), ('come', 41), ('another', 40), ('better', 39), ('would', 37), ('okay', 37), ('pa', 36), ('whoa', 35), ('tonight', 34), ('right', 32), ('hey', 31), ('feeling', 31), ('heart', 30), ('worried', 30), ('think', 29), ('give', 29), ('daddy', 28), ('god', 28), ('hold', 27), ('going', 27), ('need', 27), ('u', 26), ('keep', 26), ('bitch', 26), ('everything', 25), ('day', 25), ('always', 25), ('put', 25), ('left', 24), ('cry', 22), ('body', 22), ('face', 22), ('light', 22), ('thing', 21), ('life', 21), ('bad', 21), ('used', 21), ('girl', 21), ('ah', 21), ('lie', 20

# SENTIMENTAL ANALYSIS

In [7]:
from textblob import TextBlob


def analyze_sentiment(lyric):
    analysis = TextBlob(lyric)
    return analysis.sentiment


# Analyze the sentiment of each song
sentiment_results = [analyze_sentiment(lyric) for lyric in translated_lyrics]

# Display the sentiment analysis results
data_polarity = []
data_subjectivity = []

for sentiment in sentiment_results:
    # print(f"Song {i + 1}: Polarity = {sentiment.polarity}, Subjectivity = {sentiment.subjectivity}")
    data_polarity.append(sentiment.polarity)
    data_subjectivity.append(sentiment.subjectivity)

song_list_df['Polarity'] = data_polarity
song_list_df['Subjectivity'] = data_subjectivity


In [8]:
song_list_df

Unnamed: 0,title,artist,Polarity,Subjectivity
0,Flowers,Miley Cyrus,0.485994,0.555602
1,Kill Bill,SZA,0.20948,0.364384
2,As It Was,Harry Styles,0.053846,0.216026
3,Seven,Jung Kook,0.7,0.6
4,Ella Baila Sola,Eslabon Armado,0.216845,0.558333
5,Cruel Summer,Taylor Swift,-0.104993,0.559264
6,Creepin,Metro Boomin,0.165833,0.3525
7,Calm Down,Rema,0.051976,0.483127
8,"Shakira: Bzrp Music Sessions, Vol. 53",Bizarrap,0.112083,0.499722
9,Anti-Hero,Taylor Swift,0.039394,0.412121


In [9]:
# Display the results
print(f"Average Polarity: {song_list_df['Polarity'].mean()}")  #-1 super negative, 1-positive
print(f"Average Subjectivity: {song_list_df['Subjectivity'].mean()}")  #0 very objective, 1 - very subjective


Average Polarity: 0.13100412292720068
Average Subjectivity: 0.48873063729257266


In [10]:
# import pandas as pd
print(sentiment_results[0])

Sentiment(polarity=0.48599439775910364, subjectivity=0.5556022408963588)


# Topic analysis (Latent Dirichlet Allocation (LDA))

In [11]:
import gensim
from gensim import corpora
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [12]:
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())

    # Filter out stopwords and non-alphabetic words, and lemmatize the remaining words
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]

    return filtered_words

In [13]:

# Assuming translated_lyrics is a list of songs
documents = [preprocess(lyric) for lyric in translated_lyrics]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(documents)

# Convert dictionary to a bag-of-words corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Number of topics
num_topics = 5

# Build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Number of words to display per topic
words_per_topic = 2

# Display the topics with limited number of words
for idx, topic in lda_model.show_topics(num_topics=num_topics, num_words=words_per_topic, formatted=False):
    topic_words = ", ".join([word for word, _ in topic])
    print(f"Topic {idx}: {topic_words}")



Topic 0: whoa, love
Topic 1: baby, know
Topic 2: na, wan
Topic 3: like, know
Topic 4: oh, okay


# Text modeling (GloVe)

In [14]:
import gensim.downloader as api

# Load a pre-trained Word2Vec model (small model for demonstration)
word_vectors = api.load("glove-wiki-gigaword-50")


In [15]:
all_words

['good',
 'gold',
 'kinda',
 'dream',
 'ca',
 'sold',
 'right',
 'built',
 'home',
 'watched',
 'burn',
 'mm',
 'wan',
 'na',
 'leave',
 'wan',
 'na',
 'lie',
 'started',
 'cry',
 'remembered',
 'buy',
 'flower',
 'write',
 'name',
 'sand',
 'talk',
 'hour',
 'say',
 'thing',
 'understand',
 'take',
 'dancing',
 'hold',
 'hand',
 'yeah',
 'love',
 'better',
 'love',
 'better',
 'love',
 'better',
 'baby',
 'love',
 'better',
 'love',
 'better',
 'baby',
 'paint',
 'nail',
 'cherry',
 'red',
 'match',
 'rose',
 'left',
 'remorse',
 'regret',
 'forgive',
 'every',
 'word',
 'said',
 'ooh',
 'wan',
 'na',
 'leave',
 'baby',
 'wan',
 'na',
 'fight',
 'started',
 'cry',
 'remembered',
 'buy',
 'flower',
 'write',
 'name',
 'sand',
 'talk',
 'hour',
 'yeah',
 'say',
 'thing',
 'understand',
 'take',
 'dancing',
 'yeah',
 'hold',
 'hand',
 'yeah',
 'love',
 'better',
 'love',
 'better',
 'love',
 'better',
 'baby',
 'love',
 'better',
 'love',
 'better',
 'baby',
 'love',
 'better',
 'love',


In [16]:
import numpy as np
from sklearn.cluster import KMeans

# Assuming 'all_words' is your list of words
word_vecs = []
for word in all_words:
    if word in word_vectors:
        word_vecs.append(word_vectors[word])

# Convert to NumPy array for clustering
word_vecs = np.array(word_vecs)

# Number of clusters (topics)
num_clusters = 5

# Clustering with KMeans
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(word_vecs)

# Get cluster assignments for words
clusters = kmeans.labels_

centroids = kmeans.cluster_centers_


In [17]:
from scipy.spatial import distance


# Assuming 'all_words' contains words that have been used in the clustering
# 'word_vecs' contains the corresponding vectors
# 'clusters' contains the cluster assignment for each word in 'all_words'

def find_representative_word(cluster_id, word_vecs, cluster_center, words):
    min_dist = float('inf')
    representative_word = None
    for word, vec in zip(words, word_vecs):
        dist = distance.euclidean(vec, cluster_center)
        if dist < min_dist:
            min_dist = dist
            representative_word = word
    return representative_word


representative_words = []
for i, centroid in enumerate(centroids):
    # Find the representative word for each cluster
    representative_word = find_representative_word(i, word_vecs, centroid, all_words)
    representative_words.append(representative_word)

# Display representative word for each cluster
for i, word in enumerate(representative_words):
    print(f"Topic {i + 1}: {word}")


Topic 1: say
Topic 2: shot
Topic 3: true
Topic 4: never
Topic 5: na


# One topic one song

In [18]:
def vectorize_song(song_lyrics, model):
    vecs = []
    for word in preprocess(song_lyrics):
        if word in model:
            vecs.append(model[word])
    return vecs


from sklearn.cluster import KMeans
from scipy.spatial import distance


def find_dominant_topic(word_vecs, words):
    if not word_vecs:
        return "No dominant topic"

    # Apply KMeans with a small number of clusters
    n_init_value = 10
    kmeans = KMeans(n_clusters=1, random_state=0, n_init=n_init_value).fit(word_vecs)
    centroid = kmeans.cluster_centers_[0]

    # Find the word closest to the centroid
    min_dist = float('inf')
    representative_word = None
    for word, vec in zip(words, word_vecs):
        dist = distance.euclidean(vec, centroid)
        if dist < min_dist:
            min_dist = dist
            representative_word = word

    return representative_word


# Process each song
topics_per_song = []

for song in translated_lyrics:
    preprocessed_words = preprocess_and_remove_stopwords(song)
    song_vecs = [word_vectors[word] for word in preprocessed_words if word in word_vectors]
    dominant_topic = find_dominant_topic(song_vecs, preprocessed_words)
    topics_per_song.append(dominant_topic)

song_list_df['Topic'] = topics_per_song

In [19]:
def analyze_repetition_and_hooks(lyrics_list):
    hooks_analysis = []

    for song in lyrics_list:
        # Split the song into lines
        lines = song.split('\n')
        line_count = {}

        # Count the occurrence of each line
        for line in lines:
            if line.strip() == "":  # Skip empty lines
                continue
            line_count.setdefault(line, 0)
            line_count[line] += 1

        # Find lines with highest repetition (potential hooks)
        sorted_lines = sorted(line_count.items(), key=lambda x: x[1], reverse=True)
        potential_hooks = [line for line, count in sorted_lines if count > 1]  # Threshold of more than 1 repetition

        # Calculate the proportion of hooks
        total_lines = len([line for line in lines if line.strip() != ""])
        hook_lines = sum(count for line, count in sorted_lines if count > 1 and line.strip() != "")
        proportion_of_hooks = hook_lines / total_lines if total_lines > 0 else 0

        hooks_analysis.append({
            'potential_hooks': potential_hooks,
            'proportion_of_hooks': proportion_of_hooks
        })

    return hooks_analysis


# Apply the analysis
repetition_and_hooks = analyze_repetition_and_hooks(translated_lyrics)

# Display results
for i, song_analysis in enumerate(repetition_and_hooks):
    print(f"Song {i + 1} potential hooks:")
    for hook in song_analysis['potential_hooks']:
        print(f"   {hook}")
    print(f"Proportion of hooks in the song: {song_analysis['proportion_of_hooks']:.2f}")


Song 1 potential hooks:
   Can love me better
   I can love me better, baby
   Started to cry, but then remembered I
   Yeah, I can love me better than you can
   I can buy myself flowers
   Write my name in the sand
   Say things you don't understand
   I didn't wanna fight
   I can take myself dancing, yeah
   I can hold my own hand
Proportion of hooks in the song: 0.58
Song 2 potential hooks:
   I might kill my ex
   How\'d I get here?
   I\'m so mature, I\'m so mature
   I\'m so mature I got me a therapist to tell me there\'s other men
   I don\'t want none, I just want you
   Not the best idea
   His new girlfriend\'s next
   I still love him though
   Rather be in jail than alone
   I did it all for love (love)
   I did all of this sober
   I just killed my ex
Proportion of hooks in the song: 0.54
Song 3 potential hooks:
   You know it\'s not the same as it was
   In this world, it\'s just us
   As it was, as it was
   You know it\'s not the same
Proportion of hooks in the song: 

In [33]:
song_list_df.to_csv('analysis_results.csv', index=False)

In [36]:
song_list_df

Unnamed: 0,title,artist,Polarity,Subjectivity,Topic
0,Flowers,Miley Cyrus,0.485994,0.555602,good
1,Kill Bill,SZA,0.20948,0.364384,though
2,As It Was,Harry Styles,0.053846,0.216026,way
3,Seven,Jung Kook,0.7,0.6,got
4,Ella Baila Sola,Eslabon Armado,0.216845,0.558333,alone
5,Cruel Summer,Taylor Swift,-0.104993,0.559264,coming
6,Creepin,Metro Boomin,0.165833,0.3525,thing
7,Calm Down,Rema,0.051976,0.483127,come
8,"Shakira: Bzrp Music Sessions, Vol. 53",Bizarrap,0.112083,0.499722,well
9,Anti-Hero,Taylor Swift,0.039394,0.412121,come
