In [1]:
# import/install libraries/packages
# !pip install gensim
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import numpy as np

# initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# initialize a sentiment analyzer
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /home/jupyter-
[nltk_data]     cradduhj/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter-
[nltk_data]     cradduhj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# read in final dataset file as CSV
df = pd.read_csv("final_dataset.csv")
df.head()

Unnamed: 0,title,artist,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,pop,rock,hip-hop,r&b,country,jazz,electronic,reggae,metal,folk
0,...baby one more time,britney spears,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,...,1,0,0,0,0,0,1,1,0,0
1,doo wop (that thing),lauryn hill,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,...,0,0,1,1,0,0,0,1,0,0
2,have you ever?,brandy,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,...,1,0,0,1,0,0,0,0,0,0
3,love like this,faith evans,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,...,0,0,1,1,0,0,0,0,0,0
4,this kiss,faith hill,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,...,0,0,0,0,1,0,0,0,0,0


In [3]:
print(df.columns)
print(df.isna().sum())
print(df.dtypes)
print(df.describe())

Index(['title', 'artist', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'view_count', 'like_count', 'comment_count', 'chart_year',
       'lyrics', 'type_Group', 'type_Person', 'country_CA', 'country_GB',
       'country_Other', 'country_US', 'key_0', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'duration_min', 'begin_year', 'pop', 'rock', 'hip-hop', 'r&b',
       'country', 'jazz', 'electronic', 'reggae', 'metal', 'folk'],
      dtype='object')
title               0
artist              0
acousticness        0
danceability        0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
view_count          0
like_count          0
comment_count       0
chart_year          0
lyrics              2
type_Group          0
type_Person      

In [4]:
# account for missing lyrics
df['lyrics'] = df['lyrics'].fillna('').astype(str)

# conduct sentiment analysis on lyrics
pos_lyrics = []
neg_lyrics = []
neu_lyrics = []
compound_lyrics = []

# get sentiment scores from lyrics
for text in df['lyrics']:
    # analyze the sentiment
    scores_lyrics = sia.polarity_scores(text)

    # pull out scores
    pos_lyrics.append(scores_lyrics['pos'])
    neg_lyrics.append(scores_lyrics['neg'])
    neu_lyrics.append(scores_lyrics['neu'])
    compound_lyrics.append(scores_lyrics['compound'])

# add columns for scores
df['lyrics_pos'] = pos_lyrics
df['lyrics_neg'] = neg_lyrics
df['lyrics_neu'] = neu_lyrics
df['lyrics_compound'] = compound_lyrics

df.head()

Unnamed: 0,title,artist,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,country,jazz,electronic,reggae,metal,folk,lyrics_pos,lyrics_neg,lyrics_neu,lyrics_compound
0,...baby one more time,britney spears,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,...,0,0,1,1,0,0,0.068,0.054,0.879,0.3382
1,doo wop (that thing),lauryn hill,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,...,0,0,0,1,0,0,0.141,0.054,0.805,0.9967
2,have you ever?,brandy,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,...,0,0,0,0,0,0,0.105,0.11,0.785,-0.4454
3,love like this,faith evans,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,...,0,0,0,0,0,0,0.374,0.021,0.605,0.9996
4,this kiss,faith hill,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,...,1,0,0,0,0,0,0.45,0.039,0.511,0.9997


In [5]:
# conduct sentiment analysis on titles
pos_titles = []
neg_titles = []
neu_titles = []
compound_titles = []

# get sentiment scores from titles
for text in df['title']:
    # analyze the sentiment
    scores_titles = sia.polarity_scores(text)

    # pull out scores
    pos_titles.append(scores_titles['pos'])
    neg_titles.append(scores_titles['neg'])
    neu_titles.append(scores_titles['neu'])
    compound_titles.append(scores_titles['compound'])

# add columns for scores
df['title_pos'] = pos_titles
df['title_neg'] = neg_titles
df['title_neu'] = neu_titles
df['title_compound'] = compound_titles

df.head()

Unnamed: 0,title,artist,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,metal,folk,lyrics_pos,lyrics_neg,lyrics_neu,lyrics_compound,title_pos,title_neg,title_neu,title_compound
0,...baby one more time,britney spears,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,...,0,0,0.068,0.054,0.879,0.3382,0.0,0.0,1.0,0.0
1,doo wop (that thing),lauryn hill,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,...,0,0,0.141,0.054,0.805,0.9967,0.0,0.0,1.0,0.0
2,have you ever?,brandy,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,...,0,0,0.105,0.11,0.785,-0.4454,0.0,0.0,1.0,0.0
3,love like this,faith evans,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,...,0,0,0.374,0.021,0.605,0.9996,0.87,0.0,0.13,0.7717
4,this kiss,faith hill,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,...,0,0,0.45,0.039,0.511,0.9997,0.737,0.0,0.263,0.4215


In [6]:
# get sentiment subjectivity
df['lyrics_subjectivity'] = df['lyrics'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

df.head()

Unnamed: 0,title,artist,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,folk,lyrics_pos,lyrics_neg,lyrics_neu,lyrics_compound,title_pos,title_neg,title_neu,title_compound,lyrics_subjectivity
0,...baby one more time,britney spears,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,...,0,0.068,0.054,0.879,0.3382,0.0,0.0,1.0,0.0,0.629464
1,doo wop (that thing),lauryn hill,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,...,0,0.141,0.054,0.805,0.9967,0.0,0.0,1.0,0.0,0.539551
2,have you ever?,brandy,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,...,0,0.105,0.11,0.785,-0.4454,0.0,0.0,1.0,0.0,0.561958
3,love like this,faith evans,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,...,0,0.374,0.021,0.605,0.9996,0.87,0.0,0.13,0.7717,0.622504
4,this kiss,faith hill,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,...,0,0.45,0.039,0.511,0.9997,0.737,0.0,0.263,0.4215,0.578056


In [7]:
# remove standard stopwords but keep pronouns
stop_words = set(stopwords.words("english")) - {"i","you","me","my","we","us"}

corpus_cleaned = []

# clean lyrics through tokenization
for text in df['lyrics']:
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    corpus_cleaned.append(" ".join(tokens))

In [8]:
# get lexical features including word/char counts, avg. word length, unique words count, and vocab richness
df['word_count'] = df['lyrics'].str.split().apply(len)
df['char_count'] = df['lyrics'].str.len()
df['avg_word_len'] = df['char_count'] / df['word_count']
df['unique_words'] = df['lyrics'].apply(lambda x: len(set(x.split())))
df['vocab_richness'] = df['unique_words'] / df['word_count']
df.head()

Unnamed: 0,title,artist,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,title_pos,title_neg,title_neu,title_compound,lyrics_subjectivity,word_count,char_count,avg_word_len,unique_words,vocab_richness
0,...baby one more time,britney spears,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,...,0.0,0.0,1.0,0.0,0.629464,319,1546,4.846395,98,0.30721
1,doo wop (that thing),lauryn hill,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,...,0.0,0.0,1.0,0.0,0.539551,951,5066,5.327024,428,0.450053
2,have you ever?,brandy,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,...,0.0,0.0,1.0,0.0,0.561958,485,2456,5.063918,141,0.290722
3,love like this,faith evans,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,...,0.87,0.0,0.13,0.7717,0.622504,439,2124,4.838269,132,0.300683
4,this kiss,faith hill,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,...,0.737,0.0,0.263,0.4215,0.578056,320,1720,5.375,117,0.365625


In [9]:
# tokenized text
tokenized_lyrics = [text.split() for text in corpus_cleaned]

# dictionary and corpus
dictionary = corpora.Dictionary(tokenized_lyrics)
corpus_gensim = [dictionary.doc2bow(text) for text in tokenized_lyrics]

# LDA with 10 topics
lda = LdaModel(corpus=corpus_gensim, id2word=dictionary, num_topics=10, passes=10, random_state=42)

# topic probabilities per song
song_topics = [lda.get_document_topics(bow, minimum_probability=0) for bow in corpus_gensim]
X_lda = np.array([[prob for _, prob in song] for song in song_topics])

# add to dataframe
lda_df = pd.DataFrame(X_lda, columns=[f'Topic_{i+1}' for i in range(X_lda.shape[1])])

In [10]:
# print out topics
lda.print_topics()

[(0,
  '0.141*"love" + 0.039*"i" + 0.035*"ooh" + 0.033*"you" + 0.017*"girl" + 0.017*"got" + 0.016*"ah" + 0.013*"time" + 0.013*"me" + 0.011*"like"'),
 (1,
  '0.109*"i" + 0.047*"you" + 0.028*"we" + 0.019*"my" + 0.012*"time" + 0.012*"never" + 0.012*"me" + 0.011*"one" + 0.010*"know" + 0.009*"away"'),
 (2,
  '0.190*"i" + 0.046*"my" + 0.033*"you" + 0.028*"me" + 0.019*"got" + 0.016*"like" + 0.013*"ca" + 0.011*"know" + 0.009*"ai" + 0.009*"get"'),
 (3,
  '0.025*"jump" + 0.024*"boom" + 0.022*"ha" + 0.019*"dat" + 0.016*"dance" + 0.016*"hot" + 0.015*"clap" + 0.014*"bass" + 0.013*"let" + 0.012*"ay"'),
 (4,
  '0.063*"i" + 0.033*"yeah" + 0.029*"my" + 0.027*"nigga" + 0.021*"you" + 0.020*"bitch" + 0.019*"like" + 0.018*"got" + 0.017*"me" + 0.014*"shit"'),
 (5,
  '0.158*"you" + 0.085*"i" + 0.066*"me" + 0.028*"oh" + 0.026*"my" + 0.022*"baby" + 0.022*"yeah" + 0.019*"know" + 0.015*"like" + 0.014*"let"'),
 (6,
  '0.041*"ayy" + 0.029*"shake" + 0.018*"back" + 0.016*"drop" + 0.016*"body" + 0.015*"run" + 0.015*"

In [11]:
# rename topics
topic_names = {
    1: "Love_Romantic",
    2: "Personal_Relationship",
    3: "Self_Introspective",
    4: "Party_Dance",
    5: "Rap_HipHop",
    6: "Love_Pop",
    7: "Dance_Club",
    8: "Catchy_Hook",
    9: "Spanish_Latin",
    10: "Friendship_Group"
}

# rename columns
lda_df.rename(columns={f"Topic_{i}": topic_names[i] for i in range(1, 11)}, inplace=True)

# add to dataframe
df = pd.concat([df, lda_df], axis=1)

In [12]:
print(df.columns)
df.head()

Index(['title', 'artist', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'view_count', 'like_count', 'comment_count', 'chart_year',
       'lyrics', 'type_Group', 'type_Person', 'country_CA', 'country_GB',
       'country_Other', 'country_US', 'key_0', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'duration_min', 'begin_year', 'pop', 'rock', 'hip-hop', 'r&b',
       'country', 'jazz', 'electronic', 'reggae', 'metal', 'folk',
       'lyrics_pos', 'lyrics_neg', 'lyrics_neu', 'lyrics_compound',
       'title_pos', 'title_neg', 'title_neu', 'title_compound',
       'lyrics_subjectivity', 'word_count', 'char_count', 'avg_word_len',
       'unique_words', 'vocab_richness', 'Love_Romantic',
       'Personal_Relationship', 'Self_Introspective', 'Party_Dance',
       'Rap_HipHop', 'Love_Pop', 'Dance_Club', 'Catchy_Hook', 'Spanish_Latin',
  

Unnamed: 0,title,artist,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,Love_Romantic,Personal_Relationship,Self_Introspective,Party_Dance,Rap_HipHop,Love_Pop,Dance_Club,Catchy_Hook,Spanish_Latin,Friendship_Group
0,...baby one more time,britney spears,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,...,0.000433,0.000433,0.211802,0.000433,0.000433,0.784733,0.000433,0.000433,0.000433,0.000433
1,doo wop (that thing),lauryn hill,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,...,0.050698,0.041704,0.000189,0.000189,0.257439,0.39287,0.006479,0.038306,0.002528,0.209597
2,have you ever?,brandy,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,...,0.016266,0.352626,0.000327,0.000327,0.000327,0.581893,0.000327,0.047254,0.000327,0.000327
3,love like this,faith evans,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,...,0.129899,0.106507,0.000361,0.000361,0.000361,0.741092,0.000361,0.020336,0.000361,0.000361
4,this kiss,faith hill,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,...,0.259754,0.09331,0.000483,0.000483,0.000483,0.458359,0.000483,0.053193,0.089151,0.044301


In [13]:
# save dataframe to CSV
df.to_csv("model_ready_dataset.csv", index=False)