In [2]:
import pandas as pd
import json
import string
import ast
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords

import data_collection

## Billboard Hot 100 chart positions

Up-to-date data about Billboard Hot 100 tracks is located in the **all.json** file.

In [4]:
# Read the raw weekly chart export into memory
with open('data/all.json', 'r') as chart_file:
    json_data = json.load(chart_file)

# Flatten the export: one row per entry of each record's 'data' list,
# with that record's 'date' repeated on every row it produced
hot_100 = pd.json_normalize(json_data, record_path='data', meta=['date'], errors='ignore')

In [5]:
hot_100

Unnamed: 0,song,artist,this_week,last_week,peak_position,weeks_on_chart,date
0,Poor Little Fool,Ricky Nelson,1,,1,1,1958-08-04
1,Patricia,Perez Prado And His Orchestra,2,,2,1,1958-08-04
2,Splish Splash,Bobby Darin,3,,3,1,1958-08-04
3,Hard Headed Woman,Elvis Presley With The Jordanaires,4,,4,1,1958-08-04
4,When,Kalin Twins,5,,5,1,1958-08-04
...,...,...,...,...,...,...,...
344582,Alibi,"Sevdaliza, Pabllo Vittar & Yseult",96,98.0,95,3,2024-08-17
344583,Parking Lot,Mustard & Travis Scott,97,81.0,57,4,2024-08-17
344584,Wine Into Whiskey,Tucker Wetmore,98,95.0,68,19,2024-08-17
344585,"Love You, Miss You, Mean It",Luke Bryan,99,,99,1,2024-08-17


In [6]:
hot_100.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344587 entries, 0 to 344586
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   song            344587 non-null  object 
 1   artist          344587 non-null  object 
 2   this_week       344587 non-null  int64  
 3   last_week       310009 non-null  float64
 4   peak_position   344587 non-null  int64  
 5   weeks_on_chart  344587 non-null  int64  
 6   date            344587 non-null  object 
dtypes: float64(1), int64(3), object(3)
memory usage: 18.4+ MB


In [7]:
# Parse the chart dates (read as plain strings) into proper datetime values
hot_100['date'] = pd.to_datetime(hot_100['date'])

Currently the dataset contains information about each week's 100 songs. This leads to a lot of duplicate values, because most songs chart for multiple weeks. For the purposes of the project I want to have each song only once in the dataset, with its peak position and its weeks on chart. To do so, I will sort the dataframe by date (newest first) and by the current week's position, and then keep only the first occurrence of each song (its most recent chart entry), dropping all the rest.

Before I do that, I want to make a new column that will contain the year in which the song first entered the chart (most often the release year).

In [9]:
# Earliest chart date for every (artist, song) pair, and the calendar year of that date
first_entry = hot_100.groupby(['artist', 'song'])['date'].min()
first_entry_year = first_entry.dt.year

# Build each row's (artist, song) key and look it up in the per-song table,
# so every weekly row carries the year its song first charted
song_keys = hot_100.set_index(['artist', 'song']).index
hot_100['year'] = song_keys.map(first_entry_year)

In [10]:
# Order rows newest week first (best position first within a week), then keep
# a single row per (song, artist): the song's most recent chart appearance.
hot_100 = (
    hot_100
    .sort_values(by=['date', 'this_week'], ascending=[False, True])
    .drop_duplicates(subset=['song', 'artist'], keep='first')
    .reset_index(drop=True)
)

In [11]:
hot_100

Unnamed: 0,song,artist,this_week,last_week,peak_position,weeks_on_chart,date,year
0,A Bar Song (Tipsy),Shaboozey,1,1.0,1,17,2024-08-17,2024
1,I Had Some Help,Post Malone Featuring Morgan Wallen,2,2.0,1,13,2024-08-17,2024
2,Not Like Us,Kendrick Lamar,3,3.0,1,14,2024-08-17,2024
3,Espresso,Sabrina Carpenter,4,4.0,3,17,2024-08-17,2024
4,Million Dollar Baby,Tommy Richman,5,5.0,2,15,2024-08-17,2024
...,...,...,...,...,...,...,...,...
31442,Stay,The Ames Brothers,90,,90,1,1958-08-04,1958
31443,Over And Over,Thurston Harris,96,,96,1,1958-08-04,1958
31444,Little Serenade,The Ames Brothers,98,,98,1,1958-08-04,1958
31445,I'll Get By (As Long As I Have You),Billy Williams,99,,99,1,1958-08-04,1958


## Spotify Features

The next dataset contains spotify audio features for songs.

In [13]:
# Spotify audio features for charted songs; its `Performer` and `Song` columns are
# the keys used further down to match rows against the Billboard data.
spotify_features = pd.read_csv("data/Hot 100 Audio Features.csv")

In [14]:
spotify_features

Unnamed: 0,index,SongID,Performer,Song,spotify_genre,spotify_track_id,spotify_track_preview_url,spotify_track_duration_ms,spotify_track_explicit,spotify_track_album,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,spotify_track_popularity
0,0,-twistin'-White Silver SandsBill Black's Combo,Bill Black's Combo,-twistin'-White Silver Sands,[],,,,,,...,,,,,,,,,,
1,1,¿Dònde Està Santa Claus? (Where Is Santa Claus...,Augie Rios,¿Dònde Està Santa Claus? (Where Is Santa Claus?),['novelty'],,,,,,...,,,,,,,,,,
2,2,......And Roses And RosesAndy Williams,Andy Williams,......And Roses And Roses,"['adult standards', 'brill building pop', 'eas...",3tvqPPpXyIgKrm4PR9HCf0,https://p.scdn.co/mp3-preview/cef4883cfd1e0e53...,166106.0,False,The Essential Andy Williams,...,-14.063,1.0,0.0315,0.91100,0.000267,0.1120,0.150,83.969,4.0,38.0
3,3,...And Then There Were DrumsSandy Nelson,Sandy Nelson,...And Then There Were Drums,"['rock-and-roll', 'space age pop', 'surf music']",1fHHq3qHU8wpRKHzhojZ4a,,172066.0,False,Compelling Percussion,...,-17.278,0.0,0.0361,0.00256,0.745000,0.1450,0.801,121.962,4.0,11.0
4,4,...Baby One More TimeBritney Spears,Britney Spears,...Baby One More Time,"['dance pop', 'pop', 'post-teen pop']",3MjUtNVVq3C8Fn0MP3zhXa,https://p.scdn.co/mp3-preview/da2134a161f1cb34...,211066.0,False,...Baby One More Time (Digital Deluxe Version),...,-5.745,0.0,0.0307,0.20200,0.000131,0.4430,0.907,92.960,4.0,77.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29498,29498,Zoo YorkLil Tjay Featuring Fivio Foreign & Pop...,Lil Tjay Featuring Fivio Foreign & Pop Smoke,Zoo York,,,,,,,...,,,,,,,,,,
29499,29499,ZoomFuture,Future,Zoom,"['atl hip hop', 'hip hop', 'pop rap', 'rap', '...",2IG6Te7JyvrtqhFeOF7le4,https://p.scdn.co/mp3-preview/cb8fde6edc08e70a...,278429.0,True,FUTURE,...,-7.673,1.0,0.4260,0.01450,0.000000,0.2630,0.627,150.945,4.0,51.0
29500,29500,ZoomLil' Boosie Featuring Yung Joc,Lil' Boosie Featuring Yung Joc,Zoom,"['baton rouge rap', 'deep southern trap']",,,,,,...,,,,,,,,,,
29501,29501,Zorba The GreekHerb Alpert & The Tijuana Brass,Herb Alpert & The Tijuana Brass,Zorba The Greek,"['adult standards', 'easy listening', 'lounge']",3WLEVNohakzZmMpN5W7mHK,https://p.scdn.co/mp3-preview/1841a4034ba42fc0...,264853.0,False,!!!Going Places!!!,...,-12.702,1.0,0.3230,0.15400,0.279000,0.0584,0.192,82.107,4.0,35.0


The two datasets will be merged based on matches in song titles and artists. There are missing values for the spotify features, which will be handled at a later stage.

In [16]:
# Left join: every Billboard song is kept, with Spotify features attached where the
# (artist, song) pair matches (Performer, Song). Repeated matches are collapsed to the first.
merged_data = (
    hot_100
    .merge(
        spotify_features,
        how='left',
        left_on=['artist', 'song'],
        right_on=['Performer', 'Song'],
    )
    .drop_duplicates(subset=['Performer', 'Song', 'artist', 'song'])
)

In [17]:
merged_data

Unnamed: 0,song,artist,this_week,last_week,peak_position,weeks_on_chart,date,year,index,SongID,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,spotify_track_popularity
0,A Bar Song (Tipsy),Shaboozey,1,1.0,1,17,2024-08-17,2024,,,...,,,,,,,,,,
1,I Had Some Help,Post Malone Featuring Morgan Wallen,2,2.0,1,13,2024-08-17,2024,,,...,,,,,,,,,,
2,Not Like Us,Kendrick Lamar,3,3.0,1,14,2024-08-17,2024,,,...,,,,,,,,,,
3,Espresso,Sabrina Carpenter,4,4.0,3,17,2024-08-17,2024,,,...,,,,,,,,,,
4,Million Dollar Baby,Tommy Richman,5,5.0,2,15,2024-08-17,2024,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31558,Stay,The Ames Brothers,90,,90,1,1958-08-04,1958,22687.0,StayThe Ames Brothers,...,,,,,,,,,,
31559,Over And Over,Thurston Harris,96,,96,1,1958-08-04,1958,18609.0,Over And OverThurston Harris,...,,,,,,,,,,
31560,Little Serenade,The Ames Brothers,98,,98,1,1958-08-04,1958,14831.0,Little SerenadeThe Ames Brothers,...,,,,,,,,,,
31561,I'll Get By (As Long As I Have You),Billy Williams,99,,99,1,1958-08-04,1958,11767.0,I'll Get By (As Long As I Have You)Billy Williams,...,,,,,,,,,,


In [18]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31447 entries, 0 to 31562
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   song                       31447 non-null  object        
 1   artist                     31447 non-null  object        
 2   this_week                  31447 non-null  int64         
 3   last_week                  27499 non-null  float64       
 4   peak_position              31447 non-null  int64         
 5   weeks_on_chart             31447 non-null  int64         
 6   date                       31447 non-null  datetime64[ns]
 7   year                       31447 non-null  int32         
 8   index                      29181 non-null  float64       
 9   SongID                     29181 non-null  object        
 10  Performer                  29181 non-null  object        
 11  Song                       29181 non-null  object        
 12  spotify_g

In [19]:
merged_data.columns

Index(['song', 'artist', 'this_week', 'last_week', 'peak_position',
       'weeks_on_chart', 'date', 'year', 'index', 'SongID', 'Performer',
       'Song', 'spotify_genre', 'spotify_track_id',
       'spotify_track_preview_url', 'spotify_track_duration_ms',
       'spotify_track_explicit', 'spotify_track_album', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'spotify_track_popularity'],
      dtype='object')

Lyrics aren't available for all the songs, only for about 2/3 of them. The final dataset has a LOT of features. Some of them are duplicates and some won't be needed, so let's drop them.

In [21]:
# Remove columns that duplicate the Billboard keys (Performer, Song, SongID),
# Spotify identifiers/links, and fields that are not used in the rest of the notebook.
merged_data = merged_data.drop(
    columns=[
        'index',
        'Performer',
        'Song',
        'SongID',
        'spotify_track_id',
        'spotify_track_preview_url',
        'spotify_track_album',
        'time_signature',
        'last_week',
    ]
)

In [22]:
# Guarantee one row per (artist, song) pair
merged_data = merged_data.drop_duplicates(subset=['artist', 'song'])

# The explicit flag is True/False where Spotify data exists, so store it as a category
merged_data['spotify_track_explicit'] = merged_data['spotify_track_explicit'].astype('category')

In [23]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31447 entries, 0 to 31562
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   song                       31447 non-null  object        
 1   artist                     31447 non-null  object        
 2   this_week                  31447 non-null  int64         
 3   peak_position              31447 non-null  int64         
 4   weeks_on_chart             31447 non-null  int64         
 5   date                       31447 non-null  datetime64[ns]
 6   year                       31447 non-null  int32         
 7   spotify_genre              27623 non-null  object        
 8   spotify_track_duration_ms  24153 non-null  float64       
 9   spotify_track_explicit     24153 non-null  category      
 10  danceability               24099 non-null  float64       
 11  energy                     24099 non-null  float64       
 12  key      

I would rather have the track duration in seconds than in milliseconds, as it is easier to understand.

In [25]:
# Convert the track duration from milliseconds to seconds and rename the column in the
# same step, so its name never disagrees with its unit. (The original cell ended with a
# no-op `rename(columns={})`.)
# The guard makes the cell safe to re-run: once the column has been renamed, the
# division is skipped instead of silently dividing the values by 1000 a second time.
if "spotify_track_duration_ms" in merged_data.columns:
    merged_data["spotify_track_duration_ms"] = merged_data["spotify_track_duration_ms"] / 1000
    # rename() keeps the column in its original position
    merged_data = merged_data.rename(
        columns={"spotify_track_duration_ms": "track_duration_s"}
    )

Some songs have more than one artist singing them. I want to extract the main artist, so that I can later use the main artist's name to fill in the Spotify features for the songs that are missing them right now, and hopefully fill in more of the missing lyrics as well.

Some values in the artist column have strings like 'featuring', '+', 'and' etc. I will also create a column that will contain a list of all the artists. This can be useful for later data analysis.

I will use the `get_artists` function I created, which lets me get either the main artist or a list of all artists, depending on whether the parameter `main_only` is set to **True** or **False**.

In [27]:
# Run the project helper over every artist credit twice: once for the main artist only,
# once for the full list of credited artists (keyword arguments are forwarded by apply).
artist_credits = merged_data['artist']
merged_data['main_artist'] = artist_credits.apply(data_collection.get_artists, main_only=True)
merged_data['artist_list'] = artist_credits.apply(data_collection.get_artists, main_only=False)

In [28]:
# Give the Spotify columns shorter names, then write the cleaned merge to disk
spotify_column_names = {
    "spotify_track_popularity": "spotify_popularity",
    "spotify_track_explicit": "explicit_track",
    "spotify_track_duration_ms": "track_duration_s",
}
merged_data = merged_data.rename(columns=spotify_column_names)

merged_data.to_csv("data/hot_100_spotify.csv")

## Song Lyrics

The third dataset I will be using contains song names and lyrics for each song, along with sentiment analysis of the lyrics.

In [30]:
# Lyrics dataset with pre-computed word lists, lexical-diversity measures and
# sentiment/emotion scores (the full column listing is shown a few cells below).
song_lyrics = pd.read_excel("data/Final processed dataset.xlsx")

In [31]:
song_lyrics

Unnamed: 0.1,Unnamed: 0,url,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,...,anticipation_normalized,fear,fear_normalized,surprise,surprise_normalized,emo_score,happy,happy_normalized,sorrow,sorrow_normalized
0,0,http://www.billboard.com/charts/hot-100/1963-0...,6/1/1963,11,Still,bill anderson,StillBill Anderson,1.0,17.0,11.0,...,0.020294,3.387,0.033206,1.516,0.014863,31.281,19.907,0.195167,11.374,0.111510
1,1,http://www.billboard.com/charts/hot-100/1967-0...,1/7/1967,11,Coming Home Soldier,bobby vinton,Coming Home SoldierBobby Vinton,1.0,17.0,11.0,...,0.005624,0.000,0.000000,0.000,0.000000,17.127,11.262,0.121097,5.865,0.063065
2,3,http://www.billboard.com/charts/hot-100/1975-1...,11/29/1975,11,Saturday Night,bay city rollers,Saturday NightBay City Rollers,1.0,17.0,11.0,...,0.005283,0.000,0.000000,0.438,0.003021,9.760,8.994,0.062028,0.766,0.005283
3,5,http://www.billboard.com/charts/hot-100/1987-0...,9/19/1987,11,Carrie,europe,CarrieEurope,1.0,17.0,11.0,...,0.058026,5.526,0.047231,5.625,0.048077,35.788,11.495,0.098248,24.293,0.207632
4,6,http://www.billboard.com/charts/hot-100/1987-1...,10/3/1987,11,Casanova,levert,CasanovaLevert,1.0,17.0,11.0,...,0.023054,1.242,0.007393,0.422,0.002512,40.625,30.756,0.183071,9.869,0.058744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22595,325636,http://www.billboard.com/charts/hot-100/1985-0...,7/27/1985,82,You Look Marvelous,billy crystal,You Look MarvelousBilly Crystal,1.0,,82.0,...,0.008592,4.212,0.015485,9.977,0.036680,65.506,52.148,0.191721,13.358,0.049110
22596,325648,http://www.billboard.com/charts/hot-100/1963-0...,9/28/1963,70,You Lost The Sweetest Boy,mary wells,You Lost The Sweetest BoyMary Wells,1.0,,70.0,...,0.032139,7.207,0.062670,1.915,0.016652,45.737,16.471,0.143226,29.266,0.254487
22597,325657,http://www.billboard.com/charts/hot-100/1969-0...,8/2/1969,96,You Made A Believer (Out Of Me),ruby andrews,You Made A Believer (Out Of Me)Ruby Andrews,1.0,,96.0,...,0.021875,0.000,0.000000,0.852,0.007607,42.419,37.528,0.335071,4.891,0.043670
22598,325659,http://www.billboard.com/charts/hot-100/1977-0...,6/4/1977,69,You Made Me Believe In Magic,bay city rollers,You Made Me Believe In MagicBay City Rollers,1.0,,69.0,...,0.000000,0.000,0.000000,0.000,0.000000,20.485,19.766,0.366037,0.719,0.013315


In [32]:
song_lyrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22600 entries, 0 to 22599
Data columns (total 49 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               22600 non-null  int64  
 1   url                      22600 non-null  object 
 2   WeekID                   22600 non-null  object 
 3   Week Position            22600 non-null  int64  
 4   Song                     22600 non-null  object 
 5   Performer                22600 non-null  object 
 6   SongID                   22600 non-null  object 
 7   Instance                 22046 non-null  float64
 8   Previous Week Position   5058 non-null   float64
 9   Peak Position            22046 non-null  float64
 10  Weeks on Chart           22046 non-null  float64
 11  Lyrics                   22600 non-null  object 
 12  Artist                   22600 non-null  object 
 13  words                    22600 non-null  object 
 14  wordCount             

In [33]:
song_lyrics.columns

Index(['Unnamed: 0', 'url', 'WeekID', 'Week Position', 'Song', 'Performer',
       'SongID', 'Instance', 'Previous Week Position', 'Peak Position',
       'Weeks on Chart', 'Lyrics', 'Artist', 'words', 'wordCount', 'languages',
       'all_words', 'allWordCount', 'year', 'decade', 'MTLD', 'TTR', 'CTTR',
       'sentimentScore', 'sentimentScore_pos', 'sentimentScore_neg',
       'emo_words', 'emoWordCount', 'joy', 'joy_normalized', 'sadness',
       'sadness_normalized', 'anger', 'anger_normalized', 'disgust',
       'disgust_normalized', 'trust', 'trust_normalized', 'anticipation',
       'anticipation_normalized', 'fear', 'fear_normalized', 'surprise',
       'surprise_normalized', 'emo_score', 'happy', 'happy_normalized',
       'sorrow', 'sorrow_normalized'],
      dtype='object')

In [34]:
song_lyrics[song_lyrics.Song == "The Hills"][['Peak Position', 'Weeks on Chart']]

Unnamed: 0,Peak Position,Weeks on Chart
22501,20.0,1.0


After examining some songs' peak positions in the `hot_100` dataset and comparing them to the dataset with song lyrics, I noticed that there is a difference between them. It looks like the peak positions are not extracted properly in the lyrics dataset. I will fix that by merging the two datasets and keeping only the peak position from the `hot_100` dataset.

**Example**: The Weeknd - The Hills: there is a difference in the peak position and the weeks on chart between the two datasets.

In [36]:
hot_100[hot_100.song == "The Hills"]

Unnamed: 0,song,artist,this_week,last_week,peak_position,weeks_on_chart,date,year
4972,The Hills,The Weeknd,48,39.0,1,48,2016-05-07,2015


This dataset has A LOT of features, some of which won't be needed. I will drop the columns that won't be needed and then make the column names in snake case.

The columns `Artist` and `Performer` both contain artist names, however only the main artist name is kept in the former. This is why I am going to keep it, rather than `Performer`, which keeps information about more than one artist if there are featuring artists.

`TTR`, `MTLD` and `CTTR` are measures of lexical diversity. `TTR` and `CTTR` are computed by making comparisons between the total word count and the unique word count, which makes them sensitive to how long the lyrics are, while `MTLD` is designed to be more robust to text length. Among them I will keep `MTLD` and drop the other two.

I will also drop the columns for the non-normalized features and rename the normalized ones.

In [38]:
song_lyrics = song_lyrics.drop(columns=[
    # Chart bookkeeping and identifiers (chart data is taken from the Billboard dataset instead)
    'Unnamed: 0', 'url', 'WeekID', 'Week Position', 'SongID', 'Instance',
    'Previous Week Position', 'Peak Position', 'Weeks on Chart', 'Performer',
    # Raw (non-normalized) sentiment and emotion scores; the *_normalized versions are kept
    'sentimentScore_pos', 'sentimentScore_neg',
    'joy', 'sadness', 'anger', 'disgust', 'trust', 'anticipation',
    'fear', 'surprise', 'happy', 'sorrow',
    # Other fields that are not needed; of the lexical-diversity measures only MTLD stays
    'languages', 'decade', 'CTTR', 'TTR',
])

In [39]:
# Step 1: spaces become underscores
snake_case_names = song_lyrics.columns.str.replace(' ', '_')
# Step 2: put an underscore in front of every stand-alone capital letter (camelCase boundaries)
snake_case_names = snake_case_names.str.replace(
    r'(?<![A-Z])([A-Z])(?![A-Z])', r'_\1', regex=True
)
# Step 3: lowercase everything and trim the underscore that step 2 adds before a leading capital
song_lyrics.columns = snake_case_names.str.lower().str.strip('_')

In [40]:
# Drop the '_normalized' marker from the column names in a single pass.
# rename() applies the callable to every column label, so the frame is copied once
# instead of once per column, and no loop variable is left behind in the namespace.
song_lyrics = song_lyrics.rename(
    columns=lambda name: name.replace('_normalized', '')
)

The regex does the following:
* `(?<![A-Z])`: Negative lookbehind to ensure the character before the current letter is not a capital letter.
* `([A-Z])`: Matches a single uppercase letter.
* `(?![A-Z])`: Negative lookahead to ensure the character after the current letter is not a capital letter.
* `r'_\1'`: Replaces the matched letter with an underscore followed by the letter itself.

In [42]:
song_lyrics.columns

Index(['song', 'lyrics', 'artist', 'words', 'word_count', 'all_words',
       'all_word_count', 'year', 'mtld', 'sentiment_score', 'emo_words',
       'emo_word_count', 'joy', 'sadness', 'anger', 'disgust', 'trust',
       'anticipation', 'fear', 'surprise', 'emo_score', 'happy', 'sorrow'],
      dtype='object')

Now I will merge the lyric dataset with the spotify features dataset to get correct information about the songs' chart performance and get the spotify genres.

In [44]:
# Load the Spotify/Billboard dataset and keep only the chart-performance and genre columns
spotify = pd.read_csv("data/filled_spotify.csv")[
    ['song', 'main_artist', 'peak_position', 'weeks_on_chart', 'date', 'year', 'spotify_genre']
]

In [45]:
# Lowercase the artist names so they can be compared with the lyrics dataset's `artist` column
spotify['main_artist'] = spotify['main_artist'].str.lower()

In [46]:
# Inner join: keep only songs present in both datasets, matched on title and main artist.
# When a (song, artist) pair matches more than once, the last match is the one kept.
hot_100_lyrics = (
    spotify
    .merge(
        song_lyrics,
        how='inner',
        left_on=['song', 'main_artist'],
        right_on=['song', 'artist'],
    )
    .drop_duplicates(subset=['song', 'artist'], keep='last')
)

In [47]:
# `artist` repeats `main_artist`, and the merge produced two year columns:
# keep the one from the Spotify/Billboard side (year_x) under the plain name `year`.
hot_100_lyrics = (
    hot_100_lyrics
    .drop(columns=['artist', 'year_y'])
    .rename(columns={'year_x': 'year'})
)

In [48]:
hot_100_lyrics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18750 entries, 0 to 18769
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   song             18750 non-null  object 
 1   main_artist      18750 non-null  object 
 2   peak_position    18750 non-null  int64  
 3   weeks_on_chart   18750 non-null  int64  
 4   date             18750 non-null  object 
 5   year             18750 non-null  int64  
 6   spotify_genre    18690 non-null  object 
 7   lyrics           18750 non-null  object 
 8   words            18750 non-null  object 
 9   word_count       18750 non-null  int64  
 10  all_words        18750 non-null  object 
 11  all_word_count   18750 non-null  int64  
 12  mtld             18750 non-null  float64
 13  sentiment_score  18750 non-null  float64
 14  emo_words        18750 non-null  object 
 15  emo_word_count   18750 non-null  int64  
 16  joy              18750 non-null  float64
 17  sadness          

In [49]:
# The date column was read from CSV as text; turn it back into datetimes
hot_100_lyrics['date'] = pd.to_datetime(hot_100_lyrics['date'])

# Report the span of years covered by the merged dataset
print(f"Start year: {hot_100_lyrics.year.min()}\nMost recent year: {hot_100_lyrics.year.max()}")

Start year: 1958
Most recent year: 2020


By merging the data from the two datasets, we are left with a total of 18750 songs, released in the period of 1958 to 2020.

After examining the columns, containing the words of the lyrics, I noticed that some stopwords are not removed, which is something I think should be done in order to get more meaningful results when later analysing the lyrics.

In [51]:
hot_100_lyrics.words.head(15)

0     ['yeah', 'fever', 'dream', 'high', 'quiet', 'n...
1     ['get', 'fast', 'car', 'want', 'ticket', 'anyw...
2     ['rockin', 'around', 'christmas', 'tree', 'par...
3     ['want', 'lot', 'christmas', 'one', 'thing', '...
4     ['dingdongding', 'holly', 'jolly', 'christmas'...
5     ['wonderful', 'time', 'year', 'kid', 'jingle',...
6     ['letra', 'de', 'feliz', 'navidad', 'estribill...
7     ['hear', 'sleigh', 'bell', 'jingling', 'ring',...
8     ['santa', 'tell', 'really', 'make', 'fall', 'l...
9     ['snow', 'fall', 'caroler', 'sing', 'alone', '...
11    ['im', 'dream', 'white', 'christmas', 'like', ...
12    ['christmas', 'snow', 'come', 'im', 'watch', '...
15    ['jingle', 'bell', 'way', 'oh', 'fun', 'ride',...
16    ['ill', 'blue', 'christmas', 'without', 'ooh',...
17    ['come', 'santa', 'claus', 'right', 'lane', 'v...
Name: words, dtype: object

We can see that words like 'ill', 'im', 'yeah' etc. are still counted. Let's remove stopwords by using the `stopwords` corpus provided by the `nltk` library. It won't remove all stopwords, so I will add some extra ones myself.

In [53]:
# NLTK's English stop words plus a few lyric-specific leftovers: contractions that
# lost their apostrophe, filler words, and the empty string.
additional_stop_words = {'yeah', 'im', 'ill', 'oh', 'youre', 'id', ''}
stop_words = set(stopwords.words('english')) | additional_stop_words

# Turn the stored string representation of each word list into a real list.
# Only string values are parsed, so re-running this cell after the column already
# holds lists is a no-op instead of an error raised by ast.literal_eval.
hot_100_lyrics['all_words'] = hot_100_lyrics['all_words'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [54]:
# Filter every song's word list through the project helper, passing the stop-word set along
hot_100_lyrics['all_words'] = hot_100_lyrics['all_words'].apply(
    data_collection.remove_stopwords, args=(stop_words,)
)

In [55]:
hot_100_lyrics.all_words.head(15)

0     [fever, dream, high, quiet, night, know, caugh...
1     [get, fast, car, want, ticket, anywhere, maybe...
2     [rockin, around, christmas, tree, christmas, p...
3     [want, lot, christmas, one, thing, need, care,...
4     [dingdongding, dingdongding, dingdongding, hol...
5     [wonderful, time, year, kid, jingle, belling, ...
6     [letra, de, feliz, navidad, estribillo, feliz,...
7     [hear, sleigh, bell, jingling, ring, tingle, t...
8     [santa, tell, really, make, fall, love, next, ...
9     [snow, fall, caroler, sing, alone, christmas, ...
11    [dream, white, christmas, like, one, use, know...
12    [christmas, snow, come, christmas, watch, fall...
15    [jingle, bell, jingle, bell, jingle, way, fun,...
16    [blue, christmas, without, ooh, ooh, ooh, ooh,...
17    [come, santa, claus, come, santa, claus, right...
Name: all_words, dtype: object

Now I need to count the words and get the unique words again.

In [57]:
# Total number of words left in each song after stop-word removal
hot_100_lyrics['all_word_count'] = hot_100_lyrics['all_words'].apply(len)
# Unique words per song, and how many of them there are
hot_100_lyrics['words'] = hot_100_lyrics['all_words'].apply(set)
hot_100_lyrics['word_count'] = hot_100_lyrics['words'].apply(len)

I will create a new column that contains only the words that are either nouns or adjectives, using the `pos_tag` function from the `nltk` library. The tags I keep are the adjective tags (**'JJ'**, **'JJR'**, **'JJS'**) and the common-noun tags (**'NN'**, **'NNS'**); proper nouns (**'NNP'**, **'NNPS'**) are not included. This will be useful when analyzing the topic of the song.

In [59]:
# Part-of-speech tags to keep
nltk_tags = [
    # Adjectives
    'JJ',  # Adjective
    'JJR', # Adjective, comparative
    'JJS', # Adjective, superlative

    # Nouns
    'NN',  # Noun, singular or mass
    'NNS', # Noun, plural
]
kept_tags = frozenset(nltk_tags)  # constant-time membership test inside the loop

# Tag each song's unique words and keep only the nouns and adjectives.
# - The original filter line ended with a stray ')' and did not parse; that is fixed here.
# - `words` holds a set per song; it is sorted into a list before tagging because the
#   iteration order of a set of strings can differ between kernel sessions, which would
#   make the tagger's input order (and so possibly its output) non-reproducible.
# - Iterating over the column directly avoids building a Series for every row.
all_rows = [
    [word for word, tag in pos_tag(sorted(words)) if tag in kept_tags]
    for words in hot_100_lyrics['words']
]

hot_100_lyrics['nouns_adjectives'] = all_rows

Iterating through the rows gave me the fastest result, even though it's still not that fast.

In [60]:
#Save the dataset
# NOTE: CSV is plain text, so list/set-valued columns (e.g. `all_words`, `words`) are
# written as their string representation and must be parsed again after reading the
# file back, the same way `ast.literal_eval` is used on `all_words` earlier in this notebook.
hot_100_lyrics.to_csv("data/hot_100_lyrics.csv")