In [168]:
import pandas as pd
import numpy as np
import nltk
import re

from nltk.tokenize import word_tokenize, sent_tokenize

# Download NLTK's 'punkt' resource up front (presumably the tokenizer models
# needed by the word_tokenize / sent_tokenize imports above -- TODO confirm).
nltk.download('punkt')

# Data

In [200]:
df = pd.read_csv('lyrics.csv')
# Keep only the songs that actually have lyrics. The explicit .copy() makes
# `df` an independent frame rather than a filtered slice, so the column
# assignments performed during cleaning do not raise SettingWithCopyWarning.
# The original row index is intentionally preserved.
df = df[df['lyrics'].notna()].copy()

# Subset of songs whose description is not the '?' placeholder, kept for
# *maybe* future use
data = df.query("description != '?'")

In [183]:
# Summary (columns, non-null counts, dtypes) of the full dataset after
# dropping the songs without lyrics
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 382 entries, 0 to 384
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist        382 non-null    object
 1   title         382 non-null    object
 2   lyrics        382 non-null    object
 3   description   382 non-null    object
 4   release date  373 non-null    object
dtypes: object(5)
memory usage: 17.9+ KB


In [184]:
# Summary of the subset whose description is not the '?' placeholder
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316 entries, 0 to 384
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist        316 non-null    object
 1   title         316 non-null    object
 2   lyrics        316 non-null    object
 3   description   316 non-null    object
 4   release date  313 non-null    object
dtypes: object(5)
memory usage: 14.8+ KB


# Data Cleaning


### Cleaning `lyrics`, `artist`, and `title`  
In this section we remove any unnecessary text found in the `lyrics` column. In addition, we lowercase the `artist` and `title` columns.

In [201]:
def clean_lyrics(test_str):
    """Normalise one raw lyrics value into plain lowercase text.

    Steps applied, in order:

    0. Convert the value to ``str``, lowercase it and drop straight
       apostrophes (``'``) so contractions stay a single token
       ("don't" -> "dont").
    1. Keep only the text between the first occurrence of the word
       ``lyrics`` (treated as the end of a header) and a trailing
       ``<digits>embed`` marker at the very end of the string.
    2. Remove the punctuation characters ``. , - ? : ! ;``. Any other
       character (parentheses, quotes, newlines, ...) is left in place.

    Parameters
    ----------
    test_str : object
        Raw lyrics value. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned lyrics.

    Notes
    -----
    If the ``lyrics`` header is missing, the text is kept from its start;
    if the ``embed`` footer is missing, it is kept through its end. Earlier
    this raised ``AttributeError`` because ``re.search`` returns ``None``
    when nothing matches. Text that has no header but does contain the word
    "lyrics" is still cut after that first occurrence.
    """
    # 0) Removing apostrophes for tokenization purposes
    text = str(test_str).lower().replace("'", "")

    # 1) Removing unnecessary textual data (header before / footer after)
    header = re.search(r'lyrics', text)
    footer = re.search(r'\d*embed$', text)

    # Fall back to the string boundaries when a marker is absent
    start = header.end() if header is not None else 0
    stop = footer.start() if footer is not None else len(text)
    text = text[start:stop]

    # 2) Removing any punctuation (except parentheses)
    return re.sub(r'[.,\-?:!;]', '', text)

In [202]:
# Run every song's raw lyrics through the cleaning routine
df['lyrics'] = df['lyrics'].map(clean_lyrics)

# The remaining identifier columns only need lowercasing
for text_col in ('artist', 'title'):
    df[text_col] = df[text_col].str.lower()

In [203]:
# Preview the cleaned frame (the rendered output below elides the middle rows)
df

Unnamed: 0,artist,title,lyrics,description,release date
0,ray charles,hit the road jack,\nhit the road jack and doncha come back\nno m...,This tongue and cheek verbal duel of a couple ...,August 1961
1,ray charles,georgia on my mind,\ngeorgia\ngeorgia\nthe whole day through\n(th...,Written by Hoagy Carmichael and Stuart Gorrell...,September 1960
2,ray charles,i’ve got a woman,\nwell\n\ni got a woman way over town\nthats g...,Ray Charles released “I’ve Got a Woman” as a s...,December 1954
3,ray charles,i can’t stop loving you,\n(i cant stop loving you)\nive made up my min...,?,1962
4,ken nordine,yellow,in the beginning\noh long before that\nwhen li...,?,"January 1, 1966"
...,...,...,...,...,...
380,raveena,temptation,ahahah\nahahah\n\nmiss temptation i dont think...,"In “Temptation”, Raveena opens up about her bi...","October 23, 2018"
381,the notorious b.i.g.,juicy,"\n(""fuck all you hoes"" get a grip motherfucker...",“Juicy” is the first single from Big’s debut a...,"August 9, 1994"
382,the notorious b.i.g.,big poppa,\nuh uh check it out (yeah) uh\njunior mafia u...,“Big Poppa” was The Notorious B.I.G.’s first t...,"February 20, 1995"
383,the notorious b.i.g.,suicidal thoughts,\nhello\naw shit nigga the fuck time is it man...,In this final track off of The Notorious B.I.G...,"September 13, 1994"
