In [18]:
import pandas as pd
import re
from pathlib import Path

DATA_RAW = Path("../data/raw")
DATA_CLEAN = Path("../data/cleaned")
DATA_CLEAN.mkdir(exist_ok=True)


In [19]:
emotion_raw = pd.read_csv(DATA_RAW / "emotion_raw.csv")
emotion_raw.info()
emotion_raw.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  416809 non-null  int64 
 1   text        416809 non-null  object
 2   label       416809 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 9.5+ MB


Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [20]:
emotion_df = emotion_raw.copy()

# Drop useless index column if it exists
if "Unnamed: 0" in emotion_df.columns:
    emotion_df = emotion_df.drop(columns=["Unnamed: 0"])

# Rename to emotion_id
emotion_df = emotion_df.rename(columns={"label": "emotion_id"})

# Basic text cleaning
emotion_df["text"] = (
    emotion_df["text"]
    .astype(str)
    .str.strip()
)

# Remove empty / very short texts
emotion_df = emotion_df.dropna(subset=["text", "emotion_id"])
emotion_df = emotion_df[emotion_df["text"].str.len() > 5]

emotion_df.head()


Unnamed: 0,text,emotion_id
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [22]:
emotion_df.to_csv(DATA_CLEAN / "emotion_clean.csv", index=False)
emotion_df["emotion_id"].value_counts()


emotion_id
1    141065
0    121184
3     57313
4     47708
2     34554
5     14972
Name: count, dtype: int64

In [23]:
songs_raw = pd.read_csv(DATA_RAW / "songs_raw.csv")
songs_raw.info()
songs_raw.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799 entries, 0 to 798
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        799 non-null    object
 1   Artist      799 non-null    object
 2   Album       799 non-null    object
 3   Popularity  799 non-null    int64 
 4   Lyrics      799 non-null    object
dtypes: int64(1), object(4)
memory usage: 31.3+ KB


Unnamed: 0,Name,Artist,Album,Popularity,Lyrics
0,Imagine - Remastered 2010,John Lennon,Imagine,79,8 ContributorsDiscographie 2021 — Pop & Rock L...
1,A Whiter Shade Of Pale,Procol Harum,A Whiter Shade Of Pale,0,38 ContributorsA Whiter Shade of Pale Lyrics[I...
2,My Sweet Lord,George Harrison,All Things Must Pass (Remastered),0,58 ContributorsMy Sweet Lord Lyrics[Chorus]\nM...
3,God Only Knows - Mono,The Beach Boys,Pet Sounds (Original Mono & Stereo Mix),67,1 ContributorGod Only Knows (mono mix) LyricsI...
4,Bridge Over Troubled Water,Simon & Garfunkel,Bridge Over Troubled Water,72,57 ContributorsBridge Over Troubled Water Lyri...


In [24]:
songs_df = songs_raw.copy()
songs_df["Lyrics"] = songs_df["Lyrics"].astype(str)

junk_markers = [
    "Discographie 2021",
    "Calendrier 2021JANFÉV",
    "Calendrier 2021JANFÃ‰V",
]

junk_mask = False
for marker in junk_markers:
    junk_mask = junk_mask | songs_df["Lyrics"].str.contains(marker, case=False, na=False)

print("Rows flagged as junk:", junk_mask.sum())
songs_df = songs_df[~junk_mask].copy()


Rows flagged as junk: 2


In [28]:
songs_df = songs_df.rename(columns={
    "Name": "title",
    "Artist": "artist",
    "Album": "album",
    "Popularity": "popularity",
    "Lyrics": "lyrics",
})

def clean_lyrics(text: str) -> str:
    t = str(text)
    t = t.replace("\xa0", " ")
    t = t.lower()

    # remove the "X contributors" junk at the start
    t = re.sub(r"^\d+\s*contributors", " ", t)

    # remove "lyrics" header near the beginning
    t = re.sub(r"^lyrics", " ", t)
    t = t.replace("contributors", " ")

    # remove anything between brackets [intro], [chorus], etc.
    t = re.sub(r"\[.*?\]", " ", t)

    # keep only letters/apostrophes/spaces
    t = re.sub(r"[^a-z'\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

songs_df["clean_lyrics"] = songs_df["lyrics"].apply(clean_lyrics)
songs_df["clean_len"] = songs_df["clean_lyrics"].str.len()

songs_df[["title", "artist", "clean_lyrics"]].head()
songs_df["clean_len"].describe()


count      611.000000
mean      2165.793781
std       1609.380298
min        185.000000
25%       1353.500000
50%       1748.000000
75%       2473.500000
max      17877.000000
Name: clean_len, dtype: float64

In [29]:
# Remove lines that are still clearly junk
bad_start = songs_df["clean_lyrics"].str.startswith("contributorsdiscographie")
too_short = songs_df["clean_len"] < 50
too_long = songs_df["clean_len"] > 20000
has_discographie = songs_df["clean_lyrics"].str.contains("discographie", case=False, na=False)

songs_df = songs_df[~(bad_start | too_short | too_long | has_discographie)].copy()

# Final tiny cleanup
songs_df = songs_df.drop(columns=["clean_len"])
songs_df = songs_df.drop_duplicates(subset=["title", "artist"])

songs_df[["title", "artist", "clean_lyrics"]].head()


Unnamed: 0,title,artist,clean_lyrics
1,A Whiter Shade Of Pale,Procol Harum,a whiter shade of pale lyrics we skipped the l...
2,My Sweet Lord,George Harrison,my sweet lord lyrics my sweet lord mmm my lord...
3,God Only Knows - Mono,The Beach Boys,contributorgod only knows mono mix lyricsi may...
4,Bridge Over Troubled Water,Simon & Garfunkel,bridge over troubled water lyrics when you're ...
5,Bohemian Rhapsody - Remastered 2011,Queen,contributortop groups of lyricsbts butter maro...


In [30]:
songs_df.to_csv(DATA_CLEAN / "songs_clean.csv", index=False)
print("Final songs_clean shape:", songs_df.shape)
songs_df["clean_lyrics"].str.len().describe()


Final songs_clean shape: (611, 6)


count      611.000000
mean      2165.793781
std       1609.380298
min        185.000000
25%       1353.500000
50%       1748.000000
75%       2473.500000
max      17877.000000
Name: clean_lyrics, dtype: float64