Imports & paths

In [1]:
import pandas as pd
import re
from pathlib import Path

# Directories
DATA_RAW = Path("../data/raw")
DATA_CLEAN = Path("../data/cleaned")
DATA_CLEAN.mkdir(exist_ok=True)

print("Raw data dir:", DATA_RAW.resolve())
print("Clean data dir:", DATA_CLEAN.resolve())


Raw data dir: D:\Sheridan\First semester\Python\LyriSense\data\raw
Clean data dir: D:\Sheridan\First semester\Python\LyriSense\data\cleaned


Load & inspect raw emotion dataset

In [2]:
emotion_raw = pd.read_csv(DATA_RAW / "emotion_raw.csv")

print("=== Raw Emotion Dataset ===")
emotion_raw.info()
emotion_raw.head()


=== Raw Emotion Dataset ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  416809 non-null  int64 
 1   text        416809 non-null  object
 2   label       416809 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 9.5+ MB


Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


Clean emotion dataset

In [3]:
# Work on a copy
emotion_df = emotion_raw.copy()

# Drop useless index column if it exists
if "Unnamed: 0" in emotion_df.columns:
    emotion_df = emotion_df.drop(columns=["Unnamed: 0"])

# Rename label → emotion_id for clarity
emotion_df = emotion_df.rename(columns={"label": "emotion_id"})

# Basic text cleaning: make sure text is string and trimmed
emotion_df["text"] = (
    emotion_df["text"]
    .astype(str)
    .str.strip()
)

# Remove rows with missing emotion/text
emotion_df = emotion_df.dropna(subset=["text", "emotion_id"])

# Remove very short texts (likely noise)
MIN_TEXT_LEN = 5
emotion_df = emotion_df[emotion_df["text"].str.len() > MIN_TEXT_LEN].copy()

print("=== Clean Emotion Dataset Preview ===")
emotion_df.head()


=== Clean Emotion Dataset Preview ===


Unnamed: 0,text,emotion_id
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


Save + quick stats for emotion dataset

In [4]:
emotion_output_path = DATA_CLEAN / "emotion_clean.csv"
emotion_df.to_csv(emotion_output_path, index=False)

print(f"Saved cleaned emotion data to: {emotion_output_path}")
emotion_df["emotion_id"].value_counts()


Saved cleaned emotion data to: ..\data\cleaned\emotion_clean.csv


emotion_id
1    141065
0    121184
3     57313
4     47708
2     34554
5     14972
Name: count, dtype: int64

Load & inspect raw songs dataset

In [5]:
songs_raw = pd.read_csv(DATA_RAW / "songs_raw.csv")

print("=== Raw Songs Dataset ===")
songs_raw.info()
songs_raw.head()


=== Raw Songs Dataset ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799 entries, 0 to 798
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        799 non-null    object
 1   Artist      799 non-null    object
 2   Album       799 non-null    object
 3   Popularity  799 non-null    int64 
 4   Lyrics      799 non-null    object
dtypes: int64(1), object(4)
memory usage: 31.3+ KB


Unnamed: 0,Name,Artist,Album,Popularity,Lyrics
0,Imagine - Remastered 2010,John Lennon,Imagine,79,8 ContributorsDiscographie 2021 — Pop & Rock L...
1,A Whiter Shade Of Pale,Procol Harum,A Whiter Shade Of Pale,0,38 ContributorsA Whiter Shade of Pale Lyrics[I...
2,My Sweet Lord,George Harrison,All Things Must Pass (Remastered),0,58 ContributorsMy Sweet Lord Lyrics[Chorus]\nM...
3,God Only Knows - Mono,The Beach Boys,Pet Sounds (Original Mono & Stereo Mix),67,1 ContributorGod Only Knows (mono mix) LyricsI...
4,Bridge Over Troubled Water,Simon & Garfunkel,Bridge Over Troubled Water,72,57 ContributorsBridge Over Troubled Water Lyri...


Copy + basic cleanup of songs

In [6]:
songs_df = songs_raw.copy()

# Ensure lyrics are strings
songs_df["Lyrics"] = songs_df["Lyrics"].astype(str)

print("Initial number of songs:", len(songs_df))


Initial number of songs: 799


Remove obvious junk rows (calendar / discographie garbage)

In [7]:
junk_markers = [
    "Discographie 2021",
    "Calendrier 2021JANFÉV",
    "Calendrier 2021JANFÃ‰V",
]

# Flag lyrics that contain any junk marker
junk_mask = False
for marker in junk_markers:
    junk_mask = junk_mask | songs_df["Lyrics"].str.contains(marker, case=False, na=False)

print("Rows flagged as junk:", junk_mask.sum())

songs_df = songs_df[~junk_mask].copy()
print("Remaining after junk removal:", len(songs_df))


Rows flagged as junk: 2
Remaining after junk removal: 797


Rename core columns

In [8]:
songs_df = songs_df.rename(columns={
    "Name": "title",
    "Artist": "artist",
    "Album": "album",
    "Popularity": "popularity",
    "Lyrics": "lyrics",
})

songs_df[["title", "artist", "album"]].head()


Unnamed: 0,title,artist,album
1,A Whiter Shade Of Pale,Procol Harum,A Whiter Shade Of Pale
2,My Sweet Lord,George Harrison,All Things Must Pass (Remastered)
3,God Only Knows - Mono,The Beach Boys,Pet Sounds (Original Mono & Stereo Mix)
4,Bridge Over Troubled Water,Simon & Garfunkel,Bridge Over Troubled Water
5,Bohemian Rhapsody - Remastered 2011,Queen,A Night At The Opera (2011 Remaster)


Define lyrics cleaning function

In [9]:
def clean_lyrics(text: str) -> str:
    """
    Basic lyrics normalization:
    - lowercasing
    - remove non-letter characters
    - remove contributor / bracket sections
    """
    t = str(text)

    # normalize spaces + lowercase
    t = t.replace("\xa0", " ")
    t = t.lower()

    # remove the "X contributors" junk at the start
    t = re.sub(r"^\d+\s*contributors", " ", t)

    # remove "lyrics" header near the beginning
    t = re.sub(r"^lyrics", " ", t)
    t = t.replace("contributors", " ")

    # remove anything between brackets [intro], [chorus], etc.
    t = re.sub(r"\[.*?\]", " ", t)

    # keep only letters/apostrophes/spaces
    t = re.sub(r"[^a-z'\s]", " ", t)

    # collapse multiple spaces
    t = re.sub(r"\s+", " ", t).strip()
    return t


Apply lyrics cleaning + length stats

In [10]:
songs_df["clean_lyrics"] = songs_df["lyrics"].apply(clean_lyrics)
songs_df["clean_len"] = songs_df["clean_lyrics"].str.len()

print("Clean lyrics preview:")
songs_df[["title", "artist", "clean_lyrics"]].head()
songs_df["clean_len"].describe()


Clean lyrics preview:


count       797.000000
mean       6113.750314
std       35946.439656
min           9.000000
25%        1341.000000
50%        1728.000000
75%        2471.000000
max      587331.000000
Name: clean_len, dtype: float64

Filter out remaining junk / extremes

In [11]:
# Heuristics to drop clearly bad rows
bad_start = songs_df["clean_lyrics"].str.startswith("contributorsdiscographie", na=False)
too_short = songs_df["clean_len"] < 50        # almost no text
too_long = songs_df["clean_len"] > 20_000    # insanely long
has_discographie = songs_df["clean_lyrics"].str.contains("discographie", case=False, na=False)

to_drop_mask = bad_start | too_short | too_long | has_discographie

print("Rows to drop based on heuristics:", to_drop_mask.sum())

songs_df = songs_df[~to_drop_mask].copy()
print("Remaining after length/junk filters:", len(songs_df))


Rows to drop based on heuristics: 50
Remaining after length/junk filters: 747


Final touch: drop helper columns + duplicates, save

In [12]:
# We don't need clean_len in the final saved CSV
songs_df = songs_df.drop(columns=["clean_len"])

# Deduplicate by (title, artist)
before_dedup = len(songs_df)
songs_df = songs_df.drop_duplicates(subset=["title", "artist"])
after_dedup = len(songs_df)

print(f"Removed {before_dedup - after_dedup} duplicate rows (title, artist).")
print("Final songs_clean shape:", songs_df.shape)

songs_output_path = DATA_CLEAN / "songs_clean.csv"
songs_df.to_csv(songs_output_path, index=False)

print(f"Saved cleaned songs data to: {songs_output_path}")


# Sanity check on final clean lyrics length
songs_df["clean_lyrics"].str.len().describe()


Removed 136 duplicate rows (title, artist).
Final songs_clean shape: (611, 6)
Saved cleaned songs data to: ..\data\cleaned\songs_clean.csv


count      611.000000
mean      2165.793781
std       1609.380298
min        185.000000
25%       1353.500000
50%       1748.000000
75%       2473.500000
max      17877.000000
Name: clean_lyrics, dtype: float64