In [4]:
import csv
import os
import re
import csv
from collections import Counter
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import genius_utils_v3 as genius_utils

In [12]:
# Build the Genius API client.
# Prerequisite: a ".env" file with the same contents as ".env.example", but with your own API key filled in.
# A free API key is available from https://genius.com/api-clients
genius = genius_utils.setup_genius_client()

# Collect the clean/explicit pairs for one sample artist
results = genius_utils.get_clean_explicit_pairs(
    genius,
    "Eminem",
    max_songs=100,
    debug=False,
)

# Summarize every pair: base title first, then items 0 and 2 of each version
# (these print as the song title and its Genius URL in the captured output).
for r in results:
    print("\n {}".format(r['base_title']))
    print("  - Explicit: {} → {}".format(r['explicit'][0], r['explicit'][2]))
    print("  - Clean:    {} → {}".format(r['clean'][0], r['clean'][2]))


  song = Song(self, song_info, lyrics)
INFO:utils.genius_utils_v3:Scanning: Rap God
  song = Song(self, song_info, lyrics)
INFO:utils.genius_utils_v3:Scanning: Lose Yourself
INFO:utils.genius_utils_v3:Scanning: Mockingbird
INFO:utils.genius_utils_v3:Scanning: Without Me
INFO:utils.genius_utils_v3:Scanning: The Real Slim Shady
INFO:utils.genius_utils_v3:Scanning: Stan
INFO:utils.genius_utils_v3:Scanning: Killshot
INFO:utils.genius_utils_v3:Scanning: Godzilla
INFO:utils.genius_utils_v3:Scanning: Houdini
INFO:utils.genius_utils_v3:Scanning: The Monster
INFO:utils.genius_utils_v3:Scanning: Superman
INFO:utils.genius_utils_v3:Scanning: Lucky You
INFO:utils.genius_utils_v3:Scanning: ’Till I Collapse
INFO:utils.genius_utils_v3:Scanning: The Ringer
INFO:utils.genius_utils_v3:Scanning: Love the Way You Lie
INFO:utils.genius_utils_v3:Scanning: Venom
INFO:utils.genius_utils_v3:Scanning: River
INFO:utils.genius_utils_v3:Scanning: 8 Mile: B-Rabbit vs Papa Doc
INFO:utils.genius_utils_v3:Scanning: No


 Rap God
  - Explicit: Rap God → https://genius.com/Eminem-rap-god-lyrics
  - Clean:    Rap God (Clean) → https://genius.com/Eminem-rap-god-clean-lyrics

 Without Me
  - Explicit: Without Me → https://genius.com/Eminem-without-me-lyrics
  - Clean:    Without Me (Clean Version) → https://genius.com/Eminem-without-me-clean-version-lyrics

 The Real Slim Shady
  - Explicit: The Real Slim Shady → https://genius.com/Eminem-the-real-slim-shady-lyrics
  - Clean:    The Real Slim Shady (Clean) → https://genius.com/Eminem-the-real-slim-shady-clean-lyrics

 Stan
  - Explicit: Stan → https://genius.com/Eminem-stan-lyrics
  - Clean:    Stan (Clean) → https://genius.com/Eminem-stan-clean-lyrics

 Godzilla
  - Explicit: Godzilla → https://genius.com/Eminem-godzilla-lyrics
  - Clean:    Godzilla (Super Clean) → https://genius.com/Eminem-godzilla-super-clean-lyrics

 Superman
  - Explicit: Superman → https://genius.com/Eminem-superman-lyrics
  - Clean:    Superman (Clean Radio Edit) → https://genius.

In [13]:
for pair in results:
    # A pair is only written out when both of its versions are present
    if pair.get("explicit") and pair.get("clean"):
        title_base = pair["base_title"]
        versions = ("explicit", "clean")

        # Item 1 of each version entry is the raw lyrics text
        raw = {version: pair[version][1] for version in versions}

        # First pass: clean the text and strip headers (explicit, then clean)
        intermediate = {
            version: genius_utils.clean_lyrics_text(raw[version])
            for version in versions
        }

        # Second pass: drop punctuation and blank lines (explicit, then clean)
        cleaned = {
            version: genius_utils.remove_punctuation_and_blank_lines(intermediate[version])
            for version in versions
        }

        # Persist raw + cleaned text for each version as .txt files
        for version in versions:
            genius_utils.save_raw_and_cleaned_lyrics(
                title_base, version, raw[version], cleaned[version]
            )

        # Progress marker
        print(f"✅ Saved: {title_base}")

✅ Saved: Rap God
✅ Saved: Without Me
✅ Saved: The Real Slim Shady
✅ Saved: Stan
✅ Saved: Godzilla
✅ Saved: Superman
✅ Saved: Beautiful
✅ Saved: My Name Is
✅ Saved: Shake That
✅ Saved: Marshall Mathers
✅ Saved: FACK
✅ Saved: Role Model
✅ Saved: Guilty Conscience
✅ Saved: Just Don’t Give a Fuck
✅ Saved: Soldier


In [14]:
def merge_compounds(reference_tokens, target_tokens, max_compound_len=4):
    """
    Collapse runs of adjacent tokens in ``target_tokens`` into a single compound
    token whenever their concatenation appears in ``reference_tokens``.

    At each position the widest window (up to ``max_compound_len`` tokens) is
    tried first, down to a window of 2. After a merge the same position is
    examined again, so a freshly merged token can be merged further.

    Args:
        reference_tokens: Iterable of tokens that defines which compounds exist.
        target_tokens: List of tokens to rewrite. It is modified IN PLACE.
        max_compound_len: Largest number of adjacent tokens to try to join.

    Returns:
        The same ``target_tokens`` list object, after merging.
    """
    # Hash the reference vocabulary once: probing the raw list for every
    # window at every position costs a full scan per probe.
    known_compounds = set(reference_tokens)

    i = 0
    while i < len(target_tokens):
        # Never try a window that runs past the end of the list.
        widest = min(max_compound_len, len(target_tokens) - i)
        for n in range(widest, 1, -1):
            candidate = ''.join(target_tokens[i:i + n])
            if candidate in known_compounds:
                target_tokens[i:i + n] = [candidate]
                # Stay on position i so the merged token can grow further.
                break
        else:
            # No window merged here; move on to the next token.
            i += 1
    return target_tokens

def normalize_and_export(explicit_path, clean_path):
    """
    Read two whitespace-tokenized lyric files, align their compound words with
    ``merge_compounds``, and write each result next to its input with a
    ``_normalized.txt`` suffix replacing the original extension.

    The clean tokens are merged against the untouched explicit tokens first;
    the explicit tokens are then merged against the already-merged clean tokens.

    Args:
        explicit_path: Path of the explicit-version text file.
        clean_path: Path of the clean-version text file.

    Returns:
        None. Prints the two output paths.
    """
    # Load both inputs (explicit first, then clean) as token lists
    loaded = []
    for source_path in (explicit_path, clean_path):
        with open(source_path, 'r', encoding='utf-8') as handle:
            loaded.append(handle.read().split())
    explicit_tokens, clean_tokens = loaded

    # Cross-normalize: each side uses the other as its compound reference
    clean_tokens = merge_compounds(explicit_tokens, clean_tokens)
    explicit_tokens = merge_compounds(clean_tokens, explicit_tokens)

    # Derive the output names from the input names
    explicit_stem, _ = os.path.splitext(explicit_path)
    clean_stem, _ = os.path.splitext(clean_path)
    explicit_out = explicit_stem + "_normalized.txt"
    clean_out = clean_stem + "_normalized.txt"

    # Write the normalized token streams, space-separated
    for destination, tokens in ((explicit_out, explicit_tokens), (clean_out, clean_tokens)):
        with open(destination, 'w', encoding='utf-8') as handle:
            handle.write(' '.join(tokens))

    print(f"Normalized files saved as:\n  {explicit_out}\n  {clean_out}")


In [37]:
# Base titles of the songs to normalize and compare (one per line for easy editing)
test_songs = [
    "Beautiful",
    "FACK",
    "Guilty Conscience",
    "Just Don’t Give a Fuck",
    "Marshall Mathers",
    "My Name Is",
    "Role Model",
    "Shake That",
    "Soldier",
    "Godzilla",
    "Stan",
    "Superman",
    "Rap God",
    "The Real Slim Shady",
    "Without Me",
]

In [38]:
# Normalize the cleaned explicit/clean lyric files of every song in the list
for test_song in test_songs:
    normalize_and_export(
        explicit_path=f"lyrics/clean/{test_song}__explicit_cleaned.txt",
        clean_path=f"lyrics/clean/{test_song}__clean_cleaned.txt",
    )

Normalized files saved as:
  lyrics/clean/Beautiful__explicit_cleaned_normalized.txt
  lyrics/clean/Beautiful__clean_cleaned_normalized.txt
Normalized files saved as:
  lyrics/clean/FACK__explicit_cleaned_normalized.txt
  lyrics/clean/FACK__clean_cleaned_normalized.txt
Normalized files saved as:
  lyrics/clean/Guilty Conscience__explicit_cleaned_normalized.txt
  lyrics/clean/Guilty Conscience__clean_cleaned_normalized.txt
Normalized files saved as:
  lyrics/clean/Just Don’t Give a Fuck__explicit_cleaned_normalized.txt
  lyrics/clean/Just Don’t Give a Fuck__clean_cleaned_normalized.txt
Normalized files saved as:
  lyrics/clean/Marshall Mathers__explicit_cleaned_normalized.txt
  lyrics/clean/Marshall Mathers__clean_cleaned_normalized.txt
Normalized files saved as:
  lyrics/clean/My Name Is__explicit_cleaned_normalized.txt
  lyrics/clean/My Name Is__clean_cleaned_normalized.txt
Normalized files saved as:
  lyrics/clean/Role Model__explicit_cleaned_normalized.txt
  lyrics/clean/Role Model_

In [39]:
def clean_diff_variants(diffs):
    """
    Strip spelling-level noise out of ``(explicit, clean, aligned_word)`` diffs.

    Within each diff, a word is dropped from both sides when the two sides
    differ only by:
    - a trailing "in" vs "ing" (e.g. killin vs killing)
    - a trailing "ed" vs the bare stem (e.g. asked vs ask)
    - one of the listed confusable pairs (e.g. then vs than, woman vs women)

    Diffs whose explicit side is blank are discarded, as are diffs with
    nothing left on either side after the removals.

    Prints how many diffs were discarded and returns the surviving list.
    """
    confusable_pairs = (
        ("then", "than"),
        ("your", "youre"),
        ("there", "their"),
        ("its", "its"),  # since apostrophes are removed
        ("affect", "effect"),
        ("to", "too"),
        ("of", "off"),
        ("woman", "women"),
        ("womans", "womens"),
        ("man", "men"),
        ("mans", "mens"),
    )

    survivors = []

    for explicit_phrase, clean_phrase, aligned_word in diffs:
        exp_tokens = explicit_phrase.split()
        cln_tokens = clean_phrase.split()

        # Nothing on the explicit side (empty or whitespace only): discard
        if len(exp_tokens) == 0:
            continue

        exp_vocab, cln_vocab = set(exp_tokens), set(cln_tokens)
        drop_exp, drop_cln = set(), set()

        # Suffix rules are symmetric, so run them once per direction.
        directions = (
            (exp_vocab, cln_vocab, drop_exp, drop_cln),
            (cln_vocab, exp_vocab, drop_cln, drop_exp),
        )
        for own_vocab, other_vocab, drop_own, drop_other in directions:
            for word in own_vocab:
                # "-in" on this side, "-ing" on the other
                if word.endswith("in"):
                    with_g = word + "g"
                    if with_g in other_vocab:
                        drop_own.add(word)
                        drop_other.add(with_g)
                # "-ed" on this side, bare stem on the other (naive stripping)
                if word.endswith("ed"):
                    stem = word[:-2]
                    if stem in other_vocab:
                        drop_own.add(word)
                        drop_other.add(stem)

        # Confusable spellings, checked in either orientation
        for first, second in confusable_pairs:
            if first in exp_vocab and second in cln_vocab:
                drop_exp.add(first)
                drop_cln.add(second)
            elif second in exp_vocab and first in cln_vocab:
                drop_exp.add(second)
                drop_cln.add(first)

        # Rebuild both phrases without the dropped words, preserving order
        remaining_exp = ' '.join(tok for tok in exp_tokens if tok not in drop_exp)
        remaining_cln = ' '.join(tok for tok in cln_tokens if tok not in drop_cln)

        if remaining_exp or remaining_cln:
            survivors.append((remaining_exp, remaining_cln, aligned_word))

    print(f"✅ Filtered {len(diffs) - len(survivors)} trivial or empty diffs")
    return survivors


In [40]:
def tokenize(text):
    """Lowercase ``text`` and return its runs of word characters, in order."""
    lowered = text.lower()
    return [hit.group() for hit in re.finditer(r"\w+", lowered)]

def jaccard_similarity(list1, list2):
    """
    Jaccard index of two collections treated as sets: the number of shared
    distinct items divided by the number of distinct items overall.
    Returns 0 when both collections are empty.
    """
    first, second = set(list1), set(list2)
    combined = first.union(second)
    if not combined:
        return 0
    shared = first.intersection(second)
    return len(shared) / len(combined)

def find_realign_point(exp_tokens, clean_tokens, exp_idx, clean_idx, window=20):
    """
    Find the next position at which the explicit and clean token streams agree
    again after a mismatch at ``(exp_idx, clean_idx)``.

    Two strategies are tried in order:
    1. A shared bigram: for each of the next ``window`` explicit positions,
       look for the same two consecutive tokens in the clean stream at the
       matching offset shifted by -2..+2.
    2. A shared single word within ``window`` tokens on both sides, accepted
       only when the (up to) 10 tokens following it on each side have a
       Jaccard similarity above 0.3.

    Clean-side candidates before ``clean_idx`` are never considered: the small
    negative shifts could otherwise produce a negative index (which Python
    resolves from the END of the list) or rewind the clean stream over tokens
    that were already consumed.

    Args:
        exp_tokens: Explicit-version tokens.
        clean_tokens: Clean-version tokens.
        exp_idx: Current (mismatching) index into ``exp_tokens``.
        clean_idx: Current (mismatching) index into ``clean_tokens``.
        window: How many positions ahead to search.

    Returns:
        ``(explicit_index, clean_index, matched_text)`` for the realignment
        point, or ``(None, None, None)`` when none is found.
    """
    # Pass 1: two-token match with a small shift on the clean side
    for offset in range(window):
        e_pos = exp_idx + offset
        if e_pos + 1 >= len(exp_tokens):
            # No explicit bigram starts here or at any later offset.
            break
        exp_pair = (exp_tokens[e_pos], exp_tokens[e_pos + 1])
        for delta in range(-2, 3):  # small shifts
            c_pos = clean_idx + offset + delta
            if c_pos < clean_idx:
                # Would move backwards (or wrap around via a negative index).
                continue
            if c_pos + 1 >= len(clean_tokens):
                continue
            if exp_pair == (clean_tokens[c_pos], clean_tokens[c_pos + 1]):
                return (e_pos, c_pos, f"{exp_pair[0]} {exp_pair[1]}")

    # Pass 2: fall back to a one-word match confirmed by the following context
    for offset in range(window):
        e_pos = exp_idx + offset
        if e_pos >= len(exp_tokens):  # prevent out-of-bounds
            break
        word = exp_tokens[e_pos]

        for j in range(clean_idx, min(clean_idx + window, len(clean_tokens))):
            if clean_tokens[j] != word:
                continue
            # Slices clamp at the end of each list, so short tails are fine.
            exp_context = exp_tokens[e_pos + 1:e_pos + 11]
            clean_context = clean_tokens[j + 1:j + 11]
            if jaccard_similarity(exp_context, clean_context) > 0.3:
                return (e_pos, j, word)

    return (None, None, None)


def is_trivial_diff(exp, clean):
    """
    Report whether a diff is not worth recording: both sides are empty, or
    either side is exactly one of the listed sound-effect/interjection tokens.
    """
    noise_tokens = {"pfft", "pft", "bzz", "kshh", "eugh", "ugh", "ah", "huh", "shhh", "sh"}
    if not exp and not clean:
        return True
    return exp in noise_tokens or clean in noise_tokens

def strict_compare_lyrics(explicit_path, clean_path, out_csv="my_diffs.csv", song_name=None):
    """
    Walk the explicit and clean lyric files token by token, collect the spans
    where they disagree, write them to a CSV file and print them.

    Args:
        explicit_path: Path of the explicit-version text file.
        clean_path: Path of the clean-version text file.
        out_csv: CSV file to (over)write with the columns
            "Explicit", "Clean", "Aligned At".
        song_name: Label printed above the differences. When omitted it is
            taken from the explicit file name, up to the first "__"
            (e.g. "Beautiful__explicit_cleaned_normalized.txt" -> "Beautiful"),
            so the function no longer depends on a notebook-level loop
            variable being defined.

    Returns:
        List of ``(explicit_text, clean_text, aligned_at)`` tuples after
        filtering through ``is_trivial_diff`` and ``clean_diff_variants``.
    """
    with open(explicit_path, "r", encoding="utf-8") as f:
        explicit = tokenize(f.read())
    with open(clean_path, "r", encoding="utf-8") as f:
        clean = tokenize(f.read())

    if song_name is None:
        song_name = os.path.basename(explicit_path).split("__")[0]

    i = j = 0
    diffs = []

    while i < len(explicit) and j < len(clean):
        if explicit[i] == clean[j]:
            i += 1
            j += 1
            continue

        # Streams diverge here: look ahead for the next point where they agree
        match_i, match_j, matched_word = find_realign_point(explicit, clean, i, j)

        if match_i is None:
            # NOTE(review): no realignment found, so the comparison stops and
            # whatever follows in either file is not reported.
            break

        # Everything skipped on each side up to the realignment point
        explicit_diff = ' '.join(explicit[i:match_i])
        clean_diff = ' '.join(clean[j:match_j])

        if not is_trivial_diff(explicit_diff, clean_diff):
            diffs.append((explicit_diff, clean_diff, matched_word))

        i = match_i
        j = match_j

    # NOTE(review): tokens left over once one stream is exhausted are not reported.
    diffs = clean_diff_variants(diffs)
    print(f"✅ Comparison complete. Total differences found: {len(diffs)}")
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Explicit", "Clean", "Aligned At"])
        writer.writerows(diffs)

    # Print differences
    print("Now looking at song:", song_name)
    for exp_text, clean_text, aligned_at in diffs:
        print(f"EXPLICIT: {exp_text}\nCLEAN:    {clean_text}\nALIGN:    {aligned_at}\n")

    return diffs


In [41]:
# Compare the normalized explicit/clean files of every song in the list.
# The loop variable stays named `test_song`: the comparison function's output refers to it.
for test_song in test_songs:
    strict_compare_lyrics(
        f"./lyrics/clean/{test_song}__explicit_cleaned_normalized.txt",
        f"./lyrics/clean/{test_song}__clean_cleaned_normalized.txt",
    )

✅ Filtered 0 trivial or empty diffs
✅ Comparison complete. Total differences found: 9
Now looking at song: Beautiful
EXPLICIT: via mtv
CLEAN:    
ALIGN:    lately been

EXPLICIT: fucking
CLEAN:    
ALIGN:    depressed just

EXPLICIT: fucked
CLEAN:    f
ALIGN:    just stay

EXPLICIT: fucked
CLEAN:    f
ALIGN:    just stay

EXPLICIT: fuckin
CLEAN:    f
ALIGN:    man servant

EXPLICIT: ass
CLEAN:    
ALIGN:    laugh every

EXPLICIT: goddamn
CLEAN:    
ALIGN:    unfortunately am

EXPLICIT: fucked
CLEAN:    f
ALIGN:    just stay

EXPLICIT: fucked
CLEAN:    f
ALIGN:    just stay

✅ Filtered 0 trivial or empty diffs
✅ Comparison complete. Total differences found: 0
Now looking at song: FACK
✅ Filtered 0 trivial or empty diffs
✅ Comparison complete. Total differences found: 11
Now looking at song: Guilty Conscience
EXPLICIT: twentythree
CLEAN:    23
ALIGN:    years old

EXPLICIT: damn
CLEAN:    
ALIGN:    going fucking

EXPLICIT: gotta
CLEAN:    cant
ALIGN:    take this

EXPLICIT: shit
CLEAN: 