In [53]:
# Importe
import json
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pprint
import json
import spacy

In [54]:
# File paths of the raw inputs (song metadata and lyrics)
metadata_path = "../data/raw/metadata/metadata.json"
lyrics_path = "../data/raw/lyrics/lyrics.json"

# Output folder for processed data; created here if it does not exist yet
folder_path = "../data/processed"
os.makedirs(folder_path, exist_ok=True)

In [55]:
# Load the raw data: both inputs are read as UTF-8 encoded JSON
with open(metadata_path, encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)

with open(lyrics_path, encoding="utf-8") as lyrics_file:
    lyrics = json.load(lyrics_file)

PREPROCESSING

In [56]:
# Custom rap ad-libs; every entry is listed exactly once
custom_adlibs = [
    "ey", "yo", "ah", "hm", "uh", "haha", "he", "hey", "ho", "huh", "haa", "hoho",
    "woo", "yea", "yuh", "ay", "oh", "ayyaye", "ya", "ough", "skrr", "bam",
    "pow", "bang", "brr", "choo", "bla", "bluh", "woa", "ye", "ayyy", "ayyyyo",
    "ha", "äh", "ehm",
]
# Stop words: NLTK's German and English lists plus the ad-libs above
stop_words = set(stopwords.words('german') + stopwords.words('english') + custom_adlibs)

In [57]:
# Lyrics clean-up
def clean_lyrics(text):
    """Normalize raw lyrics text for tokenization.

    Special characters (everything that is neither a word character nor
    whitespace) are removed, all whitespace runs including newlines are
    collapsed to a single space, and the result is lower-cased.

    Args:
        text: Raw lyrics string. ``None`` and the empty string are accepted.

    Returns:
        The cleaned, lower-cased string without leading or trailing
        whitespace; ``""`` for empty input.
    """
    if not text:
        return ""
    # Strip special characters BEFORE collapsing whitespace: removing a
    # free-standing symbol ("a - b") would otherwise leave a double space.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

In [58]:
# Removes every token that starts with an ad-lib
def remove_adlib_tokens(tokens, adlibs=None):
    """Drop every token that begins with one of the ad-libs.

    Matching is a prefix test against the lower-cased token, so stretched
    spellings such as "yooo" are removed together with the plain ad-lib.

    NOTE(review): the prefix test also removes regular words that merely
    start with a short ad-lib (e.g. "heute" starts with "he") - confirm
    that this is intended.

    Args:
        tokens: Iterable of token strings.
        adlibs: Optional iterable of ad-lib prefixes; they are compared
            against the lower-cased token, so they should be lower-case.
            Defaults to the notebook-level ``custom_adlibs`` list.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    if adlibs is None:
        adlibs = custom_adlibs
    # str.startswith accepts a tuple of prefixes. Empty strings are skipped
    # because every token starts with "".
    prefixes = tuple(adlib for adlib in adlibs if adlib)
    return [t for t in tokens if not t.lower().startswith(prefixes)]

In [59]:
# Remove stop words (the ad-libs are part of the stop-word set)
def remove_stopwords(tokens):
    """Return the tokens that are not contained in ``stop_words``, order preserved."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [60]:
# Tokenization
def tokenize_lyrics(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

In [61]:
# Lemmatization: load the spaCy pipeline "de_core_news_sm" once at notebook level
nlp = spacy.load("de_core_news_sm")

def lemmatize_tokens(tokens):
    """Return the lemmas of ``tokens`` as produced by the spaCy pipeline ``nlp``.

    The tokens are joined with single spaces and processed as one document,
    so spaCy applies its own tokenization; the returned list therefore need
    not align one-to-one with the input tokens.
    """
    joined = " ".join(tokens)
    lemmas = []
    for tok in nlp(joined):
        if not tok.text.strip():
            continue  # skip whitespace-only tokens
        lemmas.append(tok.lemma_)
    return lemmas

In [62]:
# Build the DataFrame and run the preprocessing
def create_song_dataframe(metadata, lyrics):
    """Create one DataFrame row per song with preprocessed lyrics.

    Per-song pipeline:
    clean -> tokenize -> remove ad-lib tokens -> remove stop words -> lemmatize.

    Args:
        metadata: Dict with an ``"artists"`` mapping whose values are lists
            of song dicts. The keys read from each song are ``song_id``,
            ``artist_name``, ``song_title``, ``album_name``,
            ``release_date``, ``song_url`` and ``album_url``.
        lyrics: Dict mapping a song id to a dict with a ``"lyrics"`` entry.
            Songs without an entry (or with empty/null lyrics) are
            processed as empty text.

    Returns:
        pandas.DataFrame with the columns ``song_id``, ``artist_name``,
        ``song_title``, ``album_name``, ``year``, ``song_url``,
        ``album_url``, ``lyrics_clean``, ``lyrics_tokens`` and
        ``lyrics_lemmas``.

    Raises:
        KeyError: If ``metadata`` has no ``"artists"`` key or a song lacks
            one of the keys listed above.
    """
    rows = []
    for artist_songs in metadata["artists"].values():
        for song in artist_songs:
            sid = song["song_id"]
            # A missing entry and an explicit null both fall back to "".
            raw_lyrics = (lyrics.get(sid) or {}).get("lyrics") or ""
            cleaned = clean_lyrics(raw_lyrics)

            tokens = tokenize_lyrics(cleaned)
            tokens_no_adlibs = remove_adlib_tokens(tokens)
            # Continue with the ad-lib-filtered tokens so that the filter
            # result is actually used by the following steps.
            tokens_no_stop = remove_stopwords(tokens_no_adlibs)
            lemmas = lemmatize_tokens(tokens_no_stop)

            rows.append({
                "song_id": sid,
                "artist_name": song["artist_name"],
                "song_title": song["song_title"],
                "album_name": song["album_name"],
                # The "year" column carries the raw release_date value.
                "year": song["release_date"],
                "song_url": song["song_url"],
                "album_url": song["album_url"],
                "lyrics_clean": cleaned,
                "lyrics_tokens": tokens_no_stop,
                "lyrics_lemmas": lemmas
            })

    return pd.DataFrame(rows)

In [63]:
# Build the song DataFrame from the loaded metadata and lyrics
df_songs = create_song_dataframe(metadata, lyrics)

In [64]:
# Preview: the first three songs as a list of record dicts
preview = df_songs.head(3).to_dict(orient="records")

# Print as indented JSON; ensure_ascii=False keeps non-ASCII characters unescaped
print(json.dumps(preview, indent=2, ensure_ascii=False))

[
  {
    "song_id": "63d930f4-022f-4212-bf8f-ff3e6b343b13",
    "artist_name": "DJ Reckless",
    "song_title": "Banger Spieler Atzen Dealer",
    "album_name": "Unknown",
    "year": "Unknown",
    "song_url": "https://genius.com/Dj-reckless-banger-spieler-atzen-dealer-lyrics",
    "album_url": null,
    "lyrics_clean": "banger spieler atzen dealer anzug von fila und das bare ist lila banger spieler atzen dealer anzug von fila und das bare ist lila banger spieler atzen dealer anzug von fila und das bare ist lila banger spieler atzen dealer anzug von fila und das bare ist lila reckless mein name bass ohne gnade pushe heiße ware ich mache das bare yeah stell mir bеsser keine fragеn scheine sind lila digga schreibe schwarze zahlen mache schnelles cash mit dem mega mega bass die nutten lieben es und neider schieben hass yo yo du weißt genau dass der scheiß bangt shababs gehen bis der club abbrennt corner an der avenue tempelhofer damm wo die jungs im viertel hängen und die schwarzen benz

In [65]:
# Target file inside the processed-data folder
output_path = os.path.join(folder_path, "processed_songs.json")

In [66]:
# Save all songs as a JSON array of records; force_ascii=False keeps non-ASCII characters unescaped
df_songs.to_json(output_path, orient="records", force_ascii=False, indent=2)

print(f"JSON gespeichert unter: {output_path}")

JSON gespeichert unter: ../data/processed\processed_songs.json
