In [66]:
import json
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pprint
import json

In [67]:
# Locations of the raw input files (relative paths); both are read as JSON further down.
metadata_path = "../data/raw/metadata/metadata.json"
lyrics_path = "../data/raw/lyrics/lyrics.json"

In [68]:
# Load the raw data: both inputs are read as UTF-8 encoded JSON documents.
with open(metadata_path, mode="r", encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)

with open(lyrics_path, mode="r", encoding="utf-8") as lyrics_file:
    lyrics = json.load(lyrics_file)

In [69]:
# Clean up the lyrics
def clean_lyrics(text):
    """Normalize raw lyrics text before tokenization.

    Steps, in order:
      1. remove every character that is neither a word character nor whitespace,
      2. collapse each run of whitespace into a single space,
      3. lowercase and strip leading/trailing whitespace.

    Punctuation is removed *before* whitespace is collapsed; otherwise a
    free-standing punctuation mark ("a - b") would leave a double space behind.

    Args:
        text: Raw lyrics string. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None`` or empty.
    """
    if not text:
        return ""
    text = re.sub(r'[^\w\s]', '', text)   # drop punctuation / special characters
    text = re.sub(r'\s+', ' ', text)      # whitespace runs (incl. newlines) -> one space
    return text.lower().strip()

In [70]:
# Tokenization
def tokenize_lyrics(text, language="english"):
    """Split text into word tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.
        language: Language name forwarded to ``word_tokenize``. The default
            ``"english"`` is NLTK's own default, so calls that omit it behave
            exactly as before; pass e.g. ``"german"`` for German text.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    return word_tokenize(text, language=language)

In [71]:
# Remove stop words
# Combined German + English stop-word vocabulary from the NLTK corpus, kept as a
# set so membership checks are cheap.
# NOTE(review): entries containing punctuation (NLTK's English list appears to
# include contractions such as "don't") cannot match tokens whose punctuation
# was already stripped - confirm whether that matters for this pipeline.
stop_words = set(stopwords.words('german')) | set(stopwords.words('english'))
def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, preserving their order.

    Matching is an exact (case-sensitive) membership test against the
    module-level ``stop_words`` set.
    """
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return kept

In [74]:
# Build the DataFrame + run the preprocessing
def create_song_dataframe(metadata, lyrics):
    """Combine song metadata and lyrics into one DataFrame, one row per song.

    For every song the raw lyrics are looked up by ``song_id``, cleaned with
    ``clean_lyrics``, tokenized with ``tokenize_lyrics`` and filtered with
    ``remove_stopwords``.

    Args:
        metadata: Dict with an ``"artists"`` mapping whose values are lists of
            song dicts. Each song dict must provide ``song_id``,
            ``artist_name``, ``song_title``, ``album_name``, ``release_date``,
            ``song_url`` and ``album_url``.
        lyrics: Dict mapping ``song_id`` to a dict with a ``"lyrics"`` entry.
            A missing song, a null entry, or null lyrics are all treated as
            empty lyrics.

    Returns:
        A DataFrame with the columns listed in ``columns`` below. The columns
        are present even when ``metadata`` contains no songs.

    Raises:
        KeyError: If ``metadata`` has no ``"artists"`` key or a song dict lacks
            one of the required fields.
    """
    # Fixed schema, so an empty input still yields a frame with the expected columns.
    columns = [
        "song_id",
        "artist",
        "song_title",
        "album_name",
        "year",
        "song_url",
        "album_url",
        "lyrics_raw",
        "lyrics_clean",
        "lyrics_tokens",
    ]

    rows = []
    for artist_songs in metadata["artists"].values():
        for song in artist_songs:
            sid = song["song_id"]
            # `or` fallbacks cover explicit nulls, which dict.get defaults do not.
            raw_lyrics = (lyrics.get(sid) or {}).get("lyrics") or ""
            cleaned = clean_lyrics(raw_lyrics)
            tokens = tokenize_lyrics(cleaned)
            tokens_no_stop = remove_stopwords(tokens)

            rows.append({
                "song_id": sid,
                "artist": song["artist_name"],
                "song_title": song["song_title"],
                "album_name": song["album_name"],
                # NOTE(review): "year" receives the full release_date value
                # (e.g. "June 22, 2018"), not a bare year - kept as is because
                # downstream consumers are not visible here.
                "year": song["release_date"],
                "song_url": song["song_url"],
                "album_url": song["album_url"],
                "lyrics_raw": raw_lyrics,
                "lyrics_clean": cleaned,
                "lyrics_tokens": tokens_no_stop
            })

    return pd.DataFrame(rows, columns=columns)

In [75]:
# Build the song DataFrame from the loaded metadata and lyrics
df_songs = create_song_dataframe(metadata=metadata, lyrics=lyrics)

In [76]:
# Output folder for the processed data
folder_path = "../data/processed"
os.makedirs(folder_path, exist_ok=True)  # create it if missing; no error if it already exists

In [77]:
# Preview: the first three songs as a list of plain records
preview = df_songs.iloc[:3].to_dict(orient="records")

# Pretty-print as JSON; ensure_ascii=False keeps non-ASCII characters (e.g. umlauts) readable
preview_json = json.dumps(preview, indent=2, ensure_ascii=False)
print(preview_json)

[
  {
    "song_id": "71bb17b7-2473-4802-b952-2a2c1dcff609",
    "artist": "Capital Bra",
    "song_title": "One Night Stand",
    "album_name": "Berlin lebt",
    "year": "June 22, 2018",
    "song_url": "https://genius.com/Capital-bra-one-night-stand-lyrics",
    "album_url": "https://genius.com/albums/Capital-bra/Berlin-lebt",
    "lyrics_raw": "Na-na-na-na Na-na-na, na-na-na Na-na-na-na Na-na-na, na-na-na Ty moja ljubimaja Takaja diwnaja, krasiwaja-ja Sprawjedliwaja, njepobjedimaja Ty moja-ja, baby, ty moja-ja Ty moja ljubimaja Takaja diwnaja, krasiwaja-ja Sprawjedliwaja, njepobjedimaja Ty moja-ja, baby, ty moja-ja Baby, komm, steig ein und wir fahr'n durch die Stadt Das Outfit, das du heute trägst, verwirrt meinen Verstand Ja, sie regt mich manchmal auf und macht mich manchmal krank Aber jeden meiner Gucci-Pullis wäscht sie mit der Hand (Na-na-na) Komm, wir fahr'n ins Hotel, Baby, lass uns einfach rumliegen Wir rauchen Joints, gucken Netflix, ich bin zufrieden Manchma' sind wir la

In [78]:
# Destination file for the processed songs, inside folder_path
output_path = os.path.join(folder_path, "songs.json")

In [79]:
# Save all songs as a JSON array of records; force_ascii=False writes non-ASCII characters unescaped
df_songs.to_json(output_path, orient="records", force_ascii=False, indent=2)

# Report where the file was written
print(f"JSON gespeichert unter: {output_path}")

JSON gespeichert unter: ../data/processed\songs.json
