## SETUP

In [1]:
# Brief setup: mount Google Drive; the paths and helpers used by the pipeline follow.
# Mounting is best-effort so the notebook can also run outside Colab, but a
# failed mount is reported instead of being silently swallowed, because the
# later reads under /content/drive would otherwise fail with a confusing error.
try:
    from google.colab import drive
    drive.mount("/content/drive")
except ImportError:
    # google.colab is not importable: not running inside Colab, nothing to mount.
    pass
except Exception as exc:
    print(f"WARNING: could not mount Google Drive: {exc!r}")

import os, re, json
import pandas as pd
import numpy as np

# Paths (I/O): everything lives under the TFG folder on the mounted Drive.
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/TFG"
RAW_CSV  = os.path.join(BASE_DIR, "chordonomicon.csv")
OUT_DIR  = os.path.join(BASE_DIR, "Archivos preprocesamiento")
# Create the output directory up front; exist_ok makes re-running the cell harmless.
os.makedirs(OUT_DIR, exist_ok=True)

# Preprocessing constants
MAX_LEN = 112  # fixed sequence length for the Transformer

# ---- Helpers de limpieza/tokenización ----
def remove_section_tags(s: str) -> str:
    """Delete angle-bracket section tags (``<intro_1>``, ``<chorus_2>``, ...).

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Only leading/trailing whitespace is stripped; whitespace left
    behind between chords by a removed tag is kept as is.
    """
    text = str(s)
    untagged = re.sub(r"<[^>]+>", "", text)
    return untagged.strip()

def tokenize_chords(s: str) -> list:
    """Split a chord string on runs of whitespace and return the chord tokens.

    The input is coerced with ``str()`` first. ``split()`` without arguments
    already ignores leading and trailing whitespace, so an empty or blank
    string yields an empty list.
    """
    text = str(s)
    return text.split()

def normalize_chord_notation(ch: str) -> str:
    """Rewrite a raw chord symbol into the notebook's compact notation.

    * ``min`` -> ``m`` and ``maj`` -> ``M``.
    * An ``s`` that directly follows a note letter (A-G) is a sharp and
      becomes ``#`` ("Cs" -> "C#", "A/Cs" -> "A/C#").

    Only that positional ``s`` is converted. A blanket ``replace("s", "#")``
    also rewrote the letters of suspended chords ("Gsus4" -> "G#u#4"), which
    made every ``sus2``/``sus4`` chord unrecognisable downstream.
    """
    ch = ch.replace("min", "m").replace("maj", "M")
    # (?<=[A-G]) : the "s" must sit right after a note name.
    # (?!us)     : ...and must not be the first letter of "sus" ("Gsus4").
    # In "Gssus4" the first "s" is followed by "su", so it is still a sharp.
    return re.sub(r"(?<=[A-G])s(?!us)", "#", ch)

# ---- Vocabulario simplificado + filtro y manejo de slash chords ----
ROOTS      = ['C','C#','Db','D','D#','Eb','E','F','F#','Gb','G','G#','Ab','A','A#','Bb','B']
QUALITIES  = ['', 'm', 'dim', 'aug']
# 'M7' is the major-seventh spelling produced by normalize_chord_notation
# ("maj" -> "M"); without it every major-seventh chord was filtered out.
# 'maj7' is kept so un-normalised input is still accepted.
EXTENSIONS = ['', '6', '7', 'maj7', 'M7', 'sus2', 'sus4', 'add9']
# Whitelist: every root + quality + extension combination.
valid_chords = {r + q + e for r in ROOTS for q in QUALITIES for e in EXTENSIONS}

def filter_valid_or_simplify_chords(chord_list):
    """Drop the bass note of slash chords and keep only whitelisted chords.

    For each symbol, the part before the first "/" is kept ("Am/G" -> "Am");
    symbols without a slash are unchanged. Anything not in ``valid_chords``
    after that step is discarded. Input order is preserved.
    """
    # split("/")[0] returns the whole string when there is no slash.
    without_bass = (chord.split("/")[0] for chord in chord_list)
    return [chord for chord in without_bass if chord in valid_chords]

# ---- Objetos de acorde (estructura mínima) ----
class ExtendedChord:
    """Minimal parsed chord: root + quality ('m' or '') + extension (the rest).

    Attributes:
        original:  the string passed to the constructor.
        root:      note letter A-G with an optional ``b``/``#``.
        quality:   ``"m"`` for minor chords, ``""`` otherwise.
        extension: whatever follows the root (and the minor marker, if any).

    If the string does not start with a note name, ``root``, ``quality`` and
    ``extension`` are all ``None``.
    """
    def __init__(self, chord_str: str):
        self.original = chord_str
        m = re.match(r"^([A-G][b#]?)(.*)", chord_str)
        if m is None:
            self.root = self.quality = self.extension = None
            return
        self.root, remainder = m.groups()
        # A leading "m" marks a minor chord, unless it is the start of "maj":
        # "Cmaj7" is a major chord and must not be read as C minor + "aj7".
        if remainder.startswith("m") and not remainder.startswith("maj"):
            self.quality = "m"
            self.extension = remainder[1:]
        else:
            self.quality = ""
            self.extension = remainder

    def __repr__(self):
        # Re-assembles the chord symbol from its parsed parts.
        return f"{self.root}{self.quality}{self.extension}"

def to_chord_objects(chord_list):
    """Parse chord strings into ``ExtendedChord`` objects.

    Strings for which the parser finds no root (``root`` is falsy) are
    dropped; the order of the remaining chords is preserved.
    """
    parsed = (ExtendedChord(symbol) for symbol in chord_list)
    return [chord for chord in parsed if chord.root]

# ---- Tonalidad media y transposición al eje de Do ----
NOTES_SHARP = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B']
# Note name -> pitch class (0-11); flat spellings map onto their sharp equivalents.
NOTE_TO_VALUE = {
    **{name: pitch for pitch, name in enumerate(NOTES_SHARP)},
    'Db':1, 'Eb':3, 'Gb':6, 'Ab':8, 'Bb':10,
}
# Pitch class -> note name, always spelled with sharps.
VALUE_TO_NOTE = dict(enumerate(NOTES_SHARP))

def harmonic_mean_val(chord_objs):
    """Return the rounded arithmetic mean of the chords' root pitch classes.

    Roots that are not in ``NOTE_TO_VALUE`` are ignored. The mean is rounded
    with the built-in ``round`` and reduced modulo 12. Returns ``None`` when
    no chord has a known root.
    """
    pitch_classes = []
    for chord in chord_objs:
        pitch = NOTE_TO_VALUE.get(chord.root)
        if pitch is not None:
            pitch_classes.append(pitch)
    if not pitch_classes:
        return None
    mean = sum(pitch_classes) / len(pitch_classes)
    return int(round(mean)) % 12

def transpose_chord_obj(ch: ExtendedChord, interval: int):
    """Return the chord symbol with its root moved ``interval`` semitones down.

    The new root is always spelled with sharps (via ``VALUE_TO_NOTE``); the
    quality and extension are appended unchanged. Returns ``None`` when the
    chord's root is not in ``NOTE_TO_VALUE``.
    """
    root_value = NOTE_TO_VALUE.get(ch.root)
    if root_value is not None:
        # Modulo 12 wraps around the octave.
        shifted_root = VALUE_TO_NOTE[(root_value - interval) % 12]
        return "{}{}{}".format(shifted_root, ch.quality, ch.extension)
    return None

def transpose_song(chord_objs, interval: int):
    """Transpose every chord of a song down by ``interval`` semitones.

    Returns an empty list when ``interval`` is ``None`` (no mean could be
    computed for the song). Chords that cannot be transposed are skipped.
    """
    if interval is None:
        return []
    transposed = []
    for chord in chord_objs:
        symbol = transpose_chord_obj(chord, interval)
        if symbol is not None:
            transposed.append(symbol)
    return transposed

# ---- Vocab y utilidades de codificación ----
def build_vocab(normalised_lists):
    """Build the chord <-> index mappings for a collection of chord sequences.

    The vocabulary is ``[PAD]`` (index 0), ``[UNK]`` (index 1), then every
    distinct chord in sorted order. Returns ``(chord_to_idx, idx_to_chord)``.
    """
    distinct = set()
    for song in normalised_lists:
        distinct.update(song)
    vocab = ['[PAD]', '[UNK]', *sorted(distinct)]
    chord_to_idx = {}
    for position, chord in enumerate(vocab):
        chord_to_idx[chord] = position
    idx_to_chord = {position: chord for chord, position in chord_to_idx.items()}
    return chord_to_idx, idx_to_chord

def encode_and_pad(seq, chord_to_idx, max_len=MAX_LEN):
    """Map chords to vocabulary indices and force the result to ``max_len``.

    Chords missing from ``chord_to_idx`` become the ``[UNK]`` index. Shorter
    sequences are right-padded with the ``[PAD]`` index; longer ones are
    truncated to their first ``max_len`` items.
    """
    token_ids = []
    for chord in seq:
        token_ids.append(chord_to_idx.get(chord, chord_to_idx['[UNK]']))
    missing = max_len - len(token_ids)
    if missing > 0:
        return token_ids + [chord_to_idx['[PAD]']] * missing
    return token_ids[:max_len]

def create_target(seq, pad_idx):
    """Build the next-token target: y_t = x_{t+1}, with ``pad_idx`` appended.

    The result has the same length as ``seq`` for any non-empty input.
    """
    next_tokens = seq[1:]
    return next_tokens + [pad_idx]

def attention_mask_from(seq, pad_idx):
    """Return a 0/1 mask for ``seq``: 1 for a real token, 0 for ``pad_idx``."""
    mask = []
    for token in seq:
        if token != pad_idx:
            mask.append(1)
        else:
            mask.append(0)
    return mask


Mounted at /content/drive


## PREPROCESAMIENTO

In [2]:
# Pipeline: load raw CSV → clean/normalise → chord objects + mean root → transpose to the C axis → export features and Transformer artefacts.

# 1) Load and minimal pruning of rows/columns
df0 = pd.read_csv(RAW_CSV)
# Keep only rows that have all three metadata labels.
df = df0.dropna(subset=["genres", "decade", "main_genre"]).copy()
# errors="ignore": columns missing from the CSV are skipped instead of raising.
df.drop(columns=["release_date","rock_genre","artist_id","spotify_song_id","spotify_artist_id"],
        errors="ignore", inplace=True)

# 2) Cleaning + tokenisation + notation normalisation + whitelist filter
df["chords"] = df["chords"].apply(remove_section_tags).apply(tokenize_chords)
df["chords"] = df["chords"].apply(lambda lst: [normalize_chord_notation(ch) for ch in lst])
df["chords"] = df["chords"].apply(filter_valid_or_simplify_chords)

# 3) To chord objects, mean root value, and transposition (each root is shifted down by that mean)
df["chord_objects"]    = df["chords"].apply(to_chord_objects)
df["harmonic_mean"]    = df["chord_objects"].apply(harmonic_mean_val)
# NOTE(review): songs whose chords were all filtered out get an empty list here
# and are NOT dropped; further down they become all-[PAD] sequences with an
# all-zero attention mask — confirm that is intended for training.
df["normalised_chords"] = df.apply(lambda r: transpose_song(r["chord_objects"], r["harmonic_mean"]), axis=1)

# 4) Base export: all_features.csv
base_cols = ["id","chords","normalised_chords","genres","main_genre","decade"]
df_base = df[base_cols].copy()
df_base.to_csv(os.path.join(OUT_DIR, "all_features.csv"), index=False)

# 5) Vocab + fixed-length encoding (MAX_LEN=112): JSON mappings + final CSV/Parquet
chord_to_idx, idx_to_chord = build_vocab(df["normalised_chords"])
with open(os.path.join(OUT_DIR, "chord_to_idx.json"), "w", encoding="utf-8") as f:
    json.dump(chord_to_idx, f)
# NOTE: JSON object keys are strings, so the integer indices of idx_to_chord
# are written as "0", "1", ...; convert them back to int after reloading.
with open(os.path.join(OUT_DIR, "idx_to_chord.json"), "w", encoding="utf-8") as f:
    json.dump(idx_to_chord, f)

pad_idx = chord_to_idx["[PAD]"]
# Inputs, next-token targets and padding masks all have length MAX_LEN.
df["encoded_chords"]  = df["normalised_chords"].apply(lambda seq: encode_and_pad(seq, chord_to_idx, MAX_LEN))
df["target_chords"]   = df["encoded_chords"].apply(lambda seq: create_target(seq, pad_idx))
df["attention_mask"]  = df["encoded_chords"].apply(lambda seq: attention_mask_from(seq, pad_idx))

final_cols = ["id","normalised_chords","encoded_chords","target_chords","attention_mask","genres","main_genre","decade"]
df_final = df[final_cols].copy()
# The same table is written twice: as CSV and as Parquet.
df_final.to_csv   (os.path.join(OUT_DIR, "all_features_transformer.csv"), index=False)
df_final.to_parquet(os.path.join(OUT_DIR, "all_features_transformer.parquet"), index=False)


  df0 = pd.read_csv(RAW_CSV)
