# 01 — EDA y Preprocesamiento (MAL 2020)

Este cuaderno carga los **CSV** del dataset *Anime Recommendation Database 2020* de Kaggle (o desde disco), 
normaliza columnas, deriva `rating_complete` cuando no está disponible, y guarda derivados en `data/processed/`.

In [42]:
# Import paths: make the repository root importable from this notebook.
import sys
from pathlib import Path

# Start from the resolved working directory; when the notebook is being run
# from a folder named "notebooks", the repository root is one level up.
repo_root = Path().resolve()
if Path.cwd().name == "notebooks":
    repo_root = repo_root.parent

# Put the root first on sys.path so it takes precedence during imports.
sys.path.insert(0, str(repo_root))

print(f"Repo root: {repo_root}")


Repo root: C:\Users\enman\Downloads\COLFONDOS\DMC\anime-recomendation


In [43]:
# Robust CSV loader with multiple fallbacks
import io
from pathlib import Path
import polars as pl
import pandas as pd

def read_csv_robust(path: Path) -> pl.DataFrame:
    """Read a CSV file into an all-string Polars DataFrame, with fallbacks.

    Strategies are tried in order and the first one that succeeds wins:

    1. Polars with its default (UTF-8) decoding.
    2. Polars with ``encoding="utf8-lossy"`` (invalid UTF-8 is replaced).
    3. The raw bytes decoded as Latin-1 and re-parsed by Polars.
    4. The same Latin-1 text parsed by pandas' Python engine, skipping
       malformed rows.

    Every Polars attempt uses ``infer_schema_length=0`` so that no type
    inference happens and columns come back as strings. The pandas fallback
    is configured the same way (``dtype=str``, only empty fields treated as
    missing) so the resulting schema does not depend on which strategy ran.

    Args:
        path: Location of the CSV file on disk.

    Returns:
        The parsed table.

    Raises:
        OSError: If the file cannot be read once both direct Polars
            attempts have failed.
        Exception: Whatever the final pandas fallback raises if it fails too.
    """
    # Strategies 1-2: let Polars open the file itself, first strictly and
    # then with lossy UTF-8 decoding.
    for extra in ({}, {"encoding": "utf8-lossy"}):
        try:
            return pl.read_csv(path, infer_schema_length=0, **extra)
        except Exception:
            # Deliberate best-effort: any failure moves on to the next strategy.
            continue

    # Strategy 3: Latin-1 maps every possible byte to a code point, so this
    # decode cannot raise and needs no error handler.
    txt = Path(path).read_bytes().decode("latin1")
    try:
        return pl.read_csv(io.StringIO(txt), infer_schema_length=0)
    except Exception:
        pass

    # Strategy 4 (last resort): pandas' Python engine tolerates malformed rows.
    # Keep everything as text and only treat empty fields as missing, so the
    # frame matches the all-string schema produced by the Polars strategies.
    frame = pd.read_csv(
        io.StringIO(txt),
        engine="python",
        on_bad_lines="skip",
        dtype=str,
        keep_default_na=False,
        na_values=[""],
    )
    return pl.from_pandas(frame)


In [44]:
import io, zipfile
from pathlib import Path
import polars as pl
import pandas as pd
import kagglehub

# Local folder that receives the re-written CSV copies; created up front.
# NOTE(review): this path is relative, so it resolves against the current
# working directory rather than the repository root - confirm that is intended.
raw_dir = Path("data/raw/mal2020")
raw_dir.mkdir(parents=True, exist_ok=True)

# Kaggle dataset handle passed to kagglehub when downloading.
DS = "hernan4444/anime-recommendation-database-2020"

def _read_csv_from_bytes(raw: bytes) -> pl.DataFrame:
    """Parse in-memory CSV bytes into an all-string Polars DataFrame.

    Strategies are tried in order and the first one that succeeds wins:

    1. Polars with its default (UTF-8) decoding.
    2. Polars with ``encoding="utf8-lossy"``.
    3. The bytes decoded as Latin-1 and re-parsed by Polars.
    4. The same Latin-1 text parsed by pandas' Python engine, skipping
       malformed rows.

    All Polars attempts pass ``infer_schema_length=0`` (no type inference,
    string columns). The pandas fallback is configured to match: values stay
    text and only empty fields count as missing, so the schema is the same
    whichever strategy ends up producing the frame.

    Args:
        raw: Complete contents of a CSV file.

    Returns:
        The parsed table.

    Raises:
        Exception: Whatever the final pandas fallback raises if it fails too.
    """
    # Strategies 1-2: hand the bytes to Polars, strictly and then lossily.
    # A fresh BytesIO is built per attempt so each one starts at offset 0.
    for extra in ({}, {"encoding": "utf8-lossy"}):
        try:
            return pl.read_csv(io.BytesIO(raw), infer_schema_length=0, **extra)
        except Exception:
            # Deliberate best-effort: fall through to the next strategy.
            continue

    # Strategy 3: Latin-1 assigns a code point to every byte value, so the
    # decode itself cannot fail.
    txt = raw.decode("latin1")
    try:
        return pl.read_csv(io.StringIO(txt), infer_schema_length=0)
    except Exception:
        pass

    # Strategy 4 (last resort): pandas' Python engine, dropping bad lines.
    # dtype=str plus the NA settings keep the result consistent with the
    # string-only frames returned by the Polars strategies above.
    frame = pd.read_csv(
        io.StringIO(txt),
        engine="python",
        on_bad_lines="skip",
        dtype=str,
        keep_default_na=False,
        na_values=[""],
    )
    return pl.from_pandas(frame)

def dl_and_read(fname: str, force: bool = False) -> pl.DataFrame:
    """Download one file of the Kaggle dataset, parse it and save a CSV copy.

    The file is fetched through kagglehub on every call; the copy written
    under ``raw_dir`` is not read back by this function.

    Args:
        fname: Name of the file inside the dataset, e.g. ``"anime.csv"``.
        force: Forwarded to kagglehub as ``force_download``.

    Returns:
        The parsed table, as produced by ``_read_csv_from_bytes``.

    Raises:
        FileNotFoundError: If the download is a ZIP archive that contains
            no ``.csv`` member at all.
    """
    # kagglehub keeps the download in its own cache. Per the original author's
    # note, the result may be a ZIP archive even when a .csv was requested.
    local_path = Path(kagglehub.dataset_download(DS, path=fname, force_download=force))

    if zipfile.is_zipfile(local_path):
        # Compare basenames on both sides so sub-folders (in the archive or in
        # fname) do not prevent an exact match.
        wanted = Path(fname).name.lower()
        with zipfile.ZipFile(local_path) as archive:
            members = archive.namelist()
            matches = [m for m in members if Path(m).name.lower() == wanted]
            if not matches:
                # No exact match: fall back to the first CSV in the archive.
                matches = [m for m in members if m.lower().endswith(".csv")]
            if not matches:
                raise FileNotFoundError(
                    f"No CSV member found in archive {local_path} (requested {fname!r})"
                )
            raw = archive.read(matches[0])
    else:
        raw = local_path.read_bytes()

    df = _read_csv_from_bytes(raw)

    # Save a re-encoded copy inside the project folder.
    target = raw_dir / fname
    target.parent.mkdir(parents=True, exist_ok=True)
    df.write_csv(target)
    return df


In [45]:
# Load the three source tables in order; force=False is forwarded to
# kagglehub's force_download by dl_and_read.
anime, animelist, synopsis = (
    dl_and_read(csv_name, force=False)
    for csv_name in ("anime.csv", "animelist.csv", "anime_with_synopsis.csv")
)

# Quick look at each table: shape, first eight column names, two sample rows.
for name, df in zip(("anime", "animelist", "synopsis"), (anime, animelist, synopsis)):
    print(name, df.shape, df.columns[:8])
    print(df.head(2))


anime (17562, 35) ['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name', 'Type', 'Episodes']
shape: (2, 35)
┌────────┬───────────────┬───────┬─────────────────────┬───┬─────────┬─────────┬─────────┬─────────┐
│ MAL_ID ┆ Name          ┆ Score ┆ Genres              ┆ … ┆ Score-4 ┆ Score-3 ┆ Score-2 ┆ Score-1 │
│ ---    ┆ ---           ┆ ---   ┆ ---                 ┆   ┆ ---     ┆ ---     ┆ ---     ┆ ---     │
│ str    ┆ str           ┆ str   ┆ str                 ┆   ┆ str     ┆ str     ┆ str     ┆ str     │
╞════════╪═══════════════╪═══════╪═════════════════════╪═══╪═════════╪═════════╪═════════╪═════════╡
│ 1      ┆ Cowboy Bebop  ┆ 8.78  ┆ Action, Adventure,  ┆ … ┆ 3184.0  ┆ 1357.0  ┆ 741.0   ┆ 1580.0  │
│        ┆               ┆       ┆ Comedy, Dra…        ┆   ┆         ┆         ┆         ┆         │
│ 5      ┆ Cowboy Bebop: ┆ 8.39  ┆ Action, Drama,      ┆ … ┆ 577.0   ┆ 221.0   ┆ 109.0   ┆ 379.0   │
│        ┆ Tengoku no    ┆       ┆ Mystery, Sci-Fi…    ┆   ┆        