In [None]:
# 1_cleaning.ipynb cell 1 — Load + canonicalize anime.csv
from pathlib import Path
import pandas as pd
import re

# Resolve the project root: when the working directory is a folder named
# "notebooks" the root is its parent, otherwise it is the working directory.
_here = Path.cwd()
ROOT = _here if _here.name != "notebooks" else _here.parent

# Raw inputs and processed outputs both live under <ROOT>/data.
_data_dir = ROOT / "data"
RAW = _data_dir / "raw"
PROC = _data_dir / "processed"
# Create the output folder (and any missing parents); no error if it exists.
PROC.mkdir(parents=True, exist_ok=True)

anime_path = RAW / "anime.csv"
print("Loading:", anime_path)
anime = pd.read_csv(anime_path, low_memory=False)

# Auto-detect columns
# `cols` holds the lower-cased header names, index-aligned with anime.columns.
cols = [c.lower() for c in anime.columns]


def _detect_column(*preferences, default=None):
    """Return the original name of the best-matching column, or *default*.

    Each item of *preferences* is a predicate over a lower-cased column name.
    Predicates are tried in the order given, and for each one the columns are
    scanned in file order, so an earlier predicate always wins over a later
    one regardless of where the columns sit in the file.
    """
    for accepts in preferences:
        for lowered, original in zip(cols, anime.columns):
            if accepts(lowered):
                return original
    return default


# Title: first column named exactly "name" or "title"; falls back to the
# first column of the file when neither exists.
title_col = _detect_column(lambda low: low in ("name", "title"), default=anime.columns[0])
# Score: a column named "score" is preferred over one named "rating", so the
# choice no longer depends on which of the two appears first in the file.
# NOTE(review): presumably some exports use "rating" for an age classification
# rather than the numeric score -- confirm against the actual dataset.
score_col = _detect_column(lambda low: low == "score", lambda low: low == "rating")
genre_col = _detect_column(lambda low: "genre" in low)
episodes_col = _detect_column(lambda low: "episode" in low)
# Members: any "member" column is preferred; a "popularity" column is used
# only when no member column exists.
# NOTE(review): popularity may be a rank rather than a count -- verify before
# treating the fallback as interchangeable with a member count.
members_col = _detect_column(lambda low: "member" in low, lambda low: "popularity" in low)

print("Detected -> title:", title_col, "score:", score_col, "genre:", genre_col, 
      "episodes:", episodes_col, "members/popularity:", members_col)

# Normalize title
def normalize_title(s):
    """Return a lower-cased, ASCII-alphanumeric key for a title.

    Missing values (anything ``pd.isna`` reports as NA) map to ``""``.
    Otherwise the value is converted to ``str``, trimmed and lower-cased,
    every character other than ``a-z``, ``0-9`` and space is removed, and
    runs of whitespace are collapsed to single spaces.
    """
    if pd.isna(s):
        return ""
    lowered = str(s).strip().lower()
    # Characters are deleted, not replaced, so "Re:Zero" becomes "rezero".
    kept = re.sub(r'[^a-z0-9 ]', '', lowered)
    words = kept.split()
    return " ".join(words)

# Pass the raw values to normalize_title instead of casting with astype(str)
# first: the cast turns a missing title into the literal text "nan", which
# slips past the pd.isna() guard inside normalize_title and yields the key
# "nan" instead of "". normalize_title already calls str() on each value.
anime["title_norm"] = anime[title_col].apply(normalize_title)

# Standardize numeric fields
# Each canonical column is parsed from its detected source column; values
# that cannot be parsed as numbers become NaN (errors="coerce").
for _target, _source in (
    ("score", score_col),
    ("episodes", episodes_col),
    ("members", members_col),
):
    if _source:
        anime[_target] = pd.to_numeric(anime[_source], errors="coerce")
    else:
        # No source column detected: create an all-NaN float placeholder so the
        # column has the same numeric dtype as the parsed case (a pd.NA scalar
        # would create an object column). With no score, modeling won't run.
        anime[_target] = float("nan")

# Collapse genre to first genre (you'll later one-hot the top k)
if genre_col:
    _genre_raw = anime[genre_col]
    # First comma-separated entry, trimmed.
    _first_genre = _genre_raw.astype(str).str.split(",").str[0].str.strip()
    # astype(str) renders a missing genre as the text "nan"; label those rows
    # (and blank entries) "Unknown", the same value used when there is no
    # genre column at all, instead of creating a bogus "nan" category.
    _has_genre = _genre_raw.notna() & _first_genre.ne("")
    anime["main_genre"] = _first_genre.where(_has_genre, "Unknown")
else:
    anime["main_genre"] = "Unknown"

# Drop duplicate titles, drop rows with no score (for modeling)
# Only rows with a non-empty key are de-duplicated: every title that
# normalizes to "" (e.g. one with no ASCII letters or digits) shares the same
# empty key, so treating them as duplicates would discard all but the first.
_has_key = anime["title_norm"].ne("")
# duplicated() flags every occurrence after the first, so the first row wins.
_is_repeat = anime.duplicated(subset="title_norm") & _has_key
anime = anime.loc[~_is_repeat].reset_index(drop=True)
# Modeling copy: only rows with a usable score.
anime_model = anime.dropna(subset=["score"]).copy()

# Save cleaned base files
# Full de-duplicated table and the scored subset, both as UTF-8 CSV without
# the DataFrame index.
_clean_path = PROC / "anime_clean.csv"
_model_path = PROC / "anime_model_ready.csv"
for _frame, _path in ((anime, _clean_path), (anime_model, _model_path)):
    _frame.to_csv(_path, index=False, encoding="utf-8")

print("Saved:", _clean_path, _model_path)
print("anime_model_ready shape:", anime_model.shape)
# Last expression of the cell: preview of the first three modeling rows.
anime_model.head(3)
