In [1]:
import os

# Project root used as the working directory for the rest of the notebook.
# Set the NBA_PROJECT_ROOT environment variable to run on another machine
# without editing this cell; the default is the original author's checkout.
project_root = os.environ.get(
    "NBA_PROJECT_ROOT",
    r"C:\Users\Gabriel\OneDrive\Escritorio\6to_Semestre\Machine_Learning\NBA\nba",
)
os.chdir(project_root)
print(os.getcwd())  # Confirm the kernel is now at the project root


C:\Users\Gabriel\OneDrive\Escritorio\6to_Semestre\Machine_Learning\NBA\nba


In [2]:
# Kedro bootstrap robusto: detecta la raíz y evita MissingConfigException
from pathlib import Path
from kedro.framework.project import configure_project
from kedro.framework.session import KedroSession

# Locate the project root whether the notebook runs from /notebooks or from the root:
# the first candidate holding both pyproject.toml and src/nba wins, otherwise
# fall back to the hard-coded checkout location.
cwd = Path.cwd()
project_path = next(
    (
        candidate
        for candidate in (cwd, cwd.parent)
        if (candidate / 'pyproject.toml').exists() and (candidate / 'src' / 'nba').exists()
    ),
    Path(r"C:\Users\Gabriel\OneDrive\Escritorio\6to_Semestre\Machine_Learning\NBA\nba"),
)

# The project must be configured BEFORE the session is created
configure_project("nba")

# Keep the context and catalog around for the following cells
with KedroSession.create(project_path=project_path) as session:
    context = session.load_context()
    catalog = context.catalog

print(f"Kedro OK. project_path={project_path}")

Kedro OK. project_path=C:\Users\Gabriel\OneDrive\Escritorio\6to_Semestre\Machine_Learning\NBA\nba


In [11]:
import os
# Sanity check: does the raw player CSV exist at this path, resolved relative
# to the current working directory? The boolean is shown as the cell output.
os.path.exists("data/01_raw/player.csv")

[3;92mTrue[0m

In [15]:
from pathlib import Path
import pandas as pd
import numpy as np
from IPython.display import display

# 1) Locate the data directory (works with the notebook at the project root or in /notebooks):
#    use the first existing "data" folder next to cwd or its parent, otherwise the
#    hard-coded fallback location.
cwd = Path.cwd()
data_root = next(
    (base / "data" for base in (cwd, cwd.parent) if (base / "data").exists()),
    Path(r"C:\Users\Gabriel\OneDrive\Escritorio\6to_Semestre\Machine_Learning\NBA\nba\data"),
)

# Raw inputs are read from 01_raw; cleaned copies are written under 02_intermediate/cleaned
raw_dir, clean_dir = data_root / "01_raw", data_root / "02_intermediate" / "cleaned"
clean_dir.mkdir(parents=True, exist_ok=True)

# 2) Utilidades de limpieza y resumen
def standardize_column_names(columns):
    """Return snake_case-style versions of the given column labels.

    Each label is converted to ``str`` (so non-string labels, e.g. the integer
    labels of a header-less CSV, do not raise), stripped of surrounding
    whitespace, lower-cased, and has its spaces replaced by underscores.

    Parameters
    ----------
    columns : iterable
        Column labels, typically ``df.columns``.

    Returns
    -------
    list of str
        One normalised name per input label, in the same order.
    """
    return [str(c).strip().lower().replace(" ", "_") for c in columns]

def try_parse_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Convert date-looking object columns of ``df`` to datetimes.

    A column is considered date-like when its (string) name contains
    ``"date"`` or ``"fecha"`` or ends with ``"_dt"``. Only columns whose dtype
    is ``object`` are converted; values that cannot be parsed become ``NaT``
    because ``errors="coerce"`` is used.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # Non-string labels cannot match the name patterns, so skip them instead of
    # raising on the ``in`` / ``endswith`` checks.
    date_like_cols = [
        c for c in df.columns
        if isinstance(c, str) and ("date" in c or "fecha" in c or c.endswith("_dt"))
    ]
    for col in date_like_cols:
        if df[col].dtype == object:
            # ``infer_datetime_format`` is deprecated in pandas 2.x, so it is no longer passed.
            df[col] = pd.to_datetime(df[col], errors="coerce")
    return df

def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. standardise the column names with ``standardize_column_names``;
      2. for every ``object`` column, cast to ``str``, strip surrounding
         whitespace and turn the strings ``""``, ``"nan"`` and ``"None"``
         into ``NaN``;
      3. parse date-like columns with ``try_parse_dates``;
      4. drop fully duplicated rows.

    A frame without any columns is returned (as a copy) without further work.
    """
    cleaned = df.copy()
    # Nothing to normalise when the CSV was empty / had no columns
    if cleaned.shape[1] == 0:
        return cleaned

    cleaned.columns = standardize_column_names(cleaned.columns)

    # Text values that should count as missing after the str cast
    blank_markers = {"": np.nan, "nan": np.nan, "None": np.nan}
    for name in cleaned.columns:
        if cleaned[name].dtype != object:
            continue
        as_text = cleaned[name].astype(str)
        cleaned[name] = as_text.str.strip().replace(blank_markers)

    cleaned = try_parse_dates(cleaned)
    return cleaned.drop_duplicates()

def df_memory_mb(df: pd.DataFrame) -> float:
    """Return the deep memory footprint of ``df`` in mebibytes (bytes / 1024**2)."""
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / (1024 ** 2)

def summarize_dataframe(df: pd.DataFrame) -> dict:
    """Build a small profile of ``df`` as a plain dict.

    Keys: ``rows``, ``cols``, ``missing_total``, ``missing_pct`` (percentage of
    all cells, rounded to 2 decimals), ``duplicate_rows``, ``memory_mb``
    (rounded to 3 decimals), ``num_cols`` (numeric columns) and ``cat_cols``
    (every non-numeric column).

    When the frame has no rows or no columns, every count is reported as zero
    and only ``rows``, ``cols`` and ``memory_mb`` reflect the actual frame.
    """
    n_rows, n_cols = df.shape

    # Degenerate frame: avoid dividing by zero cells below
    if 0 in (n_rows, n_cols):
        return dict(
            rows=n_rows,
            cols=n_cols,
            missing_total=0,
            missing_pct=0.0,
            duplicate_rows=0,
            memory_mb=round(df_memory_mb(df), 3),
            num_cols=0,
            cat_cols=0,
        )

    total_cells = n_rows * n_cols
    n_missing = int(df.isna().to_numpy().sum())
    pct_missing = float((n_missing / total_cells) * 100)
    # Rows that repeat an earlier row (the first occurrence is not counted)
    n_duplicates = int(df.duplicated().sum())
    numeric_count = df.select_dtypes(include=[np.number]).shape[1]
    other_count = df.select_dtypes(exclude=[np.number]).shape[1]

    return dict(
        rows=n_rows,
        cols=n_cols,
        missing_total=n_missing,
        missing_pct=round(pct_missing, 2),
        duplicate_rows=n_duplicates,
        memory_mb=round(df_memory_mb(df), 3),
        num_cols=numeric_count,
        cat_cols=other_count,
    )

# Robustez de lectura CSV (encoding, separador, filas malas)
def robust_read_csv(path: Path) -> pd.DataFrame:
    """Read a CSV file, retrying with progressively more tolerant options.

    Attempts, in order:
      1. default C parser (UTF-8, comma separated);
      2. Python parser with delimiter sniffing (``sep=None``), skipping bad lines;
      3. the same as 2 with explicit encodings ``utf-8``, ``latin-1``, ``cp1252``;
      4. last resort: ``latin-1`` with a fixed comma separator.

    ``low_memory`` is only passed to the C parser: pandas rejects it with
    ``ValueError`` when ``engine="python"``, which would make every fallback
    fail before the file is even opened.

    Parameters
    ----------
    path : Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed frame from the first attempt that succeeds.

    Raises
    ------
    OSError
        Immediately, if the file cannot be opened (no parsing option can help).
    Exception
        Whatever the last-resort attempt raises when every attempt fails.
    """
    tolerant = {"sep": None, "engine": "python", "on_bad_lines": "skip"}
    attempts = [
        {"low_memory": False},  # 1) fast path
        dict(tolerant),         # 2) sniff the delimiter
    ]
    # 3) common encodings. NOTE: latin-1 can decode any byte sequence, so cp1252
    # is only reached when the latin-1 attempt fails for a non-decoding reason.
    attempts.extend({"encoding": enc, **tolerant} for enc in ["utf-8", "latin-1", "cp1252"])

    for options in attempts:
        try:
            return pd.read_csv(path, **options)
        except OSError:
            # Missing or unreadable file: retrying with other parser options is pointless
            raise
        except Exception:
            # Deliberate best effort: fall through to the next, more tolerant attempt
            continue

    # 4) last resort: fixed comma separator and latin-1; any error propagates to the caller
    return pd.read_csv(path, encoding="latin-1", sep=",", engine="python", on_bad_lines="skip")

# 3) Read every CSV under 01_raw (recursively), profile it, clean it and save the cleaned copy
csv_files = list(raw_dir.rglob("*.csv"))
if csv_files:
    summaries = []
    for path in csv_files:
        try:
            df = robust_read_csv(path)

            # A frame with neither rows nor columns has nothing to clean: record it and move on
            if df is None or df.shape == (0, 0):
                summaries.append(
                    dict(
                        file=str(path.relative_to(data_root)),
                        error="Archivo vacío o sin columnas",
                    )
                )
                continue

            summary_before = summarize_dataframe(df)
            df_clean = basic_clean(df)
            summary_after = summarize_dataframe(df_clean)

            # Mirror the raw sub-folder layout under the cleaned directory
            rel = path.relative_to(raw_dir)
            out_path = clean_dir / rel
            out_path.parent.mkdir(parents=True, exist_ok=True)
            df_clean.to_csv(out_path, index=False)

            # One row per file: the before/after profiles side by side plus the output location
            summaries.append(dict(
                [("file", str(path.relative_to(data_root)))]
                + [(f"before_{k}", v) for k, v in summary_before.items()]
                + [(f"after_{k}", v) for k, v in summary_after.items()]
                + [("clean_saved_to", str(out_path.relative_to(data_root)))]
            ))
        except Exception as exc:
            # Keep processing the remaining files; the failure is reported in the summary table
            summaries.append(dict(file=str(path.relative_to(data_root)), error=str(exc)))

    summary_df = pd.DataFrame(summaries)
    preferred_cols = [
        "file",
        "before_rows", "before_cols", "before_missing_total", "before_missing_pct",
        "before_duplicate_rows", "before_memory_mb",
        "after_rows", "after_cols", "after_missing_total", "after_missing_pct",
        "after_duplicate_rows", "after_memory_mb",
        "clean_saved_to", "error"
    ]
    # Show only the preferred columns that are actually present, in the preferred order
    summary_df = summary_df.loc[:, [c for c in preferred_cols if c in summary_df.columns]]
    display(summary_df)
else:
    print(f"No se encontraron CSV en: {raw_dir}")

Unnamed: 0,file,before_rows,before_cols,before_missing_total,before_missing_pct,before_duplicate_rows,before_memory_mb,after_rows,after_cols,after_missing_total,after_missing_pct,after_duplicate_rows,after_memory_mb,clean_saved_to
0,01_raw\common_player_info.csv,4171,33,4481,3.26,0,6.1,4171,33,4490,3.26,0,5.861,02_intermediate\cleaned\common_player_info.csv
1,01_raw\draft_combine_stats.csv,1202,47,28183,49.89,0,1.564,1202,47,28183,49.89,0,1.564,02_intermediate\cleaned\draft_combine_stats.csv
2,01_raw\draft_history.csv,7990,14,38,0.03,0,3.536,7990,14,38,0.03,0,3.536,02_intermediate\cleaned\draft_history.csv
3,01_raw\game.csv,65698,55,432747,11.98,0,59.073,65698,55,432747,11.98,0,55.314,02_intermediate\cleaned\game.csv
4,01_raw\game_info.csv,58053,4,33491,14.42,40,7.022,58013,4,33435,14.41,0,4.14,02_intermediate\cleaned\game_info.csv
5,01_raw\game_summary.csv,58110,14,159511,19.61,40,20.109,58070,14,161464,19.86,0,17.122,02_intermediate\cleaned\game_summary.csv
6,01_raw\inactive_players.csv,110191,9,45,0.0,7,32.256,110184,9,45,0.0,0,33.094,02_intermediate\cleaned\inactive_players.csv
7,01_raw\line_score.csv,58053,43,795333,31.86,40,42.8,58013,43,794787,31.86,0,39.893,02_intermediate\cleaned\line_score.csv
8,01_raw\officials.csv,70971,5,190,0.05,30,9.008,70941,5,190,0.05,0,9.546,02_intermediate\cleaned\officials.csv
9,01_raw\other_stats.csv,28271,26,8878,1.21,10,10.636,28261,26,8870,1.21,0,10.847,02_intermediate\cleaned\other_stats.csv
