## Imports

In [1]:
import os, json, platform, random, numpy as np, pandas as pd
from datetime import datetime

## Setup

In [2]:
# Locations: raw CSVs are read from RAW_DIR, cleaned artefacts are written to CLEAN_DIR.
RAW_DIR = "../data"
CLEAN_DIR = "../data/clean"
os.makedirs(CLEAN_DIR, exist_ok=True)  # no error if the directory already exists

# Seed the stdlib and the NumPy global random generators with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Record the interpreter version alongside the outputs.
print("Python:", platform.python_version())

Python: 3.11.14


## Load

In [3]:
# Input paths; the test split is optional and only loaded if the file exists.
train_csv = os.path.join(RAW_DIR, "train.csv")
test_csv = os.path.join(RAW_DIR, "test.csv")

df_train = pd.read_csv(train_csv)
if os.path.exists(test_csv):
    df_test = pd.read_csv(test_csv)
else:
    df_test = None  # later cells check for None before using the test split

print("Train shape:", df_train.shape)
if df_test is not None:
    print("Test shape :", df_test.shape)

Train shape: (891, 12)
Test shape : (418, 11)


## Basic quality report

In [4]:
def quality_report(df: pd.DataFrame, label="df"):
    """Print a three-part data-quality summary of ``df``.

    Sections, each introduced by a ``== <label> ... ==`` header:

    1. Column dtypes.
    2. Missing values per column (count and percentage of rows), sorted by
       count descending and limited to the first 20 columns.
    3. Number of distinct non-null values for every non-numeric column,
       sorted descending and limited to the first 20 columns.

    Args:
        df: Frame to summarise.
        label: Name used in the section headers.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\n== {label} dtypes ==")
    print(df.dtypes)

    n_missing = df.isna().sum().sort_values(ascending=False)
    missing_table = pd.DataFrame(
        {"n_missing": n_missing, "pct": (n_missing / len(df)) * 100}
    )
    print(f"\n== {label} missing values ==")
    print(missing_table.head(20))

    print(f"\n== {label} categorical cardinality ==")
    # Everything that is not a NumPy numeric dtype is treated as categorical here.
    non_numeric = df.select_dtypes(exclude=[np.number]).columns.tolist()
    cardinality = {}
    for col in non_numeric:
        cardinality[col] = df[col].nunique(dropna=True)
    print(pd.Series(cardinality).sort_values(ascending=False).head(20))

# Summarise the training split, then the test split when one was loaded.
quality_report(df_train, label="train")
if df_test is not None:
    quality_report(df_test, label="test")


== train dtypes ==
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

== train missing values ==
             n_missing        pct
Cabin              687  77.104377
Age                177  19.865320
Embarked             2   0.224467
PassengerId          0   0.000000
Name                 0   0.000000
Pclass               0   0.000000
Survived             0   0.000000
Sex                  0   0.000000
Parch                0   0.000000
SibSp                0   0.000000
Fare                 0   0.000000
Ticket               0   0.000000

== train categorical cardinality ==
Name        891
Ticket      681
Cabin       147
Embarked      3
Sex           2
dtype: int64

== test dtypes ==
PassengerId      int64
Pclass           int64
Name            object
Se

## Schema & presence checks

In [5]:
def check_train_schema(df: pd.DataFrame, target: str) -> None:
    """Validate that a training frame has the columns later steps rely on.

    Hard checks (raise):
      * ``target`` must be a column of ``df``.
      * ``SibSp`` and ``Parch`` must be present.

    Soft checks (print a single warning line, never raise):
      * ``Pclass``, ``Age``, ``SibSp``, ``Parch``, ``Fare`` should be numeric
        when present.
      * ``Sex`` and ``Embarked`` should not be numeric when present.

    Args:
        df: Training frame to inspect. It is not modified.
        target: Name of the label column.

    Raises:
        AssertionError: If ``target`` is not a column of ``df``.
        ValueError: If any required feature-engineering column is missing.
            The missing names are listed in sorted order so the message is
            identical from run to run.
    """
    assert target in df.columns, f"Missing target: {target}"

    # columns your engineer_features needs:
    required_for_eng = {"SibSp", "Parch"}  # add others if used inside engineer_features
    # Sort: iterating a set of strings has no stable order across interpreter
    # runs, which made the error text nondeterministic.
    miss = sorted(c for c in required_for_eng if c not in df.columns)
    if miss:
        raise ValueError(f"Missing columns needed for feature engineering: {miss}")

    # optional soft checks
    num_hint = ["Pclass","Age","SibSp","Parch","Fare"]
    cat_hint = ["Sex","Embarked"]
    is_numeric = pd.api.types.is_numeric_dtype
    warn = [f"{c} not numeric" for c in num_hint if c in df and not is_numeric(df[c])]
    warn += [
        f"{c} looks numeric but expected categorical"
        for c in cat_hint
        if c in df and is_numeric(df[c])
    ]
    if warn:
        print("[train schema warnings]", "; ".join(warn))

def align_for_inference(df_raw: pd.DataFrame, clf) -> pd.DataFrame:
    """Shape a raw frame into the exact column layout ``clf`` was fitted on.

    Steps:
      1. Run ``engineer_features`` on a copy of ``df_raw``.
      2. Drop identifier / free-text columns if they are present.
      3. Add any expected column that is absent (filled with ``pd.NA``), drop
         everything that is not expected, and order columns as expected.
      4. Coerce the known numeric columns with ``pd.to_numeric`` (unparseable
         values become missing) and cast the known categorical columns to the
         pandas ``"string"`` dtype.

    Args:
        df_raw: Raw input rows. A copy is handed to ``engineer_features``.
        clf: Object exposing ``named_steps["preprocess"].feature_names_in_``.
            NOTE(review): presumably a fitted scikit-learn ``Pipeline`` - confirm.

    Returns:
        A new frame whose columns are exactly ``feature_names_in_``, in order.

    NOTE(review): ``engineer_features`` is not defined in the visible part of
    this notebook; it must be in scope before this function is called.
    """
    non_feature_cols = ["PassengerId","Name","Ticket","Cabin","Title"]
    features = engineer_features(df_raw.copy())
    features = features.drop(columns=non_feature_cols, errors="ignore")

    expected = clf.named_steps["preprocess"].feature_names_in_
    # add missing / drop extras / order
    for name in expected:
        if name not in features.columns:
            features[name] = pd.NA
    features = features.loc[:, list(expected)]

    # light dtype coercion
    numeric_cols = {'Pclass','Age','SibSp','Parch','Fare','FamilySize'}
    string_cols = {'Sex','Embarked'}
    for name in numeric_cols.intersection(features.columns):
        features[name] = pd.to_numeric(features[name], errors="coerce")
    for name in string_cols.intersection(features.columns):
        features[name] = features[name].astype("string")
    return features


## Basic quality report

In [6]:
# `quality_report` is defined once, in the first "Basic quality report" cell.
# It is reused here instead of being redefined: a second identical `def`
# silently shadows the first and lets the two copies drift apart.
quality_report(df_train, "train")
if df_test is not None:
    quality_report(df_test, "test")


== train dtypes ==
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

== train missing values ==
             n_missing        pct
Cabin              687  77.104377
Age                177  19.865320
Embarked             2   0.224467
PassengerId          0   0.000000
Name                 0   0.000000
Pclass               0   0.000000
Survived             0   0.000000
Sex                  0   0.000000
Parch                0   0.000000
SibSp                0   0.000000
Fare                 0   0.000000
Ticket               0   0.000000

== train categorical cardinality ==
Name        891
Ticket      681
Cabin       147
Embarked      3
Sex           2
dtype: int64

== test dtypes ==
PassengerId      int64
Pclass           int64
Name            object
Se

## Cleaning & typing (functions to reuse)

In [7]:
def clean_frame(df: pd.DataFrame, *, target: str | None = None,
                     categorical_keep: set[str] = frozenset({"Sex","Embarked"})) -> pd.DataFrame:
    df = df.copy()

    # strip
    obj_like = df.select_dtypes(include=["object","string"]).columns
    for c in obj_like:
        df[c] = df[c].astype("string").str.strip()

    # try numeric on object-like except known categoricals + target
    try_numeric = set(obj_like) - categorical_keep - ({target} if target else set())
    for c in try_numeric:
        # coerce only if >90% of non-null values look numeric
        s = df[c].dropna()
        looks_num = s.str.fullmatch(r"[+-]?\d+(\.\d+)?").mean() if len(s) else 0.0
        if looks_num >= 0.9:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # lock known categoricals
    for c in categorical_keep & set(df.columns):
        df[c] = df[c].astype("string")

    df = df.drop_duplicates()
    return df

# Clean both splits with the same rules; the test split may be absent.
target = "Survived"
df_train_clean = clean_frame(df_train, target=target)
if df_test is None:
    df_test_clean = None
else:
    df_test_clean = clean_frame(df_test, target=target)

## Persist cleaned data

In [8]:
# One timestamp per run, shared by every file written below.
ts = datetime.now().strftime("%Y%m%d-%H%M%S")

train_out = os.path.join(CLEAN_DIR, f"train_clean_{ts}.csv")
df_train_clean.to_csv(path_or_buf=train_out, index=False)

# The test split is written only when it was loaded.
if df_test_clean is not None:
    test_out = os.path.join(CLEAN_DIR, f"test_clean_{ts}.csv")
    df_test_clean.to_csv(path_or_buf=test_out, index=False)

## Save simple schema

In [9]:
# Record the cleaned training frame's column dtypes and row count next to the CSVs.
schema_json = os.path.join(CLEAN_DIR, f"schema_{ts}.json")
schema = dict((col, str(dtype)) for col, dtype in df_train_clean.dtypes.items())
with open(schema_json, "w", encoding="utf-8") as f:
    json.dump(
        {"columns": schema, "rows": len(df_train_clean)},
        f,
        indent=2,
        ensure_ascii=False,
    )

# List every artefact written by this run.
print("\nSaved:")
print(train_out)
if df_test_clean is not None:
    print(test_out)
print(schema_json)


Saved:
../data/clean\train_clean_20251019-211325.csv
../data/clean\test_clean_20251019-211325.csv
../data/clean\schema_20251019-211325.json
