# 02: Data Preprocessing  
**Objective:** Split the data, extract text features, encode categorical features, and export the final arrays.


In [None]:
import pandas as pd
from recruitment_fairness.data.loader import load_trials
from recruitment_fairness.data.preprocess import (
    split_trials,
    vectorize_conditions,
    encode_phases,
    save_npz
)


In [None]:
# Load the trials table through the project loader, print how many records it
# holds (len(df)), and leave df.head() as the last expression so the notebook
# displays a preview of the first rows.
df = load_trials()
print("Total trials:", len(df))
df.head()


In [None]:
# Partition the loaded trials into train / validation / test subsets,
# requesting val_size=0.1 and test_size=0.2 from split_trials, then report
# the size of each resulting subset.
train, val, test = split_trials(
    df,
    val_size=0.1,
    test_size=0.2,
)
print(f"Splits — train: {len(train)}, val: {len(val)}, test: {len(test)}")


In [None]:
# Turn the "Condition" text column of each split into a feature matrix.
# The splits are passed in train, val, test order; the fourth return value is
# kept as `tfidf` for later reuse.
X_tr, X_va, X_te, tfidf = vectorize_conditions(
    *(split["Condition"] for split in (train, val, test))
)
# Column count of the training matrix, i.e. the number of text features.
print("TF-IDF feature count:", X_tr.shape[1])


In [None]:
# Encode the categorical "Phase" column of each split (train, val, test order).
# The fourth return value is kept as `phase_enc`.
Xp_tr, Xp_va, Xp_te, phase_enc = encode_phases(
    *(split["Phase"] for split in (train, val, test))
)
# Width of the encoded training matrix.
print("Phase one-hot dims:", Xp_tr.shape[1])


In [None]:
# NOTE(review): this cell performs the same encode_phases call, on the same
# inputs and into the same names, as the cell directly before it. It looks
# like a copy-paste leftover -- confirm and remove, or replace it with the
# step that was meant to go here.
Xp_tr, Xp_va, Xp_te, phase_enc = encode_phases(
    *[frame["Phase"] for frame in (train, val, test)]
)
print("Phase one-hot dims:", Xp_tr.shape[1])


In [None]:
# Label-encode the "OverallStatus" target column of every split.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Fit on the labels of all three splits rather than on train alone:
# LabelEncoder.transform raises ValueError for a label it was not fitted on,
# so a status that happens to fall only into val/test would abort the
# notebook. Whenever train already contains every status, the fitted classes
# (and therefore the integer codes) are identical to a train-only fit.
# Assumes the splits are pandas objects, as the column indexing suggests.
le.fit(
    pd.concat(
        [train["OverallStatus"], val["OverallStatus"], test["OverallStatus"]]
    )
)
y_tr = le.transform(train["OverallStatus"])
y_va = le.transform(val["OverallStatus"])
y_te = le.transform(test["OverallStatus"])


In [None]:
# Write the per-split feature matrices and encoded targets under
# data/processed via the project's save_npz helper.
# NOTE(review): only the matrices returned by vectorize_conditions (X_*) are
# passed here; the phase matrices (Xp_*) computed earlier are not part of the
# export -- confirm that this is intended.
save_npz(
    out_dir="data/processed",
    X_dict=dict(train=X_tr, val=X_va, test=X_te),
    y_dict=dict(train=y_tr, val=y_va, test=y_te),
)
print("Saved to data/processed/*.npz")


In [None]:
import numpy as np


def loader(split):
    """Return the array stored under key "X" in data/processed/<split>.npz.

    Args:
        split: Name of the split, used as the file stem (e.g. "train").

    Returns:
        The array read from the archive's "X" entry.
    """
    # np.load on an .npz archive returns a lazily-read NpzFile that keeps the
    # underlying file open; the with-block closes it once "X" has been read.
    with np.load(f"data/processed/{split}.npz") as archive:
        return archive["X"]


# Round-trip check: read the exported training features back and show their shape.
print("Loaded shape:", loader("train").shape)
