In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import joblib
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer


# --- Data location and split settings --------------------------------------
DATA_PATH = "data"          # directory the CSV files are read from
VERBOSE = True              # not referenced in the cells visible here
R_S = 42                    # random_state passed to train_test_split
VAL_SIZE = 0.2              # fraction of rows held out for validation

# --- Feature-filtering thresholds ------------------------------------------
VIF_DELETE_THRESH = 10      # not referenced in the cells visible here
MISSING_THRESH = 0.9        # numeric columns with a higher NaN share are dropped

# --- Model artefacts -------------------------------------------------------
RESULTS_JSON = "models/results.json"
MODEL_PATH = "models/lgbm_num_cat_te_31.pkl"   # pickled pipeline loaded for inference
MODEL_ID_FILE = "models/model_id.txt"

# Read the two raw training tables from DATA_PATH.
train_transaction = pd.read_csv(f"{DATA_PATH}/train_transaction.csv")
train_identity = pd.read_csv(f"{DATA_PATH}/train_identity.csv")
print(train_transaction.shape, train_identity.shape, sep="\n")

# Combine the transaction and identity tables on TransactionID.
# A left join keeps every transaction, because:
# "Not all transactions have corresponding identity information."
train_df = pd.merge(
    train_transaction,
    train_identity,
    how="left",
    on="TransactionID",
)

# Separate the target column from the predictors.
y = train_df.loc[:, "isFraud"]
X = train_df.drop("isFraud", axis=1)

# Hold out VAL_SIZE of the rows for validation; stratifying on y keeps the
# class proportions the same in both parts, and R_S fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=R_S,
    test_size=VAL_SIZE,
)


(590540, 394)
(144233, 41)


In [None]:
# NOTE: MODEL_PATH is defined once in the configuration cell; it is not
# redefined here so that a change to the config value is not silently overridden.

print("1. Preprocessing the train set...")

# Numeric features: every numeric column except the row identifier.
# errors="ignore" makes the drop a no-op when TransactionID is absent, and
# working without inplace=True keeps this cell safe to re-run.
X_train_num = (
    X_train.select_dtypes(include=["number"])
    .drop(columns=["TransactionID"], errors="ignore")
)

# Keep only numeric columns whose share of missing values is at most MISSING_THRESH.
missing_rate = X_train_num.isna().mean()
keep_cols = missing_rate[missing_rate <= MISSING_THRESH].index.tolist()
X_train_num_filtered = X_train_num[keep_cols]

# Categorical features: missing values become their own explicit category
# before target encoding.
X_train_cat = X_train.select_dtypes(include=["object", "category"]).copy()
cat_cols = X_train_cat.columns.tolist()
X_train_cat = X_train_cat.fillna("__MISSING__")

# Fit the target encoder on the training split only; the fitted `te` is reused
# later to transform the test set.
te = TargetEncoder(cols=cat_cols)
X_train_cat_encoded = te.fit_transform(X_train_cat, y_train)

# Both frames come from X_train, so the index-based join lines the rows up.
X_train_final = X_train_num_filtered.join(X_train_cat_encoded)

# This reports the *train* matrix (the previous message mislabelled it as test data).
print(f"   Train data ready. Shape: {X_train_final.shape}")

print("2. Loading the model...")
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
old_pipeline = joblib.load(MODEL_PATH)

# Only the classifier step of the stored pipeline is reused.
trained_model = old_pipeline.named_steps["clf"]

# A fresh median imputer is fitted on the rebuilt training matrix.
new_imputer = SimpleImputer(strategy="median").fit(X_train_final)

print("3. Preprocessing the test set..")

test_transaction = pd.read_csv(f"{DATA_PATH}/test_transaction.csv")
test_identity = pd.read_csv(f"{DATA_PATH}/test_identity.csv")

# Replace hyphens with underscores in the identity column names
# (presumably to match the naming used in the training identity table).
test_identity.columns = [c.replace('-', '_') for c in test_identity.columns]

X_test = test_transaction.merge(test_identity, on="TransactionID", how="left")
test_ids = X_test["TransactionID"]

# Numeric features: select exactly the columns kept for training, in the same
# order. reindex creates any column that is absent (or non-numeric) in the test
# set as all-NaN, without assigning column-by-column onto a select_dtypes result.
X_test_num = X_test.select_dtypes(include=["number"]).reindex(columns=keep_cols)

# Target encoding
# Categorical features: same column set as training; columns absent from the
# test set are created pre-filled with the missing marker, and remaining NaNs
# are replaced by the same marker.
X_test_cat = (
    X_test.select_dtypes(include=["object", "category"])
    .reindex(columns=cat_cols, fill_value="__MISSING__")
    .fillna("__MISSING__")
)
X_test_cat_encoded = te.transform(X_test_cat)

X_test_final = X_test_num.join(X_test_cat_encoded)

# The imputer and the model consume columns positionally, so the test matrix
# must have exactly the train matrix's columns in the same order.
assert list(X_test_final.columns) == list(X_train_final.columns), (
    "test feature columns do not match the training feature columns"
)

# (The log message below is Polish for "Test data ready".)
print(f"   Dane testowe gotowe. Shape: {X_test_final.shape}")


print("4. Generating predictions...")

# Fill the remaining NaNs with the medians learned by the freshly fitted imputer.
X_test_imputed = new_imputer.transform(X_test_final)

# Take the second probability column as the fraud score
# (presumably the positive class — confirm against trained_model.classes_).
class_probabilities = trained_model.predict_proba(X_test_imputed)
predictions = class_probabilities[:, 1]

# Two-column submission table: transaction id and predicted score.
submission = pd.DataFrame({"TransactionID": test_ids, "isFraud": predictions})
submission.to_csv("submission.csv", index=False)

print("Success! The file  'submission.csv' was saved.")

1. Preprocessing the train set...
   Test data loaded. Shape: (472432, 422)
2. Loading the model...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
