# Import/Setup

In [1]:
import gc
import getpass
import os
import warnings
from datetime import datetime

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings('ignore')

# MLflow tracking configuration for the DagsHub-hosted tracking server.
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow'
os.environ['MLFLOW_TRACKING_USERNAME'] = 'g-kitiashvili'

# SECURITY: never commit the access token with the notebook.
# The token is taken from the environment when already exported; otherwise it is
# requested interactively (input is not echoed). Any token that was previously
# hardcoded in this cell should be treated as compromised and rotated.
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass.getpass('MLflow tracking access token: ')

# Select (or create) the experiment that all runs in this notebook are logged to.
mlflow.set_experiment("Model_Inference")


<Experiment: artifact_location='mlflow-artifacts:/3c7f6a385b7b4a07b71d6770ad1431e3', creation_time=1745410111300, experiment_id='4', last_update_time=1745410111300, lifecycle_stage='active', name='Model_Inference', tags={}>

# Load test data

In [2]:
# Read the two raw test tables from the local data directory.
print("Loading test data...")
test_transaction = pd.read_csv('./data/test_transaction.csv')
test_identity = pd.read_csv('./data/test_identity.csv')
print(f"Test transaction shape: {test_transaction.shape}")
print(f"Test identity shape: {test_identity.shape}")

# Left join on TransactionID: every transaction row is kept, with identity
# columns attached where a matching identity record exists.
print("Merging test data...")
test = pd.merge(test_transaction, test_identity, how='left', on='TransactionID')
print(f"Merged test shape: {test.shape}")

# Keep an independent copy of the IDs; the submission cell pairs them with predictions.
test_transaction_id = test.loc[:, 'TransactionID'].copy()

# The source tables are no longer needed once merged; drop the references and collect.
del test_transaction
del test_identity
gc.collect()

Loading test data...
Test transaction shape: (506691, 393)
Test identity shape: (141907, 41)
Merging test data...
Merged test shape: (506691, 433)


144

# Load best model

In [3]:
# Name of the registered model to pull from the MLflow model registry.
best_model_name = "XGBoost_Model"

# Load the model inside its own MLflow run so the load is recorded with the model name.
with mlflow.start_run(run_name="Load_Best_Model"):
    print(f"Loading model: {best_model_name}")
    # The "latest" suffix asks the registry for the most recent version of the model.
    model_uri = f"models:/{best_model_name}/latest"
    model = mlflow.sklearn.load_model(model_uri)
    mlflow.log_param(key="model_name", value=best_model_name)
    print("Model loaded successfully.")

Loading model: XGBoost_Model


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00,  7.55it/s]


Model loaded successfully.
🏃 View run Load_Best_Model at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/4/runs/dde471d397784e93bacebf5c00b50a42
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/4


# Preprocess & Predict

In [4]:
# Encode the merged test frame, align its columns with the features the model was
# trained on, and score every row. Runs inside its own MLflow run.
with mlflow.start_run(run_name="Generate_Predictions"):
    print("Preprocessing test data...")
    test_encoded = test.copy()  # leave the merged `test` frame untouched

    # Integer-encode every object (string) column; missing values become their own
    # 'missing' category before encoding.
    # NOTE(review): the encoders are fit on the test set itself, so the integer codes
    # only agree with training if training derived them from the same set of category
    # values — confirm against the training notebook (ideally reuse the fitted
    # training encoders instead of refitting here).
    obj_cols = test_encoded.select_dtypes(include=['object']).columns.tolist()
    print(f"Found {len(obj_cols)} object columns.")
    for col in obj_cols:
        filled = test_encoded[col].fillna('missing').astype(str)
        test_encoded[col] = LabelEncoder().fit_transform(filled)

    # Every object column was integer-coded by the loop above, so no further
    # object-to-numeric coercion pass is required.
    print("Encoding complete.")

    # Align features with training: the booster records the column names it was fit on.
    train_feats = model.get_booster().feature_names
    if train_feats is None:
        raise ValueError(
            "The loaded model does not expose feature names; "
            "cannot align test columns with the training features."
        )

    # Set lookup keeps the membership test cheap for wide frames.
    train_feat_set = set(train_feats)
    extras = [c for c in test_encoded.columns if c not in train_feat_set]
    if extras:
        print(f"Dropping {len(extras)} extra columns.")

    # Report features the model expects but the test frame lacks, instead of
    # filling them silently.
    missing = [c for c in train_feats if c not in test_encoded.columns]
    if missing:
        print(f"Adding {len(missing)} missing columns filled with 0.")

    # A single reindex drops the extra columns, adds the missing ones filled with 0,
    # and puts the columns in the training order — without mutating test_encoded.
    test_aligned = test_encoded.reindex(columns=train_feats, fill_value=0)
    print("Feature alignment done.")

    # Generate predictions: column 1 of predict_proba is the positive-class probability.
    test_probs = model.predict_proba(test_aligned)[:, 1]
    print("Predictions generated.")
    mlflow.log_param("num_test_rows", test_aligned.shape[0])

Preprocessing test data...
Found 31 object columns.
Encoding complete.
Dropping 94 extra columns.
Feature alignment done.
Predictions generated.
🏃 View run Generate_Predictions at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/4/runs/9bd00f1e8079424b94dc6d4c40db3fa2
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/4


# Build submission DataFrame and save

In [5]:
# Pair each transaction ID with its predicted fraud probability.
submission = pd.DataFrame({'TransactionID': test_transaction_id, 'isFraud': test_probs})

# Write the submission CSV, attach it to an MLflow run, and print summary statistics.
with mlflow.start_run(run_name="Save_Submission"):
    # File name carries the model name and a minute-resolution timestamp.
    submission_file = f"{best_model_name}_submission_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"
    submission.to_csv(path_or_buf=submission_file, index=False)
    print(f"Submission saved to: {submission_file}")
    mlflow.log_artifact(submission_file)

    print(submission.head())
    print(f"Total predictions: {len(submission)}")

    # Summary of the predicted probabilities.
    fraud_scores = submission['isFraud']
    print(f"Prediction range: [{fraud_scores.min():.6f}, {fraud_scores.max():.6f}]")
    print(f"Mean prediction: {fraud_scores.mean():.6f}")

    # Rows whose predicted probability exceeds the 0.5 cut-off.
    flagged = fraud_scores > 0.5
    print(f">0.5 flag count: {flagged.sum()} ({flagged.mean()*100:.2f}%)")


Submission saved to: XGBoost_Model_submission_20250423_1705.csv
   TransactionID   isFraud
0        3663549  0.004157
1        3663550  0.002563
2        3663551  0.008957
3        3663552  0.005339
4        3663553  0.001406
Total predictions: 506691
Prediction range: [0.000009, 0.991089]
Mean prediction: 0.020070
>0.5 flag count: 3444 (0.68%)
🏃 View run Save_Submission at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/4/runs/8b267833826d42118d80c3073492301e
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/4
