<h1>Libraries</h1>

In [None]:
# ===================== LIBRARIES =====================
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# ===================== CONFIG =====================
# Seed passed to the train/test split and to every model below.
RANDOM_STATE = 42
# Name of the label column inside the dataset.
TARGET = "loan_paid_back"
# Single source of truth for the input file. Previously this constant was
# defined but ignored, and read_csv used its own hard-coded name; the constant
# now holds the file that was actually being read.
DATA_PATH = "Loan_data"

# ===================== LOAD DATA =====================
# Read through DATA_PATH so that changing the config above really changes
# which file is loaded.
df = pd.read_csv(DATA_PATH)

print("Dataset shape:", df.shape)
print("Target distribution:\n", df[TARGET].value_counts())

# ===================== FEATURES / TARGET =====================
# The label is pulled out first; the feature frame is everything except the
# label and the "id" column (errors="ignore" tolerates a missing "id").
y = df[TARGET]
X = df.drop(columns=[TARGET, "id"], errors="ignore")

# ===================== COLUMN TYPES =====================
# Columns with object dtype are treated as categorical; all remaining columns
# are treated as numerical.
object_view = X.select_dtypes(include=["object"])
non_object_view = X.select_dtypes(exclude=["object"])

categorical_cols = object_view.columns.tolist()
numerical_cols = non_object_view.columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

# ===================== ENCODING =====================
# Each categorical column gets its own LabelEncoder, kept in a dict keyed by
# column name so the same mapping can be reapplied later.
# NOTE(review): the encoders are fitted on the full feature frame, i.e. before
# the train/test split further down.
X_encoded = X.copy()
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, le in label_encoders.items():
    X_encoded[col] = le.fit_transform(X_encoded[col])

# Column order of the encoded frame, recorded for later reuse.
feature_columns = X_encoded.columns.tolist()

# ===================== SPLIT =====================
# 80/20 hold-out split, stratified on the target and seeded for repeatability.
split_options = {
    "test_size": 0.2,
    "stratify": y,
    "random_state": RANDOM_STATE,
}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

# ===================== SCALING =====================
# The scaler learns its statistics from the training rows only and is then
# applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ===================== MODELS =====================
# Three candidate classifiers, each configured with class re-weighting and
# the shared seed. They are built individually, then collected under their
# display names.
logistic_regression_clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

random_forest_clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=3,
    class_weight="balanced",
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

catboost_clf = CatBoostClassifier(
    iterations=100,
    learning_rate=0.03,
    depth=7,
    loss_function="Logloss",
    eval_metric="AUC",
    auto_class_weights="Balanced",
    l2_leaf_reg=6,
    random_seed=RANDOM_STATE,
    verbose=0,
)

# Insertion order matters: it is the order in which the models are trained.
models = {
    "Logistic Regression": logistic_regression_clf,
    "Random Forest": random_forest_clf,
    "CatBoost": catboost_clf,
}

# ===================== TRAIN & EVALUATE =====================
# Fit every candidate, score it on the hold-out set, and keep both the
# metrics and the fitted estimator under the model's display name.
results = {}
trained_models = {}

for name, model in models.items():

    # Logistic regression is the only model fed the standardized matrices;
    # Random Forest and CatBoost train on the unscaled encoded features.
    if name == "Logistic Regression":
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    preds = model.predict(test_features)
    # Second probability column is the one scored with ROC AUC.
    probs = model.predict_proba(test_features)[:, 1]

    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)

    trained_models[name] = model
    results[name] = {"accuracy": acc, "auc": auc}

    print(f"\n===== {name} =====")
    print(f"Accuracy: {acc:.4f}")
    print(f"AUC: {auc:.4f}")
    print(classification_report(y_test, preds))

# ===================== SELECT BEST MODEL =====================
# The winner is the model with the highest hold-out AUC; on a tie, max()
# keeps the first one in training order.
best_model_name = max(results, key=lambda k: results[k]["auc"])
best_model = trained_models[best_model_name]

# The emoji in the status lines were stored as mojibake (UTF-8 bytes decoded
# with the wrong codec); they are restored to the intended characters.
print("\n🏆 BEST MODEL SELECTED:", best_model_name)
print("Best AUC:", results[best_model_name]["auc"])

# ===================== SAVE ARTIFACTS =====================
# One mapping drives both the dump calls and the printed summary, so the
# reported file names cannot drift from what is actually written.
# NOTE(review): best_model_name itself is not persisted, and only the
# logistic regression was trained on scaled inputs -- whoever loads these
# files needs another way to know whether to apply scaler.pkl.
artifacts = {
    "best_model.pkl": best_model,
    "scaler.pkl": scaler,
    "label_encoders.pkl": label_encoders,
    "feature_columns.pkl": feature_columns,
}

for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("\n✅ Files saved successfully:")
for filename in artifacts:
    print(f" - {filename}")


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'