In [2]:
import pandas as pd
from sklearn.utils import Bunch
import glob
import warnings

train_csv_files = glob.glob("train*.csv")
test_csv_files = glob.glob("test*.csv")

train_dataframes = {
    fname.replace("train", "", 1).replace(".csv", ""): pd.read_csv(fname)
    for fname in train_csv_files
}
test_dataframes = {
    fname.replace("test", "", 1).replace(".csv", ""): pd.read_csv(fname)
    for fname in test_csv_files
}

df_perf = Bunch(
    features=train_dataframes["perf"].drop(columns=["good_bad_flag"]),
    target=train_dataframes["perf"]["good_bad_flag"].map({"Bad": 0, "Good": 1}),
    test=test_dataframes["perf"],
)

df_prevloans = Bunch(
    features=train_dataframes["prevloans"],
    test=test_dataframes["prevloans"],
)

df_demographics = Bunch(
    features=train_dataframes["demographics"],
    test=test_dataframes["demographics"],
)

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from datetime import datetime

# 1. Join demographics data

df_perf_combined_features = df_perf.features.merge(
    df_demographics.features.drop_duplicates(),
    how="left",
    on="customerid",
)
df_perf_combined_test = df_perf.test.merge(
    df_demographics.test.drop_duplicates(),
    how="left",
    on="customerid",
)

# 2. Join previous loans data

df_prevloans_agg_features = (
    df_prevloans.features.groupby("customerid")
    .agg(
        prev_loan_count=("systemloanid", "nunique"),
        prev_loan_amt_mean=("loanamount", "mean"),
        prev_term_mean=("termdays", "mean"),
        prev_delay_mean=(
            "firstrepaiddate",
            lambda d: (
                (
                    pd.to_datetime(d)
                    - pd.to_datetime(df_prevloans.features["creationdate"])
                ).dt.days
            ).mean(),
        ),
        prev_early_payment_mean=(
            "firstduedate",
            lambda d: (
                (
                    pd.to_datetime(d)
                    - pd.to_datetime(df_prevloans.features["firstrepaiddate"])
                ).dt.days
            ).mean(),
        ),
    )
    .reset_index()
)

df_perf_combined_features = df_perf_combined_features.merge(
    df_prevloans_agg_features, how="left", on="customerid"
)

df_prevloans_agg_test = (
    df_prevloans.test.groupby("customerid")
    .agg(
        prev_loan_count=("systemloanid", "nunique"),
        prev_loan_amt_mean=("loanamount", "mean"),
        prev_term_mean=("termdays", "mean"),
        prev_delay_mean=(
            "firstrepaiddate",
            lambda d: (
                (
                    pd.to_datetime(d)
                    - pd.to_datetime(df_prevloans.test["creationdate"])
                ).dt.days
            ).mean(),
        ),
        prev_early_payment_mean=(
            "firstduedate",
            lambda d: (
                (
                    pd.to_datetime(d)
                    - pd.to_datetime(df_prevloans.test["firstrepaiddate"])
                ).dt.days
            ).mean(),
        ),
    )
    .reset_index()
)

df_perf_combined_test = df_perf_combined_test.merge(
    df_prevloans_agg_test, how="left", on="customerid"
)

  pd.to_datetime(d)
  pd.to_datetime(d)
  pd.to_datetime(d)
  pd.to_datetime(d)
  pd.to_datetime(d)


In [4]:
from sklearn.preprocessing import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier


class AgeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return (
            (datetime.now() - pd.to_datetime(X.squeeze(), errors="coerce")).dt.days
            // 365
        ).to_frame()

    def get_feature_names_out(self, input_features=None):
        return ["age"]


class DateTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.apply(pd.to_datetime, errors="coerce")
        features = []

        for col in X.columns:
            day = X[col].dt.dayofweek.add(1).fillna(0).astype(int)
            day.name = f"{col}_day"
            month = X[col].dt.month.fillna(0).astype(int)
            month.name = f"{col}_month"
            features.append(day)
            features.append(month)

        return pd.concat(features, axis=1)

    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            return None
        return [f"{col}_day" for col in input_features] + [
            f"{col}_month" for col in input_features
        ]


class ReferralTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.notna().astype(int)

    def get_feature_names_out(self, input_features=None):
        return ["referred"]


class ApprovalDelay(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        approval_date = pd.to_datetime(X["approveddate"], errors="coerce")
        creation_date = pd.to_datetime(X["creationdate"], errors="coerce")
        delay = ((approval_date - creation_date).dt.total_seconds() / 60).to_frame(
            name="approval_delay"
        )
        return delay

    def get_feature_names_out(self, input_features=None):
        return ["approval_delay"]


cols = [
    (
        "scaler",
        StandardScaler(),
        [
            "loanamount",
            "totaldue",
            "longitude_gps",
            "latitude_gps",
            "termdays",
        ],
    ),
    (
        "prevloans",
        StandardScaler(),
        [
            "prev_loan_count",
            "prev_loan_amt_mean",
            # "prev_term_mean",
            "prev_delay_mean",
            "prev_early_payment_mean",
        ],
    ),
    ("age", AgeTransformer(), ["birthdate"]),
    # ("convert_dates", DateTransformer(), ["approveddate"]),
    # ("referral", ReferralTransformer(), ["referral"]),
    (
        "categorical",
        TargetEncoder(),
        [
            "employment_status_clients",
            # "level_of_education_clients",
            "bank_name_clients",
        ],
    ),
]

In [5]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC


import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

# Example usage:
pipe = Pipeline(
    [
        ("transform_columns", ColumnTransformer(cols)),
        ("impute", SimpleImputer(strategy="mean")),
        (
            "clf",
            CatBoostClassifier(
                verbose=0,
                random_strength=1.0,
                random_state=42,
                learning_rate=0.1,
                l2_leaf_reg=1,
                iterations=100,
                grow_policy="Depthwise",
                depth=2,
            ),
        ),
    ]
)
pipe.fit(df_perf_combined_features, df_perf.target)

0,1,2
,steps,"[('transform_columns', ...), ('impute', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('scaler', ...), ('prevloans', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [6]:
param_grid = [
    {
        "clf": [CatBoostClassifier()],
        "clf__depth": [2, 3, 4, 6, 8, 10],
        "clf__learning_rate": [0.01, 0.03, 0.1, 0.2, 0.4],
        "clf__iterations": [50, 100, 300, 500],
        "clf__verbose": [0],  # Suppress CatBoost output during training
        "clf__random_state": [42],
        # "clf__class_weights": [{0: 1.0, 1: 0.5}, {0: 1.0, 1: 0.75}, {0: 1.0, 1: 1.0}],
        "clf__auto_class_weights": ["Balanced", "SqrtBalanced", "None"],
        "clf__random_strength": [0.5, 1, 5, 10],
        "clf__l2_leaf_reg": [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
        "clf__grow_policy": ["SymmetricTree", "Depthwise"],
        "clf__bagging_temperature": [0.1, 0.5, 1.0, 2.0],
        "clf__border_count": [32, 64, 128],
        "clf__rsm": [0.4, 0.6, 0.8, 1.0],
    },
    {
        "clf": [KNeighborsClassifier()],
        "clf__n_neighbors": [3, 5, 7, 9],
        "clf__weights": ["uniform", "distance"],
        "clf__metric": ["euclidean", "manhattan", "minkowski"],
    },
    {
        "clf": [HistGradientBoostingClassifier()],
        "clf__max_iter": [100, 200, 300],
        "clf__learning_rate": [0.001, 0.01, 0.1],
        "clf__max_depth": [3, 5, 7, 9, 11, 13, 15],
    },
    {
        "clf": [RandomForestClassifier()],
        "clf__n_estimators": [50, 100],
        "clf__class_weight": ["balanced"],
        "clf__max_depth": [None, 5, 10, 15],
        "clf__min_samples_split": [2, 5],
    },
    {
        "clf": [SVC()],
        "clf__C": [0.1, 1],
        "clf__kernel": ["linear", "rbf"],
        "clf__class_weight": ["balanced"],
    },
    {
        "clf": [DecisionTreeClassifier()],
        "clf__max_depth": [None, 5, 10],
        "clf__min_samples_split": [2, 5],
        "clf__class_weight": ["balanced"],
    },
    {
        "clf": [XGBClassifier()],
        "clf__use_label_encoder": [False],
        "clf__eval_metric": ["logloss"],
        "clf__n_estimators": [100, 200],
        "clf__learning_rate": [0.01, 0.1],
    },
]

# grid = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
grid = RandomizedSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
grid.fit(df_perf_combined_features, df_perf.target)

0,1,2
,estimator,Pipeline(step...x133556cf0>)])
,param_distributions,"[{'clf': [<catboost.cor...t 0x133ab4b90>], 'clf__auto_class_weights': ['Balanced', 'SqrtBalanced', ...], 'clf__bagging_temperature': [0.1, 0.5, ...], 'clf__border_count': [32, 64, ...], ...}, {'clf': [KNeighborsClassifier()], 'clf__metric': ['euclidean', 'manhattan', ...], 'clf__n_neighbors': [3, 5, ...], 'clf__weights': ['uniform', 'distance']}, ...]"
,n_iter,10
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,transformers,"[('scaler', ...), ('prevloans', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [7]:
for mean, std, params in zip(
    grid.cv_results_["mean_test_score"],
    grid.cv_results_["std_test_score"],
    grid.cv_results_["params"],
):
    model = params.get("clf").__class__.__name__
    params = {k.replace("clf__", ""): v for k, v in params.items()}
    print(f"Mean: {mean:.3f}, Std: {std:.3f}, Model: {model}, Params: {params}")

print("Best parameters found: ", grid.best_params_)
print("Best cross-validation score: ", grid.best_score_)

col_transform = grid.best_estimator_.named_steps["transform_columns"]
best_model = grid.best_estimator_.named_steps["clf"]

print("Best model:", best_model.__class__.__name__)

if hasattr(best_model, "get_feature_importance"):
    feature_importances = zip(
        col_transform.get_feature_names_out(),
        best_model.get_feature_importance(),
    )
    print("Feature Importances")
    # Convert feature importances to a dictionary for better readability
    feature_importances_dict = dict(feature_importances)
    print(list(enumerate(feature_importances_dict.items())))
    # Sort feature importances by value in descending order
    sorted_feature_importances = sorted(
        feature_importances_dict.items(), key=lambda x: x[1], reverse=True
    )

    # Convert to a DataFrame for CSV export
    feature_importances_df = pd.DataFrame(
        sorted_feature_importances, columns=["Feature", "Importance"]
    )

    # Save to CSV
    feature_importances_df.to_csv("feature_importances.csv", index=False)

    # Print the DataFrame
    print(feature_importances_df)
else:
    print("The best model does not support feature importances.")

Mean: 0.790, Std: 0.002, Model: CatBoostClassifier, Params: {'verbose': 0, 'rsm': 0.6, 'random_strength': 10, 'random_state': 42, 'learning_rate': 0.4, 'l2_leaf_reg': 17, 'iterations': 50, 'grow_policy': 'Depthwise', 'depth': 10, 'border_count': 64, 'bagging_temperature': 0.1, 'auto_class_weights': 'None', 'clf': <catboost.core.CatBoostClassifier object at 0x133ab4b90>}
Mean: 0.765, Std: 0.008, Model: CatBoostClassifier, Params: {'verbose': 0, 'rsm': 1.0, 'random_strength': 10, 'random_state': 42, 'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 300, 'grow_policy': 'Depthwise', 'depth': 8, 'border_count': 32, 'bagging_temperature': 2.0, 'auto_class_weights': 'SqrtBalanced', 'clf': <catboost.core.CatBoostClassifier object at 0x133ab4b90>}
Mean: 0.772, Std: 0.004, Model: CatBoostClassifier, Params: {'verbose': 0, 'rsm': 1.0, 'random_strength': 1, 'random_state': 42, 'learning_rate': 0.03, 'l2_leaf_reg': 13, 'iterations': 500, 'grow_policy': 'Depthwise', 'depth': 6, 'border_count': 3

In [8]:
# Make predictions on the test set
test_preds = grid.predict(df_perf_combined_test)
# test_preds = pipe.predict(df_perf_combined_test)

# Prepare the output DataFrame
output_df = pd.DataFrame(
    {"customerid": df_perf.test["customerid"], "good_bad_flag": test_preds}
)

print("\nNumber of predicted 1s:", (output_df["good_bad_flag"] == 1).sum())
print("Number of predicted 0s:", (output_df["good_bad_flag"] == 0).sum())

# Save to CSV
output_df.to_csv("predictions.csv", index=False)


Number of predicted 1s: 1319
Number of predicted 0s: 131


In [22]:
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


# 1. Transform the data
X_transformed = col_transform.transform(df_perf_combined_features)

# 2. Get feature names
feature_names = col_transform.get_feature_names_out()

# 3. Create CatBoost Pool
catboost_pool = Pool(
    X_transformed,
    df_perf.target,
    feature_names=list(feature_names),
)

model = CatBoostClassifier(max_depth=3, verbose=False, iterations=10).fit(catboost_pool)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, df_perf.target, test_size=0.2, random_state=42
)

# Train the model on the training set
model.fit(X_train, y_train)

# Validate the model on the test set
y_pred = model.predict(X_test)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# model.plot_tree(tree_idx=0, pool=catboost_pool)

Confusion Matrix:
[[ 40 166]
 [ 40 628]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.19      0.28       206
           1       0.79      0.94      0.86       668

    accuracy                           0.76       874
   macro avg       0.65      0.57      0.57       874
weighted avg       0.72      0.76      0.72       874



In [29]:
# Convert X_test to a DataFrame for easier indexing
X_test_df = pd.DataFrame(X_test, columns=feature_names)

# Get indices of correct and incorrect predictions
correct_indices = np.where(y_pred == y_test)[0]
incorrect_indices = np.where(y_pred != y_test)[0]

# Select three records from each
correct_records = X_test_df.iloc[correct_indices[:3]].copy()
incorrect_records = X_test_df.iloc[incorrect_indices[:3]].copy()

# Add the predicted value as a new column
correct_records["predicted"] = y_pred[correct_indices[:3]]
incorrect_records["predicted"] = y_pred[incorrect_indices[:3]]

correct_records.to_csv("correct_records.csv", index=False)
incorrect_records.to_csv("incorrect_records.csv", index=False)

In [30]:
# Analyze incorrect predictions
print(
    "The incorrect predictions may have occurred due to several reasons:"
    "\n1. **Insufficient or Noisy Data**: Some features might lack sufficient information or contain noise, "
    "making it difficult for the model to learn meaningful patterns."
    "\n2. **Feature Importance**: Features with low importance, as seen in the feature importance analysis, "
    "may not contribute significantly to the model's decision-making process."
    "\n3. **Class Imbalance**: If the dataset is imbalanced, the model might struggle to correctly predict the minority class."
    "\n4. **Complex Patterns**: The relationships between features and the target variable might be too complex for the model to capture."
    "\n5. **Overfitting**: The model might have overfitted to the training data, leading to poor generalization on unseen data."
    "\n6. **Hyperparameter Tuning**: Suboptimal hyperparameters could have limited the model's performance."
    "\nFurther investigation into the incorrectly predicted records and their feature values could help identify specific issues."
)

The incorrect predictions may have occurred due to several reasons:
1. **Insufficient or Noisy Data**: Some features might lack sufficient information or contain noise, making it difficult for the model to learn meaningful patterns.
2. **Feature Importance**: Features with low importance, as seen in the feature importance analysis, may not contribute significantly to the model's decision-making process.
3. **Class Imbalance**: If the dataset is imbalanced, the model might struggle to correctly predict the minority class.
4. **Complex Patterns**: The relationships between features and the target variable might be too complex for the model to capture.
5. **Overfitting**: The model might have overfitted to the training data, leading to poor generalization on unseen data.
6. **Hyperparameter Tuning**: Suboptimal hyperparameters could have limited the model's performance.
Further investigation into the incorrectly predicted records and their feature values could help identify specific issue