# Problem 
Identify potentially fraudulent transactions in the given dataset


# Assumptions - Made by Maciej Król
* The model shouldn't be overfitted to the given customers and counterparties
* Make the model as robust as possible for new customers and transactions 
* Achieve the best score on cross-validation with a 4:1 split <br>
* 1 - fraudulent, 0 - normal
* The business case is probably to catch every single fraudulent transaction, so 
the business metric should be to maximise recall while keeping precision/balanced accuracy in mind.
* In a real environment there should probably be another model, or rules, that can recognise client connections and institutions associated with possible fraud.

# Thoughts - Made by Maciej Król
* Customer IDs could be related to the date or to the age of the clients, but without additional information this cannot be used. I assume that the customer ID carries no significant information besides its first letter.
* The transaction amount is probably not normalised. It should be brought to one currency, e.g. USD.
* TODO: For better transaction normalisation, maybe take the average salary in the given currency/USD, hmmm <br> UPD: Almost no correlation with the fraud flag, may be a waste of time
* We have only 91 fraudulent transactions (probably need some kind of oversampling; remember about class_weights)  

# Imports/Options

In [None]:
!pip install -q clearml
!pip install -q binclass-tools
!pip install -q catboost

# Env settings to retrieve artifacts from clear_ml server
# Clear ml - relatively new tool for MLOps 
# We are operating on Task objects which have connection to our artifacts database
%env CLEARML_WEB_HOST=https://app.clear.ml
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
%env CLEARML_API_ACCESS_KEY=QZS6KLLLR4UR43EFQONG
%env CLEARML_API_SECRET_KEY=AHdLlidwgSXAjYm8ISxZd0b5mwkcUjMTjd2wARnFBtMqfO0unT

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25henv: CLEARML_WEB_HOST=https://app.clear.ml
env: CLEARML_API_HOST=https://api.clear.ml
env: CLEARML_FILES_HOST=https://files.clear.ml
env: CLEARML_API_ACCESS_KEY=<redacted>
env: CLEARML_API_SECRET_KEY=<redacted>


In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; the exploration cells of this notebook put several bare expressions in
# one cell and rely on all of them being shown.
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
import pandas as pd
import numpy as np 

from typing import List, Dict

import math

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

from datetime import datetime

from scipy.stats import pointbiserialr, chi2_contingency

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Default plot styling: large figures and tick labels, extra-large titles
plt.rcParams.update({
    'figure.figsize': [20, 12],
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'axes.titlesize': 'x-large',
})

# The short pattern 'max_columns' raised OptionError when this cell was run;
# the fully qualified option name is unambiguous.
pd.set_option('display.max_columns', 50)

OptionError: ignored

In [None]:
## Important for data retrieval
from clearml import Task
# Attach to the existing ClearML task whose artifacts are read by the cells
# below. The commented line is the alternative that creates a new task:
# task = Task.init(project_name='Silent_Eight_Project', task_name='task_3')
task = Task.get_task(task_id='b2a51c5fb8fb4b8490da923feed5df23')

# EDA + data preparation

In [None]:
# Get a local copy of the 'transactions' artifact and load it,
# parsing the 'timestamp' column as datetimes
df_path = task.artifacts['transactions'].get_local_copy()
df = pd.read_csv(df_path, parse_dates=['timestamp'])
# Shuffle all rows (no random_state, so the order differs between runs)
df = df.sample(frac = 1)

In [None]:
def print_section(text: str):
  """Print `text` as a section banner framed by two lines of asterisks.

  The banner is preceded by two empty lines and the asterisk lines are
  exactly as wide as the framed text.
  """
  banner = f"| {text} |"
  rule = '*' * len(banner)
  print('\n')
  print(rule, banner, rule, sep='\n')

In [None]:
# Quick data-quality overview: column dtypes, fully duplicated rows and the
# number of nulls per column. Each bare expression is displayed by the notebook.
print_section('Types in DataFrame')
df.dtypes

print_section('Duplicated Rows')
df[df.duplicated()]

print_section('Number of nulls')
df.isna().sum()

In [None]:
# Fill a missing counterparty_country from other records of the same counterparty.
# The nulls themselves could carry significant information, so they are kept as a flag.
# sort_values puts NaN countries last within each counterparty, so a forward
# fill inside the group copies the known country onto the missing rows.
df = df.sort_values(by=['counterparty', 'counterparty_country'])
df['counterparty_filled'] = df.groupby('counterparty')['counterparty_country'].ffill()

# country_null == 1 marks rows whose country was missing but could be recovered
df['country_null'] = 0
recovered = df['counterparty_country'].isna() & df['counterparty_filled'].notna()
df.loc[recovered, 'country_null'] = 1

df = df.drop(columns=['counterparty_country'])

# Rows that are still empty: all of these counterparties occur only once
df[df['counterparty_filled'].isna()]
# df[df['counterparty']==78153912424955]

# Fill them with the 'Unknown' label
df['counterparty_filled'] = df['counterparty_filled'].fillna('Unknown')

In [None]:
# Number of distinct values in every column
print_section('Unique occurences per column')
df.nunique()

In [None]:
# Count the distinct 3-, 2- and 1-character prefixes of the customer id.
# We got 4 types of customers (probably) :D
print_section('Unique values per first letters of customers')
print(df['customer'].str[:3].nunique())
print(df['customer'].str[:2].nunique())
print(df['customer'].str[:1].nunique())

print_section('Count for types of customers')
df['customer'].str[:1].value_counts()

# Keep the first letter of the customer id as a separate categorical feature
df.loc[:, 'customer_type'] = df['customer'].str[:1]

In [None]:
# Value counts of the categorical columns
print_section('Count for transaction types')
df['type'].value_counts()

print_section('Count for customer countries')
df['customer_country'].value_counts()

# No currencies of high-risk countries https://ec.europa.eu/transparency/documents-register/detail?ref=C(2022)9649&lang=en
print_section('Count for currencies')
df['ccy'].value_counts()

# No high-risk countries among the counterparties https://ec.europa.eu/transparency/documents-register/detail?ref=C(2022)9649&lang=en
print_section('Count for counterparty countries')
df['counterparty_filled'].value_counts()

In [None]:
# Collapse every spelling of the United States into the single tag 'US'
us_replace_dict = dict.fromkeys(['USA', 'United States'], 'US')
df['counterparty_filled'] = df['counterparty_filled'].replace(us_replace_dict)

print_section('Count for counterparty countries filled with replace')
df['counterparty_filled'].value_counts()

In [None]:
# Check the number of transactions per customer and plot its histogram.
# The distribution is quite normal, no outlying clients.
print_section('Count for customers')
df['customer'].value_counts()
customer_frequncies = df['customer'].value_counts()
plt.hist(x=customer_frequncies.values, bins=30, )
plt.title('Distribution of Customers')
plt.show()

In [None]:
# Check the number of transactions per counterparty and plot its histogram.
# At first look there are probably no outliers.
print_section('Count for counterparties')
print(df['counterparty'].value_counts())
# NOTE: despite its name, this variable holds the counterparty counts
customer_frequncy = df['counterparty'].value_counts().values
plt.hist(x=customer_frequncy, bins=30)
plt.title('Distribution of counterparties')
plt.show()

In [None]:
# Probe: try a direct cast of the amounts to float.
# NOTE(review): presumably fails while the strings still contain ',', '$' or
# '£', which are removed in the following cells — confirm.
df['amount'].astype(float)

In [None]:
# Show the rows whose amount is not a plain decimal number (digits with an
# optional dot). I assume that I can just delete [',', '$', '£'].
df[~df['amount'].str.match(r'^[0-9]*\.?[0-9]*$')]

In [None]:
# Strip thousands separators and currency symbols from the amount strings
for symbol in (',', '$', '£'):
  df['amount'] = df['amount'].str.replace(symbol, '', regex=False)

# Anything listed here still does not look like a plain decimal number
print_section('List with broken records after replacing')
df[~df['amount'].str.match(r'^[0-9]*\.?[0-9]*$')]

In [None]:
# The amount strings have been cleaned, so the column can be converted to float
df['amount'] = df['amount'].astype(float)

In [None]:
# Distribution of the transactions over each calendar component of the timestamp
print_section('Count for years')
df['timestamp'].dt.year.value_counts()

print_section('Count for months')
df['timestamp'].dt.month.value_counts()

print_section('Count for days')
df['timestamp'].dt.day.value_counts()

print_section('Count for weekdays')
df['timestamp'].dt.weekday.value_counts()

print_section('Count for hours')
df['timestamp'].dt.hour.value_counts()

In [None]:
# Split the timestamp into separate calendar features
ts = df['timestamp'].dt
df['year'] = ts.year
df['month'] = ts.month
df['weekday'] = ts.weekday + 1  # shift pandas' Monday=0..Sunday=6 to 1..7
df['day'] = ts.day
df['hour'] = ts.hour

# From here on only the calendar date of the transaction is kept
df['timestamp'] = ts.date

# Time span covered by the dataset
print(df['timestamp'].min())
print(df['timestamp'].max())

In [None]:
# This section normalises the transaction amount: every amount is converted
# with the exchange rate of its currency on the transaction date.
# NOTE(review): that the result is really USD depends on the quote convention
# of the rates file — confirm.

# Preprocess the rates dataframe
rates_path = task.artifacts['exchange_rates'].get_local_copy()
rates = pd.read_csv(rates_path, on_bad_lines='skip')
# Row 1 is used as the header and rows 0-7 are dropped
# (presumably file metadata — TODO confirm against the raw file)
rates.columns = rates.loc[1]
rates = rates.drop(index=[0,1,2,3,4,5,6,7])
# Keep only the first three characters of every header; the column that
# becomes 'Cur' is renamed to 'timestamp'
rates.columns = [col[:3] for col in rates.columns]
rates = rates.rename(columns={"Cur": "timestamp"})
# Fill gaps with the last known value
rates = rates.ffill()

# Get only interesting timeframe (string comparison on the 'timestamp' column)
rates = rates[(rates['timestamp'] >= '2021-01-01' ) & (rates['timestamp'] <= '2022-01-01')]
rates = rates.set_index('timestamp')

# Get only interesting currencies
currencies = ['CNY', 'SGD', 'USD', 'JPY', 'HKD', 'GBP', 'BRL', 'INR', 'EUR']
rates = rates[currencies]

# Get only one EUR: keep the first 9 columns
# (assumes a duplicated EUR column comes last — TODO confirm)
rates = rates.iloc[:, :9]

# Make currency conversion
def conversion_to_USD(row: pd.Series) -> float:
  """Convert the amount of one transaction row with the rate of its date.

  Looks up the module-level `rates` frame at (row['timestamp'], row['ccy'])
  and returns row['amount'] divided by that rate.
  """
  day, value, currency = row['timestamp'], row['amount'], row['ccy']
  return value / rates.loc[day, currency]

# The rates index is looked up by date, so the transaction date is turned into
# a string first (presumably the same text format as the rates index)
df['timestamp'] = df['timestamp'].astype(str)
df['amount_converted'] = df.apply(conversion_to_USD, axis=1)

In [None]:
# Encode the target numerically: 'Y' -> 1 (fraudulent), 'N' -> 0 (normal)
df['fraud_flag'] = df['fraud_flag'].replace({'Y': 1, 'N': 0})

# Round both amount columns to two decimals
for amount_col in ('amount_converted', 'amount'):
  df[amount_col] = df[amount_col].round(2)

# One-hot encode the categorical columns (first level dropped), then remove
# the identifier and raw date columns that must not reach the model
categorical_cols = ['customer_country', 'type', 'ccy', 'counterparty_filled', 'customer_type']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df = df.drop(columns=['customer', 'counterparty', 'timestamp', 'year']).reset_index(drop=True)

In [None]:
# Cyclical (sin/cos) encoding of time features.
# On this scale 23:00 and 01:00 are close to each other, as they should be.
def transform_time(df: pd.DataFrame, cols: List[str], periods: Dict[str, float] = None) -> pd.DataFrame:
  """Add `cos_<col>` and `sin_<col>` columns for every column in `cols`.

  Each value is mapped to the angle 2*pi*value/period. The period is the
  length of the cycle (24 for hours), not the largest observed value:
  dividing by the maximum would give hour 0 and hour 23 the same encoding
  and would make the scale depend on the data at hand.

  Args:
    df: input frame; it is not modified.
    cols: names of the time columns to encode.
    periods: optional mapping column -> cycle length that extends or
      overrides the defaults (month 12, weekday 7, day 31, hour 24).
      Columns without a known period fall back to their observed maximum.

  Returns:
    A copy of `df` with the encoded columns appended; the source columns
    are kept.
  """
  cycle_lengths = {'month': 12, 'weekday': 7, 'day': 31, 'hour': 24}
  if periods:
    cycle_lengths.update(periods)

  # Work on a copy so the caller's frame does not silently gain columns
  result = df.copy()
  for col in cols:
    period = cycle_lengths.get(col)
    if period is None:
      period = result[col].max()

    angle = 2 * math.pi * result[col] / period
    result[f"cos_{col}"] = np.cos(angle)
    result[f"sin_{col}"] = np.sin(angle)
  return result

# Replace the raw calendar columns with their sin/cos encoding
time_cols = ['month', 'weekday', 'day', 'hour']
df = transform_time(df, cols=time_cols).drop(columns=time_cols)

# task.upload_artifact('full_modelling_data', df)

# Corr + some more exploration

In [None]:
# Reload the prepared modelling frame from the ClearML task, so the analysis
# below can start from here without re-running the preparation cells
task = Task.get_task(task_id='b2a51c5fb8fb4b8490da923feed5df23')
df = task.artifacts['full_modelling_data'].get()

In [None]:
# Pearson correlation between all modelling columns.
# Nice: no strong correlation between the variables.
# Bad: no correlation with the target variable :(
corr = df.corr()
f, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(
    corr,
    mask=np.triu(np.ones_like(corr, dtype=bool)),  # hide the mirrored upper triangle
    cmap=sns.diverging_palette(230, 20, as_cmap=True),
    annot=True,
    fmt='.1f',
)
plt.title('Correlation heatmap pearson')
plt.show()

In [None]:
# Same heatmap with Kendall's rank correlation
corr = df.corr('kendall')
f, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(
    corr,
    mask=np.triu(np.ones_like(corr, dtype=bool)),  # hide the mirrored upper triangle
    cmap=sns.diverging_palette(230, 20, as_cmap=True),
    annot=True,
    fmt='.1f',
)
plt.title('Correlation heatmap kendall')
plt.show()

In [None]:
# Point-biserial correlation of every continuous feature with the binary target.
# Bad scores :(
cont_cols = [
    'amount', 'amount_converted',
    'cos_month', 'sin_month',
    'cos_weekday', 'sin_weekday',
    'cos_day', 'sin_day',
    'cos_hour', 'sin_hour',
]

print_section('Biserial Correlations')
for col in cont_cols:
  # name, test result and an empty separator line
  print(col, pointbiserialr(df['fraud_flag'], df[col]), '', sep='\n')

In [None]:
# Check binary corr
# There are some significant relations 
def check_binary_cor(col: str):
  """Print the chi-square p-value of `col` against the fraud flag.

  Builds the contingency table of the module-level `df['fraud_flag']` versus
  `df[col]` and prints the column name, the p-value and an empty line.
  """
  contingency = pd.crosstab(index=df['fraud_flag'], columns=df[col])
  p_value = chi2_contingency(contingency)[1]

  print(col)
  print('p-val:', p_value)
  print()


# All binary (flag / one-hot) feature columns of the modelling frame
bin_cols = ['country_null', 'customer_country_UK', 'customer_country_US', 'type_DIVIDEND',
       'type_INTEREST', 'type_INVESTMENT', 'type_OTHER', 'type_PAYMENT',
       'type_TRANSFER', 'type_TT', 'ccy_CNY', 'ccy_EUR', 'ccy_GBP', 'ccy_HKD',
       'ccy_INR', 'ccy_JPY', 'ccy_SGD', 'ccy_USD', 'counterparty_filled_CN',
       'counterparty_filled_DE', 'counterparty_filled_FR',
       'counterparty_filled_HK', 'counterparty_filled_IN',
       'counterparty_filled_JP', 'counterparty_filled_SG',
       'counterparty_filled_UK', 'counterparty_filled_US', 'customer_type_K', 'customer_type_P',
       'customer_type_R']

# A p-value < 0.05 indicates a significant relation between the two binary variables.
# NOTE(review): 30 tests are run without a multiple-testing correction, so
# some "significant" results may be due to chance.
print_section('p-values for correlation')
for col in bin_cols:
  check_binary_cor(col)

In [None]:
# Binary columns to inspect more closely with cross tables
cols = ['type_DIVIDEND', 'type_INTEREST', 'type_INVESTMENT', 'type_PAYMENT', 
        'counterparty_filled_CN', 'counterparty_filled_FR', 'counterparty_filled_HK',
        'counterparty_filled_JP', 'customer_type_K', 'customer_type_P', 'customer_type_R']
def print_crosstab(cols: List[str]):
  """Print the fraud_flag-versus-column cross table for every column in `cols`.

  Reads the module-level `df`; each table is followed by an empty line.
  """
  for column in cols:
    print(pd.crosstab(index=df['fraud_flag'], columns=df[column]))
    print()
# Show the raw class counts behind the p-values for the selected columns
print_section('Cross tables for binary, correlated variables')
print_crosstab(cols)

# Baseline

In [None]:
import bctools as bc
from tqdm import tqdm

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.utils.class_weight import compute_class_weight

from typing import Tuple

from clearml import OutputModel

In [None]:
# First try: use only the binary features that showed some relation to the
# target in the exploration above
X = df[['type_DIVIDEND', 'type_INTEREST', 'type_INVESTMENT', 'type_PAYMENT', 
        'counterparty_filled_CN', 'counterparty_filled_FR', 'counterparty_filled_HK',
        'counterparty_filled_JP', 'customer_type_K', 'customer_type_P', 'customer_type_R']]
y = df['fraud_flag']


# For better score evaluation just hit this 5/10 or even more times
def build_model_on_binaries(model, X_train: pd.DataFrame, y_train: pd.Series, 
                            threshold: float = None, log: bool = False) -> Tuple[float, float, float]:
  """Evaluate `model` with a shuffled, stratified 5-fold cross-validation.

  Args:
    model: unfitted sklearn-style classifier. It must provide `predict_proba`
      when `threshold` is given or `log` is True.
    X_train: feature matrix, sliced positionally per fold.
    y_train: binary target encoded as 1 = fraudulent, 0 = normal.
    threshold: optional cut-off on the positive-class probability. With None
      the model's own `predict` is used; 0.0 is a valid cut-off.
    log: print per-fold metrics and the confusion matrix and draw a ROC plot.

  Returns:
    (mean train balanced accuracy, mean test balanced accuracy,
    mean test recall) over the five folds.
  """
  pipeline = make_pipeline(model)

  # No random_state: every call draws new folds, which is what allows the
  # notebook to average the scores over repeated calls.
  k_fold = StratifiedKFold(n_splits=5, shuffle=True)

  train_scores = []
  scores = []
  recalls = []

  for k, (train, test) in enumerate(k_fold.split(X_train, y_train), start=1):
    X_fit, y_fit = X_train.iloc[train, :], y_train.iloc[train]
    X_eval, y_eval = X_train.iloc[test, :], y_train.iloc[test].values

    pipeline.fit(X_fit, y_fit)
    train_score = balanced_accuracy_score(y_fit.values, pipeline.predict(X_fit))
    train_scores.append(train_score)

    # Probabilities are needed for thresholding and for the ROC plot only
    y_preds_proba = None
    if threshold is not None or log:
      y_preds_proba = pipeline.predict_proba(X_eval)[:, 1]

    if threshold is not None:
      y_preds = (y_preds_proba >= threshold).astype(int)
    else:
      y_preds = pipeline.predict(X_eval)

    # Fixed labels keep the matrix 2x2 even if a class is absent in a fold
    cm = confusion_matrix(y_eval, y_preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    score = balanced_accuracy_score(y_eval, y_preds)
    scores.append(score)

    # Report 0 instead of dividing by zero when nothing was (or could be) caught
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    recalls.append(recall)
    precision = tp / (tp + fp) if (tp + fp) else 0.0

    if log:
      print(f'Fold: {k}, train_balanced_accuracy: {train_score:.3f},  balanced_accuracy: {score:.3f}, ' +
            f'precision: {precision:.3f}, recall: {recall:.3f}')
      print(cm)
      print()

      bc.curve_ROC_plot(true_y=y_eval, predicted_proba=y_preds_proba)
      print()

  mean_train_score = np.mean(train_scores)
  mean_score = np.mean(scores)
  mean_recall = np.mean(recalls)

  if log:
    print('\n\nbalanced_accuracy: %.3f +/- %.3f' %(mean_score, np.std(scores)))
    print('recall: %.3f +/- %.3f' %(mean_recall, np.std(recalls)))

  return mean_train_score, mean_score, mean_recall

In [None]:
# One logged cross-validation run of the baseline decision tree.
# Setting the threshold to around 0.4 quite obviously boosts our scores.
# Recall is around 0.8 with a balanced accuracy of 0.77; we should try to improve this with some better models.
mean_train_score, mean_score, mean_recall = build_model_on_binaries(DecisionTreeClassifier(max_depth=10, class_weight='balanced'), X, y, threshold=0.4, log=True)

In [None]:
# Collected mean scores of the repeated cross-validation runs
train_scores = []
scores = []
recalls = []

# Repeat the whole cross-validation 30 times (new random folds each time);
# 30 runs are probably enough to get valid scores.
# This is the baseline score that the challenger models should beat.
for i in tqdm(range(30)):
  mean_train_score, mean_score, mean_recall = build_model_on_binaries(DecisionTreeClassifier(max_depth=10, class_weight='balanced'), X, y, threshold=0.4, log=False)
  scores.append(mean_score)
  recalls.append(mean_recall)
  train_scores.append(mean_train_score)
print()
print(f'Mean train balanced_acc: {np.mean(train_scores)}', f'Mean balanced_acc: {np.mean(scores)}', f'Mean recall: {np.mean(recalls)}')

# max_depth=10 is the depth of the last improvement on the train data.

# Challengers

## CatBoost

In [None]:
# Catboost is very nice model to make fast research about best features based on casual model parameters.
# Especially in this kind of classification cases.
# We don't have to scale features or even preprocess, catboost handling everything for us 

import catboost
from catboost import (
    CatBoostClassifier, 
    EShapCalcType, 
    EFeaturesSelectionAlgorithm,
    Pool,
    cv
)

from sklearn.utils import class_weight

In [None]:
# Full feature matrix for CatBoost: everything except the target and the raw
# (unconverted) amount
X = df.drop(columns=['fraud_flag', 'amount'])
y = df['fraud_flag']

# 'balanced' class weights computed from the class frequencies in y
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights

In [None]:
def train_catboost_on_cv():
  """Run CatBoost's built-in stratified 5-fold CV on the module-level X and y.

  Uses the module-level `class_weights` and returns the scores table
  produced by `catboost.cv` (requested with as_pandas=True).
  """
  cv_params = dict(
      iterations=100,
      depth=4,
      class_weights=class_weights,
      loss_function='Logloss',
      custom_metric=['Recall', 'BalancedAccuracy'],
      verbose=False,
  )
  return cv(Pool(data=X, label=y), cv_params, fold_count=5, stratified=True, as_pandas=True)

In [None]:
# Run the built-in CatBoost cross-validation once
scores = train_catboost_on_cv()
# Somehow the scores can only be read well from the logging files :(
# And they seem really bad for now
# scores

In [None]:
def select_features_with_kfold(X_train: pd.DataFrame, y_train: pd.Series, features_to_select: int, steps: int) -> List[Dict]:
  """Run CatBoost's recursive SHAP-based feature selection on 5 stratified folds.

  Args:
    X_train: feature matrix; all of its columns are candidates.
    y_train: binary target.
    features_to_select: number of features to keep in every fold.
    steps: number of elimination steps passed to `select_features`.

  Returns:
    One summary per fold, as returned by `CatBoostClassifier.select_features`.
  """
  k_fold = StratifiedKFold(n_splits=5, shuffle=True)

  # The model parameters are the same for every fold, so build them once
  params = {
    'iterations': 600,
    'auto_class_weights': 'Balanced',
    'loss_function': 'Logloss',
    'custom_metric': ['Recall', 'BalancedAccuracy'],
    'verbose': False,
  }

  summaries = []
  for train, test in k_fold.split(X_train, y_train):
    train_pool = Pool(X_train.iloc[train, :], y_train.iloc[train])
    test_pool = Pool(X_train.iloc[test, :], y_train.iloc[test])

    # A fresh model per fold: the selection must not see another fold's fit
    model = CatBoostClassifier(**params)
    summary = model.select_features(
      train_pool,
      eval_set=test_pool,
      features_for_select=list(range(X_train.shape[1])),
      num_features_to_select=features_to_select,
      steps=steps,
      algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
      shap_calc_type=EShapCalcType.Regular,
    )
    summaries.append(summary)

  return summaries

In [None]:
# Keep 8 features per fold, eliminated in 3 steps.
# 10 features_to_select: not enough boost
# 8 features_to_select: quite okay
summaries = select_features_with_kfold(X, y, 8, 3)

In [None]:
# Feature names chosen in each fold, and the union of them over all folds
selected_features = [fold_summary['selected_features_names'] for fold_summary in summaries]
indiv_selected_features = set().union(*selected_features)

indiv_selected_features

In [None]:
# Same binary feature subset as used for the baseline model.
# NOTE(review): this rebinds the global X from the full feature matrix to
# these 11 binary columns; any later cell that needs other columns has to
# take them from df.
X = df[['type_DIVIDEND', 'type_INTEREST', 'type_INVESTMENT', 'type_PAYMENT', 
        'counterparty_filled_CN', 'counterparty_filled_FR', 'counterparty_filled_HK',
        'counterparty_filled_JP', 'customer_type_K', 'customer_type_P', 'customer_type_R']]
y = df['fraud_flag']


# For better score evaluation just hit this 5/10 or even more times
def build_model_on_binaries(model, X_train: pd.DataFrame, y_train: pd.Series, 
                            threshold: float = None, log: bool = False) -> Tuple[float, float, float]:
  """Evaluate `model` with a shuffled, stratified 5-fold cross-validation.

  Args:
    model: unfitted sklearn-style classifier. It must provide `predict_proba`
      when `threshold` is given or `log` is True.
    X_train: feature matrix, sliced positionally per fold.
    y_train: binary target encoded as 1 = fraudulent, 0 = normal.
    threshold: optional cut-off on the positive-class probability. With None
      the model's own `predict` is used; 0.0 is a valid cut-off.
    log: print per-fold metrics and the confusion matrix and draw a ROC plot.

  Returns:
    (mean train balanced accuracy, mean test balanced accuracy,
    mean test recall) over the five folds.
  """
  pipeline = make_pipeline(model)

  # No random_state: every call draws new folds, which is what allows the
  # notebook to average the scores over repeated calls.
  k_fold = StratifiedKFold(n_splits=5, shuffle=True)

  # Per-call accumulators. They must be local: appending to notebook-level
  # lists would mix the folds of earlier calls into this call's means.
  train_scores = []
  scores = []
  recalls = []

  for k, (train, test) in enumerate(k_fold.split(X_train, y_train), start=1):
    X_fit, y_fit = X_train.iloc[train, :], y_train.iloc[train]
    X_eval, y_eval = X_train.iloc[test, :], y_train.iloc[test].values

    pipeline.fit(X_fit, y_fit)
    train_score = balanced_accuracy_score(y_fit.values, pipeline.predict(X_fit))
    train_scores.append(train_score)

    # Probabilities are needed for thresholding and for the ROC plot only
    y_preds_proba = None
    if threshold is not None or log:
      y_preds_proba = pipeline.predict_proba(X_eval)[:, 1]

    if threshold is not None:
      y_preds = (y_preds_proba >= threshold).astype(int)
    else:
      y_preds = pipeline.predict(X_eval)

    # Fixed labels keep the matrix 2x2 even if a class is absent in a fold
    cm = confusion_matrix(y_eval, y_preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    score = balanced_accuracy_score(y_eval, y_preds)
    scores.append(score)

    # Report 0 instead of dividing by zero when nothing was (or could be) caught
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    recalls.append(recall)
    precision = tp / (tp + fp) if (tp + fp) else 0.0

    if log:
      print(f'Fold: {k}, train_balanced_accuracy: {train_score:.3f},  balanced_accuracy: {score:.3f}, ' +
            f'precision: {precision:.3f}, recall: {recall:.3f}')
      print(cm)
      print()

      bc.curve_ROC_plot(true_y=y_eval, predicted_proba=y_preds_proba)
      print()

  mean_train_score = np.mean(train_scores)
  mean_score = np.mean(scores)
  mean_recall = np.mean(recalls)

  if log:
    print('\n\nbalanced_accuracy: %.3f +/- %.3f' %(mean_score, np.std(scores)))
    print('recall: %.3f +/- %.3f' %(mean_recall, np.std(recalls)))

  return mean_train_score, mean_score, mean_recall

In [None]:
def train_catboost_on_cv(X, y, threshold=None, log=None):
  """Evaluate a CatBoost classifier with a shuffled, stratified 5-fold CV.

  Args:
    X: feature matrix (pandas DataFrame, sliced positionally per fold).
    y: binary target encoded as 1 = fraudulent, 0 = normal.
    threshold: optional cut-off on the positive-class probability. With None
      the model's own `predict` is used; 0.0 is a valid cut-off.
    log: when truthy, print per-fold metrics and the confusion matrix.

  Returns:
    (mean train balanced accuracy, mean test balanced accuracy,
    mean test recall) over the five folds.
  """
  k_fold = StratifiedKFold(n_splits=5, shuffle=True)

  params = {
    'iterations': 500,
    'loss_function': 'Logloss',
    'auto_class_weights': 'Balanced',
    'custom_metric': ['Recall', 'BalancedAccuracy'],
    'verbose': False,
  }

  train_scores = []
  scores = []
  recalls = []

  for k, (train, test) in enumerate(k_fold.split(X, y), start=1):
    X_fit, y_fit = X.iloc[train, :], y.iloc[train]
    X_eval, y_eval = X.iloc[test, :], y.iloc[test]

    # NOTE(review): the evaluation fold is also the eval_set that
    # use_best_model picks the iteration count on, so the test metrics
    # reported below are optimistic. A validation split taken from the
    # training fold would remove this leak.
    model = CatBoostClassifier(**params)
    model.fit(Pool(X_fit, y_fit), eval_set=Pool(X_eval, y_eval), use_best_model=True, verbose=False)

    train_score = balanced_accuracy_score(y_fit.values, model.predict(X_fit))
    train_scores.append(train_score)

    if threshold is not None:
      y_preds = (model.predict_proba(X_eval)[:, 1] >= threshold).astype(int)
    else:
      y_preds = model.predict(X_eval)

    # Fixed labels keep the matrix 2x2 even if a class is absent in a fold
    cm = confusion_matrix(y_eval.values, y_preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    score = balanced_accuracy_score(y_eval.values, y_preds)
    scores.append(score)

    # Report 0 instead of dividing by zero when nothing was (or could be) caught
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    recalls.append(recall)
    precision = tp / (tp + fp) if (tp + fp) else 0.0

    if log:
      print(f'Fold: {k}, train_balanced_accuracy: {train_score:.3f},  balanced_accuracy: {score:.3f}, ' +
            f'precision: {precision:.3f}, recall: {recall:.3f}')
      print(cm)
      print()

  # Aggregate once, after all folds have been scored
  mean_train_score = np.mean(train_scores)
  mean_score = np.mean(scores)
  mean_recall = np.mean(recalls)

  return mean_train_score, mean_score, mean_recall

In [None]:
# quite nice
# The selected features are taken from df as a sorted list: a set is not a
# valid pandas column indexer, and X may have been rebound to a smaller
# feature subset by an earlier cell. Sorting also fixes the column order.
train_catboost_on_cv(df[sorted(indiv_selected_features)], y, log=True, threshold=0.3)

In [None]:
# Collected mean scores of the repeated cross-validation runs
train_scores = []
scores = []
recalls = []

# Build the feature matrix once. It is taken from df as a sorted list: a set
# is not a valid pandas column indexer, and X may have been rebound to a
# smaller feature subset by an earlier cell.
X_selected = df[sorted(indiv_selected_features)]

# 30 repetitions are probably enough to get valid scores
# let's try to beat the baseline score
for i in tqdm(range(30)):
  mean_train_score, mean_score, mean_recall = train_catboost_on_cv(X_selected, y, log=False, threshold=0.26)
  train_scores.append(mean_train_score)
  scores.append(mean_score)
  recalls.append(mean_recall)
print()
print(f'Mean train balanced_acc: {np.mean(train_scores)}', f'Mean balanced_acc: {np.mean(scores)}', f'Mean recall: {np.mean(recalls)}')

# TODOS + Final Notes

In [None]:
# The model can be improved with some hyperparameter changes and a better feature selection.
# Probably some more column dropping and research on SHAP values can improve the final score.

# The final model fitted on the whole dataset should be quite resilient to different and new clients:
# the final dataframe contains no information about client numbers or counterparties.

# We could also check the VIF score for multicollinearity between the variables.

# We should go back to the initial tree and try different predict_proba thresholds.

# The model seems to be overfitting to the training data: there is still a big gap between train_mean_score and test_score.
# Prune estimators, change max_leaf_nodes, min_obs_per_leaf and depth for better generalization.