# Settings

In [1]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [2]:
# Since the project folder is called "crm-vip-data-scientist-take-home-challenge"
# Set this notebook to the project folder as working directory 

import os

current_working_dir = os.path.normpath(os.getcwd()).split(os.path.sep)[-1]

if current_working_dir != 'crm_project':
    os.chdir('..')

In [3]:
import pandas as pd
import sidetable

import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load datasets, specifying the customer IDs as index
df_train = pd.read_csv('data/train_set.csv', index_col='mk_CurrentCustomer')
df_customer_country = pd.read_csv('data/customer_country.csv', index_col='mk_CurrentCustomer')

# Convert date columns to date specifying the format (it's much faster when specifying it)
df_train['ScoreDate'] = pd.to_datetime(df_train['ScoreDate'], format='%d/%m/%Y %H:%M')

# Merging the datasets
# Note: Since there are no duplicated IDs (see next cells), it's safe to merge it here
df = df_train.join(df_customer_country, how='left')

# Show quantity of rows, columns, and first rows
print(df.shape)
display(df.head())

(55415, 270)


Unnamed: 0_level_0,ScoreDate,days_g10,days_g9,days_g8,days_g7,days_g6,days_g5,days_g4,days_g3,days_g2,...,days_since_last_SE_GI,SE_GI_max_datediff,SE_GI_min_datediff,SE_GI_avg_datediff,SE_GI_std_datediff,days_since_last_SE_GI_wrt_max,days_since_last_SE_GI_wrt_min,days_since_last_SE_GI_wrt_avg,days_since_last_SE_GI_wrt_std,country
mk_CurrentCustomer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10871718,2019-08-29,2,0,2,3,3,3,2,5,4,...,15,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France
6818142,2019-08-29,4,3,2,5,3,3,5,4,3,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France
15585115,2019-08-29,4,1,2,4,7,4,1,0,0,...,18,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Spain
12544601,2019-08-29,2,3,7,4,1,2,0,2,0,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Malta
14695611,2019-08-29,5,1,4,4,3,4,4,3,4,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Italy


# EDA

## Quality checks

In [5]:
# Checking duplicated customers on both files (that's how we know it's safe to merge them)
df_train.index.duplicated().any() | df_customer_country.index.duplicated().any()

False

In [6]:
# Checking if amount of rows from the train data remains
df_train.shape[0] == df.shape[0]

True

In [7]:
# Check missing values
df.isna().any().any()

False

In [8]:
# Check target proportion
df.stb.freq(['target'])

Unnamed: 0,target,count,percent,cumulative_count,cumulative_percent
0,0,54474,98.301904,54474,98.301904
1,1,941,1.698096,55415,100.0


In [9]:
df

Unnamed: 0_level_0,ScoreDate,days_g10,days_g9,days_g8,days_g7,days_g6,days_g5,days_g4,days_g3,days_g2,...,days_since_last_SE_GI,SE_GI_max_datediff,SE_GI_min_datediff,SE_GI_avg_datediff,SE_GI_std_datediff,days_since_last_SE_GI_wrt_max,days_since_last_SE_GI_wrt_min,days_since_last_SE_GI_wrt_avg,days_since_last_SE_GI_wrt_std,country
mk_CurrentCustomer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10871718,2019-08-29,2,0,2,3,3,3,2,5,4,...,15,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France
6818142,2019-08-29,4,3,2,5,3,3,5,4,3,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France
15585115,2019-08-29,4,1,2,4,7,4,1,0,0,...,18,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Spain
12544601,2019-08-29,2,3,7,4,1,2,0,2,0,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Malta
14695611,2019-08-29,5,1,4,4,3,4,4,3,4,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Italy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7831140,2019-08-29,0,5,2,0,0,1,3,0,2,...,1690,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Germany
11118841,2019-08-29,2,1,6,4,3,1,3,4,5,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Malta
6332527,2019-08-29,4,2,2,7,3,3,3,3,7,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France
13673326,2019-08-29,6,2,0,0,1,1,0,0,0,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France


In [10]:
# Check columns with constant value (we'll remove them from analysis/modeling)
columns_with_constant = df.agg('nunique').eq(1)
columns_with_constant[columns_with_constant].index.tolist()

['ScoreDate']

In [11]:
df.loc[:,'days_g10':'days_since_last_SE_GI_wrt_std'].corr()

Unnamed: 0,days_g10,days_g9,days_g8,days_g7,days_g6,days_g5,days_g4,days_g3,days_g2,days_g1,...,SE_GI_wrt_days_70days,days_since_last_SE_GI,SE_GI_max_datediff,SE_GI_min_datediff,SE_GI_avg_datediff,SE_GI_std_datediff,days_since_last_SE_GI_wrt_max,days_since_last_SE_GI_wrt_min,days_since_last_SE_GI_wrt_avg,days_since_last_SE_GI_wrt_std
days_g10,1.000000,0.530455,0.443198,0.401873,0.392187,0.361366,0.333157,0.318389,0.317353,0.310968,...,-0.004328,0.033898,0.007936,0.005634,0.007627,0.007321,0.008708,0.011806,0.008266,0.000606
days_g9,0.530455,1.000000,0.600609,0.479880,0.422910,0.413942,0.384667,0.349378,0.334951,0.329037,...,-0.004001,0.036691,0.015020,0.007985,0.012302,0.014193,0.007218,0.013479,0.006660,-0.000736
days_g8,0.443198,0.600609,1.000000,0.619125,0.505641,0.465843,0.444731,0.408479,0.379623,0.361442,...,-0.005154,0.024237,0.004599,0.004740,0.005202,0.003589,0.001749,0.010408,0.001371,0.000369
days_g7,0.401873,0.479880,0.619125,1.000000,0.657863,0.553417,0.502346,0.485176,0.458544,0.407018,...,-0.004799,0.022722,-0.005307,-0.000090,-0.003325,-0.007010,-0.002063,0.008620,-0.001753,0.000254
days_g6,0.392187,0.422910,0.505641,0.657863,1.000000,0.669346,0.554213,0.503202,0.490843,0.455086,...,-0.004539,0.017869,-0.008583,-0.000098,-0.004769,-0.011080,0.000175,0.006994,0.000243,-0.000214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SE_GI_std_datediff,0.007321,0.014193,0.003589,-0.007010,-0.011080,-0.011491,-0.014502,-0.017065,-0.019130,-0.019034,...,0.002762,0.138275,0.777656,0.153023,0.511004,1.000000,0.019398,0.156426,0.029051,0.016177
days_since_last_SE_GI_wrt_max,0.008708,0.007218,0.001749,-0.002063,0.000175,0.002442,0.004471,0.003378,0.006106,0.004793,...,0.000288,0.159405,0.033323,0.031254,0.035072,0.019398,1.000000,0.516934,0.985832,0.360303
days_since_last_SE_GI_wrt_min,0.011806,0.013479,0.010408,0.008620,0.006994,0.007684,0.012424,0.003573,0.007681,0.006214,...,0.000079,0.205520,0.139175,0.011489,0.071790,0.156426,0.516934,1.000000,0.583769,0.258934
days_since_last_SE_GI_wrt_avg,0.008266,0.006660,0.001371,-0.001753,0.000243,0.002476,0.004657,0.002716,0.005750,0.005113,...,0.000224,0.172057,0.040785,0.029805,0.037705,0.029051,0.985832,0.583769,1.000000,0.446196


In [12]:
df

Unnamed: 0_level_0,ScoreDate,days_g10,days_g9,days_g8,days_g7,days_g6,days_g5,days_g4,days_g3,days_g2,...,days_since_last_SE_GI,SE_GI_max_datediff,SE_GI_min_datediff,SE_GI_avg_datediff,SE_GI_std_datediff,days_since_last_SE_GI_wrt_max,days_since_last_SE_GI_wrt_min,days_since_last_SE_GI_wrt_avg,days_since_last_SE_GI_wrt_std,country
mk_CurrentCustomer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10871718,2019-08-29,2,0,2,3,3,3,2,5,4,...,15,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France
6818142,2019-08-29,4,3,2,5,3,3,5,4,3,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France
15585115,2019-08-29,4,1,2,4,7,4,1,0,0,...,18,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Spain
12544601,2019-08-29,2,3,7,4,1,2,0,2,0,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Malta
14695611,2019-08-29,5,1,4,4,3,4,4,3,4,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Italy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7831140,2019-08-29,0,5,2,0,0,1,3,0,2,...,1690,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Germany
11118841,2019-08-29,2,1,6,4,3,1,3,4,5,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,Malta
6332527,2019-08-29,4,2,2,7,3,3,3,3,7,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France
13673326,2019-08-29,6,2,0,0,1,1,0,0,0,...,-1,-1,-1,-1,-1.0,-1,-1,-1,-1.0,France


## Filter data

In [13]:
numeric_features = df.drop(columns=['ScoreDate','country','target']).columns.tolist()
categorical_features = ['country']

In [14]:
df = df[numeric_features+categorical_features+['target']]

## Plots

In [None]:
import numpy as np
from sklearn.feature_selection import mutual_info_classif

In [None]:
def plot_heatmap(df_corr, mask_threshold=None, title=''):
    # Create mask
    mask = np.zeros(df_corr.shape).astype(bool)
    mask[np.triu_indices_from(mask)] = True

    # Show only significant correlations given target threshold
    if mask_threshold:
        mask = mask | (df_corr.abs() < mask_threshold).values

    sns.heatmap(
        df_corr, cmap='RdBu_r', mask=mask, square=False, center=0, vmin=-1, vmax=1, linewidths=1,
        annot=False, fmt=".2f", annot_kws={"fontsize":"small"},
        cbar=True, cbar_kws={'orientation':'vertical', 'shrink': .5}
    )

    plt.title(title, weight='bold')

In [None]:

dfc

In [None]:
dfc = df.filter(regex='days_g|target').corr()
plot_heatmap(dfc, mask_threshold=0.3, title='Number of days with activity during week')

In [None]:
dfc= df.filter(regex='^ro_g|target').corr()
plot_heatmap(dfc, mask_threshold=0.3, title='Number of game rounds during week')

In [None]:
dfc = df.filter(regex='^to_g|target').corr()
plot_heatmap(dfc, mask_threshold=0.3, title='Total turnover amount in EUR during week')

In [None]:
df.filter(regex='^to_g|target')

In [None]:
from sklearn.preprocessing import power_transform

In [None]:
df.filter(regex='^to_g|target').head(2)

In [None]:
power_transform(df[['to_g10']])

In [None]:
(
    pd.DataFrame(power_transform(df.filter(regex='^to_g')), index=df.filter(regex='^to_g').index, columns=df.filter(regex='^to_g').columns)
    .boxplot()
)

In [None]:
df.filter(regex='^to_g|target').apply(lambda x: power_transform(x).reshape(-1, 1))

In [None]:
dfc = df.filter(like='days_g').corr()

mask = np.zeros(dfc.shape).astype(bool)
mask[np.triu_indices_from(mask)] = True

# Show only significant correlations given target threshold
mask = mask | (dfc.abs() < 0.3).values

In [None]:
sns.heatmap(
    dfc, cmap='RdBu_r', mask=mask, square=False, center=0, vmin=-1, vmax=1, linewidths=1,
    annot=False, fmt=".2f", annot_kws={"fontsize":"small"},
    cbar=True, cbar_kws={'orientation':'vertical', 'shrink': .5}
)

In [None]:
numeric_features

In [None]:
df_plot = df.groupby('target')['days_g10'].value_counts(normalize=True).to_frame('pct').reset_index()
df_plot

In [None]:
sns.barplot(data=df_plot, x='target', y='pct', hue='days_g10')

In [None]:
sns.barplot(data=df_plot, x='days_g10', y='pct', hue='target')

In [None]:
numeric_features = df

In [None]:
mi = mutual_info_classif(X[NUMERICAL_FEATURES], y)

In [None]:
pd.Series(mi, index=NUMERICAL_FEATURES).sort_values(ascending=False).head(30)

In [None]:
import numpy as np

In [None]:
df

In [None]:
# Correlation
dfc = df.loc[:,'days_g10':'days_since_last_SE_GI_wrt_std'].corr(method='spearman')

# Plot
plt.figure(figsize=(21,16))

mask = np.zeros(dfc.shape).astype(bool)
mask[np.triu_indices_from(mask)] = True

# Show only significant correlations given target threshold
mask = mask | (dfc.abs() < 0.9).values

ax = sns.heatmap(
    dfc, cmap='RdBu_r', mask=mask, square=False, center=0, vmin=-1, vmax=1, linewidths=1,
    annot=False, fmt=".2f", annot_kws={"fontsize":"small"},
    cbar=True, cbar_kws={'orientation':'vertical', 'shrink': .5}
)

# ax.set_xticks(np.arange(dfc.shape[1]) + 0.5, minor=False)
# ax.set_yticks(np.arange(dfc.shape[0]) + 0.5, minor=False)

# ax.set_xticklabels(ax.get_xticklabels())
# ax.tick_params(axis='both', which='both', labelsize=8)

plt.xticks(fontsize='small')
plt.yticks(fontsize='small') 

plt.show()

In [None]:
df.columns

In [None]:
sns.displot(data=df, x='days_since_last_SE_GI', col='target', facet_kws=dict(sharey=False), kde=True)

In [None]:
a

# Preprocessing

In [20]:
df_model = df.copy()

In [21]:
TARGET = 'target'

NUMERICAL_FEATURES = df_model.drop(columns=TARGET).select_dtypes('number').columns.tolist()
CATEGORICAL_FEATURES = df_model.drop(columns=TARGET).select_dtypes(exclude = 'number').columns.tolist()
FEATURES = NUMERICAL_FEATURES+CATEGORICAL_FEATURES

In [23]:
X = df_model[FEATURES]
y = df_model[TARGET]

## Feature selection

In [None]:
from boruta import BorutaPy

# Modeling

In [94]:
from src.utils import clf_metrics

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, PowerTransformer, StandardScaler, RobustScaler

from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight

from lightgbm import LGBMClassifier

## Split data

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [92]:
preprocessor = ColumnTransformer([
    ('num', make_pipeline(PowerTransformer(), PCA(n_components=.99)), NUMERICAL_FEATURES),
    ('cat', OneHotEncoder(drop='if_binary'), CATEGORICAL_FEATURES)
])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', CatBoostClassifier(task_type='GPU', devices='0', random_state=42))
])

In [None]:
model_pipeline.fit(X_train, y_train)

In [None]:
model_name = 'lr'
model = LogisticRegression(max_iter=10_000, class_weight='balanced')
# model = CatBoostClassifier()

In [95]:
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
print(class_weights)

{0: 0.5087211970990544, 1: 29.16578947368421}


In [100]:
y_train.value_counts(normalize=True)

target
0    0.982857
1    0.017143
Name: proportion, dtype: float64

In [98]:
43572 / 760

57.33157894736842

In [101]:
threshold = 0.5

models = dict(
    lr = LogisticRegression(max_iter=10_000, class_weight='balanced'),
    # rf = RandomForestClassifier(class_weight='balanced', random_state=42),
    lgbm = LGBMClassifier(random_state=42),
    lgbm_scaled = LGBMClassifier(scale_pos_weight=5, random_state=42),
    catboost = CatBoostClassifier(silent=True, task_type='GPU', random_state=42),
    catboost_weighted = CatBoostClassifier(class_weights=[0.02,0.98], silent=True, task_type='GPU', random_state=42),
    catboost_weighted_custom = CatBoostClassifier(class_weights=class_weights, silent=True, task_type='GPU', random_state=42),
)

In [102]:
preprocessor = ColumnTransformer([
    ('num', make_pipeline(PowerTransformer(), PCA(n_components=20)), NUMERICAL_FEATURES),
    ('cat', OneHotEncoder(drop='if_binary'), CATEGORICAL_FEATURES)
])

In [103]:
results = []
probability_results = {}

for i,(model_name,model) in enumerate(models.items(), start=1):
    print(f'{i}/{len(models)} - {model_name}')
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    model_pipeline.fit(X_train, y_train)

    y_pred_proba = model_pipeline.predict_proba(X_test)
    y_pred = y_pred_proba[:,1]>=threshold

    probability_results[model_name] = y_pred_proba

    result = clf_metrics(y_test, y_pred, y_pred_proba[:,1])
    result['model'] = model_name
    results.append(result)

1/6 - lr
2/6 - lgbm
[LightGBM] [Info] Number of positive: 760, number of negative: 43572
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5114
[LightGBM] [Info] Number of data points in the train set: 44332, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.017143 -> initscore=-4.048852
[LightGBM] [Info] Start training from score -4.048852
3/6 - lgbm_scaled
[LightGBM] [Info] Number of positive: 760, number of negative: 43572
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5114
[LightGBM] [Info] Number of data points in the train set: 44332, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.017143 -> initscore=-4.048852
[LightGBM] [Info] Start training

In [104]:
pd.DataFrame(results).set_index('model').sort_values('Recall', ascending=False)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,Recall,Precison,F1,ROC_AUC
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lr,0.835424,0.723469,0.607735,0.059045,0.107632,0.797054
catboost_weighted_custom,0.875395,0.719338,0.558011,0.07204,0.127606,0.783952
catboost_weighted,0.913381,0.695181,0.469613,0.089568,0.150442,0.795752
lgbm_scaled,0.97248,0.537778,0.088398,0.102564,0.094955,0.767579
lgbm,0.982496,0.504837,0.01105,0.117647,0.020202,0.795623
catboost,0.983488,0.499908,0.0,0.0,0.0,0.809699


In [118]:
threshold = .25
metric_results = []

for model_name in models:
    y_pred_proba = probability_results[model_name][:,1]
    y_pred = y_pred_proba >= threshold
    metric_result = clf_metrics(y_test, y_pred, y_pred_proba)
    metric_result['model'] = model_name
    metric_results.append(metric_result)

pd.DataFrame(metric_results).set_index('model').sort_values('Recall', ascending=False)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,Recall,Precison,F1,ROC_AUC
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lr,0.46648,0.663614,0.867403,0.025968,0.050426,0.797054
catboost_weighted_custom,0.605161,0.704223,0.80663,0.032538,0.062554,0.783952
catboost_weighted,0.799874,0.716266,0.629834,0.050331,0.093213,0.795752
lgbm_scaled,0.956059,0.594629,0.220994,0.103627,0.141093,0.767579
lgbm,0.978345,0.529893,0.066298,0.144578,0.090909,0.795623
catboost,0.98033,0.514602,0.033149,0.122449,0.052174,0.809699


## Hyper-parameter tuning

In [127]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    # PowerTransformer or None
    use_power_transform = trial.suggest_categorical('use_power_transform', [True, False])
    
    # With or without PCA
    use_pca = trial.suggest_categorical('use_pca', [True, False])
    n_components = trial.suggest_float('n_components', 0.90, 0.99) if use_pca else None
    
    # CatBoost parameters
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    depth = trial.suggest_int('depth', 4, 10)
    
    # Numerical Preprocessor Pipeline
    num_pipeline_steps = []
    if use_power_transform:
        num_pipeline_steps.append(('power_transform', PowerTransformer()))
    if use_pca:
        num_pipeline_steps.append(('pca', PCA(n_components=n_components)))
    
    num_preprocessor = make_pipeline(*num_pipeline_steps) if num_pipeline_steps else 'passthrough'
    
    # ColumnTransformer for preprocessing
    preprocessor = ColumnTransformer([
        ('num', num_preprocessor, NUMERICAL_FEATURES),
        ('cat', OneHotEncoder(drop='if_binary'), CATEGORICAL_FEATURES)
    ])
    
    # Pipeline
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', CatBoostClassifier(
            task_type='GPU', devices='0:1', random_state=42,
            learning_rate=learning_rate, depth=depth, verbose=False,
            border_count=128, early_stopping_rounds=50
            )
        )
    ])
    
    # Cross-validation score
    score = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='recall', n_jobs=-1).mean()
    
    return score

In [130]:
study = optuna.create_study(direction='maximize', study_name='distributed_catboost', storage='sqlite:///example.db', load_if_exists=True)
study.optimize(objective, n_trials=2)  # Adjust n_trials to your preference

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-04-11 04:32:09,877] Using an existing study with name 'distributed_catboost' instead of creating a new one.


[W 2024-04-11 04:43:12,700] Trial 2 failed with parameters: {'use_power_transform': False, 'use_pca': False, 'learning_rate': 0.28843423863234885, 'depth': 6} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/cmcouto-silva/miniconda3/envs/crm38/lib/python3.8/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_183994/3638329595.py", line 43, in objective
    score = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='recall', n_jobs=-1).mean()
  File "/home/cmcouto-silva/miniconda3/envs/crm38/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 562, in cross_val_score
    cv_results = cross_validate(
  File "/home/cmcouto-silva/miniconda3/envs/crm38/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 214, in wrapper
    return func(*args, **kwargs)
  File "/home/cmcouto-silva/miniconda3/envs/crm38/lib/python3.8/sit

KeyboardInterrupt: 

In [86]:
# Parameter space to search
search_spaces = {
    'preprocessor__num': Categorical([StandardScaler(), RobustScaler(), PowerTransformer()]),
    'model__depth': Integer(4, 16),
    'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
}

In [87]:
# BayesSearchCV
opt = BayesSearchCV(
    estimator=model_pipeline,
    search_spaces=search_spaces,
    n_iter=2,  # Number of parameter settings sampled. Increase this for better results.
    scoring='recall',  # Change or add scoring metrics as needed
    cv=4,  # Cross-validation splitting strategy
    n_jobs=-1,  # Number of jobs to run in parallel
    return_train_score=True,
    random_state=42, 
)

In [88]:
opt.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Results
print(f"Best score: {opt.best_score_}")
print("Best parameters found:")
print(opt.best_params_)

In [None]:
from sklearn.metrics import PrecisionRecallDisplay

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [None]:
fig,ax = plt.subplots()

PrecisionRecallDisplay.from_predictions(
    y_test, y_pred, name="Catboost", ax=ax
)

ax.legend(loc='upper right')

ax.set_title('hey')

In [None]:
y_pred_proba = model_pipeline.predict_proba(X_test)
y_pred = y_pred_proba[:,1]>=threshold

probability_results[model_name] = y_pred_proba

In [None]:
result = clf_metrics(y_test, y_pred)
result['model'] = model_name
result

In [None]:
y_pred = model_pipeline.predict_proba(X_test)[:,1]>=0.3
clf_metrics(y_test, y_pred)

In [None]:
pca = PCA(n_components=.99)
pca.fit(X[NUMERICAL_FEATURES])

df_pca = pd.DataFrame(
    {
        'var': pca.explained_variance_ratio_,
        'cum_var': pca.explained_variance_ratio_.cumsum()
    },
    index=[f'PC{i+1}' for i in range(len(pca.explained_variance_ratio_))]
)

df_pca

In [None]:
# preprocessor = ColumnTransformer([
#     ('num', 'passthrough', NUMERICAL_FEATURES),
#     ('cat', OneHotEncoder(drop='if_binary'), CATEGORICAL_FEATURES)
# ])

# model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', RandomForestClassifier(class_weight='balanced'))
# ])

preprocessor = ColumnTransformer([
    # ('num', PCA(n_components=.99), NUMERICAL_FEATURES),
    ('num', StandardScaler(), NUMERICAL_FEATURES),
    ('cat', OneHotEncoder(drop='if_binary'), CATEGORICAL_FEATURES)
])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=10_000, class_weight='balanced'))
])

In [None]:
model_pipeline.fit(X_train, y_train)

In [None]:
y_pred = model_pipeline.predict_proba(X_test)[:,1]>=0.3
clf_metrics(y_test, y_pred)

In [None]:
feature_names = model_pipeline[0].get_feature_names_out()
feature_importances = model_pipeline[1].feature_importances_

In [None]:
pd.Series(feature_importances, index=feature_names).sort_values().plot.barh(figsize=(8,42))