In [2]:
import pandas as pd
import numpy as np
import catboost
import optuna
import pickle
import os
from scipy import sparse

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [4]:
from catboost import CatBoostClassifier, Pool
from optuna.integration import CatBoostPruningCallback
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [33]:
import os
import matplotlib.pyplot as plt#visualization
%matplotlib inline
import seaborn as sns#visualization
import plotly.offline as py #visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot

In [6]:
# Feature matrices stored as SciPy sparse .npz files
# (presumably TF-IDF features after min-max scaling, judging by the file names — TODO confirm)
X_train_tfidf_minmax = sparse.load_npz("../data/X_train_tfidf_minmax.npz")
X_test_tfidf_minmax = sparse.load_npz("../data/X_test_tfidf_minmax.npz")
# Second test matrix ("_f"): predictions made on it are what the final cells
# write to the Kaggle submission file
X_test_tfidf_minmax_f = sparse.load_npz("../data/X_test_tfidf_minmax_f.npz")
# Label arrays used together with the train / test matrices above
y_train_np = np.load('../data/y_train.npy')
y_test_np = np.load('../data/y_test.npy')

In [7]:
### SVD
# Load the previously saved SVD transformer. The context manager guarantees
# the file handle is closed (the bare open() call left it open).
# NOTE: pickle.load can execute arbitrary code — only load pickles from a trusted source.
with open('../classical_ML/TruncatedSVD_500.pickle', 'rb') as svd_file:
    svd = pickle.load(svd_file)

In [8]:
# Project every feature matrix through the loaded SVD transformer, in the
# same order as before: train, test, "_f" test.
X_train_svd, X_test_svd, X_test_svd_f = (
    svd.transform(feature_matrix)
    for feature_matrix in (
        X_train_tfidf_minmax,
        X_test_tfidf_minmax,
        X_test_tfidf_minmax_f,
    )
)

In [9]:
# Sanity check: one shape per line (train, test, "_f" test)
print(X_train_svd.shape, X_test_svd.shape, X_test_svd_f.shape, sep="\n")

(5329, 500)
(2284, 500)
(3263, 500)


In [10]:
def objective(trial):
    """Optuna objective: fit one CatBoost model for ``trial`` and return its F1 score.

    Hyper-parameters are drawn from ``trial``. The model is fitted on the
    notebook-level ``X_train_svd`` / ``y_train_np`` and evaluated on
    ``X_test_svd`` / ``y_test_np``. The fitted model is attached to the trial
    under the user attribute ``"best_booster"`` and is also left in the
    notebook-level name ``gbm``.

    Parameters
    ----------
    trial : the Optuna trial supplying the hyper-parameter suggestions.

    Returns
    -------
    The F1 score of the fitted model's predictions on the evaluation set.
    """
    global gbm

    fit_features, fit_labels = X_train_svd, y_train_np
    holdout_features, holdout_labels = X_test_svd, y_test_np

    # Keyword arguments are evaluated top to bottom, so the trial is asked for
    # its suggestions in exactly the order the keys are listed here.
    param = dict(
        objective=trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        depth=trial.suggest_int("depth", 1, 6),
        boosting_type=trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        bootstrap_type=trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        eval_metric="F1",
    )

    # Two of the three bootstrap types get one additional tunable parameter.
    chosen_bootstrap = param["bootstrap_type"]
    if chosen_bootstrap == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif chosen_bootstrap == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = CatBoostClassifier(**param)
    pruning_callback = CatBoostPruningCallback(trial, "F1")

    gbm.fit(
        fit_features,
        fit_labels,
        eval_set=[(holdout_features, holdout_labels)],
        callbacks=[pruning_callback],
        early_stopping_rounds=100,
        verbose=0,
    )

    # Keep the fitted model on the trial so it can be retrieved after the study.
    trial.set_user_attr(key="best_booster", value=gbm)
    # evoke pruning manually.
    pruning_callback.check_pruned()

    return f1_score(holdout_labels, gbm.predict(holdout_features))

In [16]:
def callback(study, trial):
    """Copy the model of the current best trial onto the study.

    Called with the study and a trial; when that trial is the study's best
    trial, its ``"best_booster"`` user attribute is stored on the study under
    the same key. Otherwise nothing happens.
    """
    is_current_best = study.best_trial.number == trial.number
    if not is_current_best:
        return
    booster = trial.user_attrs["best_booster"]
    study.set_user_attr(key="best_booster", value=booster)

In [17]:
# Maximise the objective's F1 score; the median pruner is configured with
# 5 warm-up steps.
study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
# Limits passed to optimize: 150 trials and a 600 s timeout. `callback`
# stores the best trial's model on the study.
study.optimize(
    objective,
    n_trials=150,
    timeout=600,
    callbacks=[callback],
)

[32m[I 2022-09-13 02:14:42,962][0m A new study created in memory with name: no-name-edd22a6e-5ae8-419a-9868-75d1e1d781e8[0m

CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.

[32m[I 2022-09-13 02:14:49,320][0m Trial 0 finished with value: 0.6862527716186253 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.015979932738294494, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.10588913580709412}. Best is trial 0 with value: 0.6862527716186253.[0m

CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.

[32m[I 2022-09-13 02:15:02,043][0m Trial 1 finished with value: 0.7176470588235294 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.09501069003391607, 'depth': 2, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.3037408364617398}. Best is trial 1 with value: 0.7176470588235294.

[32m[I 2022-09-13 02:17:41,772][0m Trial 24 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2022-09-13 02:17:42,986][0m Trial 25 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2022-09-13 02:17:53,741][0m Trial 26 finished with value: 0.7304159913560238 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.08411934216733007, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 13 with value: 0.7421584263689527.[0m

CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.

[32m[I 2022-09-13 02:17:56,088][0m Trial 27 pruned. Trial was pruned at iteration 107.[0m
[32m[I 2022-09-13 02:17:57,564][0m Trial 28 pruned. Trial was pruned at iteration 9.[0m
[32m[I 2022-09-13 02:17:58,836][0m Trial 29 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2022-09-13 02:18:00,800][0m Trial 30 pruned. Trial was pruned at iteration 77.[0m
[32m[I 2022-09-13 02:18:11,760][0m Trial 31 finished wit

[32m[I 2022-09-13 02:20:32,943][0m Trial 84 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2022-09-13 02:20:34,207][0m Trial 85 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2022-09-13 02:20:35,722][0m Trial 86 pruned. Trial was pruned at iteration 43.[0m
[32m[I 2022-09-13 02:20:37,037][0m Trial 87 pruned. Trial was pruned at iteration 6.[0m
[32m[I 2022-09-13 02:20:38,394][0m Trial 88 pruned. Trial was pruned at iteration 7.[0m
[32m[I 2022-09-13 02:20:47,208][0m Trial 89 finished with value: 0.7335865436787846 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.06996399833762343, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 13 with value: 0.7421584263689527.[0m

CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.

[32m[I 2022-09-13 02:20:48,292][0m Trial 90 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2022-09-13 02:20:49,589][0m Trial 91 pruned. Trial 

In [18]:
# Model stored on the study by `callback` for the best trial
best_model = study.user_attrs["best_booster"]
print(best_model.get_params())

{'depth': 5, 'eval_metric': 'F1', 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'colsample_bylevel': 0.05228322690351833, 'objective': 'Logloss'}


In [20]:
# Predict once and reuse the labels for both metrics instead of running the
# model over X_test_svd twice.
test_pred = best_model.predict(X_test_svd)
print('F1 score: ', f1_score(y_test_np, test_pred))
print('Accuracy: ', accuracy_score(y_test_np, test_pred))

F1 score:  0.7421584263689527
Accuracy:  0.7876532399299475


In [21]:
# Table with one row per model feature: its name and its importance
df_features_importance = pd.DataFrame(dict(
    feature_names=best_model.feature_names_,
    feature_importances=best_model.feature_importances_,
))

In [22]:
# Show the ten most important features. A single feature dominating all the
# others could indicate a leak.
df_features_importance.sort_values('feature_importances', ascending=False).head(10)

Unnamed: 0,feature_names,feature_importances
1,1,5.759003
0,0,2.764169
22,22,2.502969
8,8,2.070422
18,18,1.994141
4,4,1.576262
9,9,1.371125
77,77,1.134851
2,2,1.047098
12,12,1.028431


In [24]:
# Per-class probabilities for X_test_svd; later cells take column 1 as the
# score that is thresholded into the positive label.
y_predict_prob = best_model.predict_proba(X_test_svd)

In [25]:
## Chart of precision and recall for quantiles
def get_scores(y_true, y_pred, arange = np.arange(1, 100, 1), mode = True ):
    """Plot precision and recall over a sweep of score cut-offs and return the curves.

    For every percentile rank in ``arange`` the matching percentile of
    ``y_pred`` is used as a cut-off; samples scoring strictly above the
    cut-off are labelled 1, all others 0.

    Parameters
    ----------
    y_true : array-like of ground-truth labels.
    y_pred : array-like of scores, one per sample.
    arange : percentile ranks (0-100) to evaluate; defaults to 1..99.
    mode : ``'prob'`` puts the cut-off values on the x-axis; any other value
        (including the default ``True``) puts the percentile ranks there.

    Returns
    -------
    (perc_lst, prec_lst, recall_lst) : three lists aligned with ``arange``
        holding the cut-offs, the precision and the recall at each cut-off.
    """
    scores = np.asarray(y_pred)
    perc_lst = [np.percentile(scores, rank) for rank in arange]

    # Build the predicted labels once per cut-off and reuse them for both metrics.
    prec_lst, recall_lst = [], []
    for cutoff in perc_lst:
        labels = (scores > cutoff).astype(int)
        prec_lst.append(precision_score(y_true, labels))
        recall_lst.append(recall_score(y_true, labels))

    # The two plot variants differ only in the x values and the x-axis title.
    if mode == 'prob':
        x_values, x_title = perc_lst, 'probability_threshold'
    else:
        x_values, x_title = arange, 'quantile_threshold'

    traces = [
        go.Scatter(x = x_values, y = prec_lst, name = 'precision'),
        go.Scatter(x = x_values, y = recall_lst, name = 'recall'),
    ]
    layout = go.Layout(
        title = 'different metrics',
        xaxis = dict(title = x_title),
        yaxis = dict(title = 'Scores'),
    )
    iplot(go.Figure(data = traces, layout = layout))

    return perc_lst, prec_lst, recall_lst

In [26]:
#functions for searching best threshold
def to_labels(pos_probs, threshold):
    """Turn scores into 0/1 labels: 1 where the score is >= ``threshold``.

    Parameters
    ----------
    pos_probs : scores to threshold. Plain lists and tuples are converted to
        a NumPy array first; any other input is compared as-is.
    threshold : cut-off; scores equal to it are labelled 1.

    Returns
    -------
    The integer 0/1 labels produced by ``(pos_probs >= threshold).astype('int')``.
    """
    # Built-in sequences do not support element-wise comparison with a scalar.
    if isinstance(pos_probs, (list, tuple)):
        pos_probs = np.asarray(pos_probs)
    return (pos_probs >= threshold).astype('int')

def best_tresholds(y_pred_proba, y_test):
    """Search a fixed grid for the threshold that gives the highest F1 score.

    Parameters
    ----------
    y_pred_proba : scores that ``to_labels`` turns into 0/1 labels.
    y_test : ground-truth labels the predictions are scored against.

    Returns
    -------
    (threshold, score) : the first grid threshold reaching the maximum F1
        score, and that F1 score.
    """
    # Grid 0.000, 0.001, ..., 0.999. Dividing integers by 1000 gives each value
    # as the nearest float to k/1000, whereas np.arange(0, 1, 0.001) yields
    # drifted values such as 0.47200000000000003.
    thresholds = np.arange(1000) / 1000
    # evaluate each threshold
    scores = [f1_score(y_test, to_labels(y_pred_proba, t)) for t in thresholds]
    # np.argmax returns the first maximum, so ties resolve to the lowest threshold
    ix = np.argmax(scores)
    return thresholds[ix], scores[ix]

In [28]:
## Let's find the best threshold (the call returns the threshold and its F1 score)
best_tresholds(y_predict_prob[:,1], y_test_np)

(0.47200000000000003, 0.7437759336099585)

In [42]:
## We can also check the threshold on the chart.
# The point where the precision and recall curves cross is close to the best threshold.
# mode='prob' puts the threshold values (not the percentile ranks) on the x-axis.
perc_lst, prec_lst, recall_lst = get_scores(y_test_np, y_predict_prob[:,1], mode='prob')

In [36]:
# Hand-set cut-off. NOTE(review): no later cell in view reads this name (the
# submission cell thresholds at .5) — confirm whether it is still needed.
best_treshold=0.453
# Column 1 of predict_proba for the "_f" test set; thresholded below to build
# the submission target.
y_val = best_model.predict_proba(X_test_svd_f)[:,1]

In [40]:
### for kaggle
df_test_valid = pd.read_csv("../data/test.csv")
# Not used by the lines below; kept so the name stays defined.
y_predict_val = best_model.predict(X_test_svd_f)
# .copy() makes df_t an independent frame, so adding a column neither raises
# SettingWithCopyWarning nor depends on view/copy semantics of the selection.
df_t = df_test_valid[['id']].copy()
# The submission uses a fixed .5 cut-off on the scores in y_val.
df_t['target'] = (y_val >= .5).astype(int)
df_t.to_csv("catboost_4.csv", index=False)

In [41]:
# Summary statistics of the submission frame (columns: id, target)
df_t.describe()

Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.356727
std,3146.427221,0.479107
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,0.0
75%,8176.0,1.0
max,10875.0,1.0


In [None]:
### with threshold = .5 the result on Kaggle is better =_=