In [1]:
import numpy as np
import pandas as pd
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Switch on seaborn's default styling for every matplotlib figure drawn below.
sns.set()

In [None]:
# Load the dataset from a path relative to the notebook's working directory;
# the first CSV column is used as the initial row index.
data = pd.read_csv("data/data.csv", index_col=0)

In [None]:
# Re-index the frame by its 'name' column.
# The guard keeps this cell idempotent: after the first run 'name' is the index
# and no longer a column, so an unguarded re-run would raise KeyError. If 'name'
# is neither the index nor a column, set_index still raises as before.
if data.index.name != 'name':
    data = data.set_index('name')

In [None]:
# Separate the label column from the feature columns.
y = data['target']
X = data.drop(columns=['target'])

In [None]:
# Empty container for fitted classifiers (the training cell resets it as well).
clfs = list()

In [None]:
# Stratified k-fold cross-validation of a CatBoost classifier.
#
# Reads:    X (features) and y (labels) defined in an earlier cell.
# Produces: fprs / tprs / aucs  - per-fold ROC data (used by the plotting cell)
#           f1_arr / precision_arr / recall_arr - per-fold metrics
#           clfs - exactly one fitted classifier per fold
#           clf  - the classifier of the last fold (used by the final save cell)
# Side effects: writes '<fold>_clf.cbm' for every fold into the working directory.

# Fixed seed so fold assignment and CatBoost training are repeatable across
# "Restart Kernel -> Run All".
SEED = 42

# Hyper-parameters shared by every fold's model.
params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 7,
    'eval_metric': 'F1',
    'random_seed': SEED,
}

tprs = []
fprs = []
aucs = []
recall_arr = np.array([])
f1_arr = np.array([])
precision_arr = np.array([])
clfs = []
# NOTE(review): mean_fpr is not used by any cell visible here; kept in case
# another cell relies on it.
mean_fpr = np.linspace(0, 1, 100)
n_fold = 5
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=SEED)

for fold, (train_index, val_index) in enumerate(cv.split(X, y)):
    X_train_fold, y_train_fold = X.iloc[train_index], y.iloc[train_index]
    X_val_fold, y_val_fold = X.iloc[val_index], y.iloc[val_index]

    train_dataset = Pool(data=X_train_fold, label=y_train_fold)
    # Built once and reused for both predict() and predict_proba().
    eval_dataset = Pool(data=X_val_fold, label=y_val_fold)

    clf = CatBoostClassifier(**params)
    # The validation fold is intentionally NOT passed as eval_set, so it cannot
    # influence training and the scores below remain out-of-fold estimates.
    # verbose=100 logs every 100th iteration instead of all 1000 per fold.
    clf.fit(train_dataset, verbose=100)

    clf.save_model(f'{fold}_clf.cbm')
    # Append exactly once per fold (previously each model was appended twice).
    clfs.append(clf)

    y_pred = clf.predict(eval_dataset)
    y_pred_proba = clf.predict_proba(eval_dataset)

    # ROC curve from the predicted probability of the positive class (column 1).
    fpr, tpr, _ = roc_curve(y_val_fold, y_pred_proba[:, 1])
    tprs.append(tpr)
    fprs.append(fpr)
    aucs.append(auc(fpr, tpr))

    # Compute each metric once, then both store and report it.
    fold_f1 = sklearn.metrics.f1_score(y_val_fold, y_pred)
    fold_precision = sklearn.metrics.precision_score(y_val_fold, y_pred)
    fold_recall = sklearn.metrics.recall_score(y_val_fold, y_pred)

    recall_arr = np.append(recall_arr, fold_recall)
    f1_arr = np.append(f1_arr, fold_f1)
    precision_arr = np.append(precision_arr, fold_precision)

    print(f"f1: {fold_f1}")
    print(f"precision: {fold_precision}")
    print(f"recall: {fold_recall}")

# Averages over all folds.
print(f"f1: {round(f1_arr.mean(),3)}")
print(f"precision: {round(precision_arr.mean(),3)}")
print(f"recall: {round(recall_arr.mean(),3)}")

In [None]:
# Plot the ROC curve of every cross-validation fold on one figure and save it.
plt.figure(figsize=(10, 10), dpi=90)

# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')

# One semi-transparent curve per fold, labelled with its 1-based fold number and AUC.
for fold_no, (fold_fpr, fold_tpr, fold_auc) in enumerate(zip(fprs, tprs, aucs), start=1):
    curve_label = 'ROC fold %d (AUC = %0.5f)' % (fold_no, fold_auc)
    plt.plot(fold_fpr, fold_tpr, lw=2, alpha=0.3, label=curve_label)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(' ROC')
plt.legend(loc="lower right")
plt.savefig("5s_catboost_roc.png")

In [None]:
# Persist the classifier left in `clf` by the cross-validation loop, i.e. the
# model trained on the final fold only (not an ensemble, not a refit on all data).
# NOTE(review): confirm that shipping the last-fold model is intended.
clf.save_model('window_detection.cbm')