In [3]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Tell the interpreter not to write .pyc bytecode files when importing modules.
sys.dont_write_bytecode = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
#imports
import os
import yaml
import warnings

import pandas as pd
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,roc_auc_score
from catboost import CatBoostClassifier

from callables import process_dataset,find_max_fscore

# MLflow connection settings: the S3 endpoint URL and the tracking server URI
# are passed to MLflow through environment variables.
os.environ.update({
    "MLFLOW_S3_ENDPOINT_URL": "https://hb.bizmrg.com",
    "MLFLOW_TRACKING_URI": "http://89.208.220.90:8000/",
})

# All runs started in this notebook are recorded under this experiment.
mlflow.set_experiment(experiment_name="trading_model_anomaly_criterion")

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

In [5]:
#params
# Read the run configuration from params.yml (text read mode is open()'s default).
with open("params.yml") as file:
    params = yaml.safe_load(file)

# Settings taken from the configuration file.
root_path = params['root_path']
shift     = params['shift']
cb_params = params['cb_params']

# Anomaly-criterion values to sweep over; hard-coded here rather than read from params.yml.
anomaly_crtiretion_values = [1.0030, 1.0040, 1.0050]

In [6]:
#callables
# Sweep over the anomaly-criterion values: for each one the dataset is rebuilt,
# a CatBoost classifier is trained, and the parameters and metrics are logged
# as a separate MLflow run.

#read
# The features file does not depend on the loop variable, so it is read once.
features_raw = pd.read_csv(root_path + 'data/features.csv', index_col = 0)

for anomaly_crtiretion in anomaly_crtiretion_values:

    # The context manager ends the run even if a step below raises (the run is
    # then marked as failed), so an error cannot leave an active run behind
    # that would make the next start_run() fail when the cell is re-executed.
    with mlflow.start_run():

        #________________________________________________________________________

        #process
        # Each iteration works on its own copy in case process_dataset modifies
        # the frame it receives.
        df = features_raw.copy()
        df_event, df_proc = process_dataset(df, shift, anomaly_crtiretion)

        #training dataset
        data = df_proc.drop(['t_start', 't_end'], axis = 1)

        #________________________________________________________________________

        #split
        # shuffle = False keeps the row order: the leading 80% of rows form the
        # training part and the trailing 20% the test part.
        train, test = train_test_split(data, test_size = 0.2, random_state = 42, shuffle = False)

        x_train = train.drop('target', axis = 1)
        y_train = train.target.astype(int)

        x_test = test.drop('target', axis = 1)
        y_test = test.target.astype(int)

        #fit best model
        model = CatBoostClassifier(**cb_params)
        model.fit(x_train, y_train)

        # predict train probabilities (column 1 = probability of the positive class)
        y_train_pred_proba = model.predict_proba(x_train)
        y_train_pred_proba = y_train_pred_proba[:, 1]

        # predict test probabilities
        y_test_pred_proba = model.predict_proba(x_test)
        y_test_pred_proba = y_test_pred_proba[:, 1]

        #plot_find_max_fscore
        # NOTE(review): find_max_fscore comes from the project module `callables`;
        # presumably it returns the probability cutoff with the highest F-score
        # - confirm. The cutoff is derived from the test labels and then used for
        # f_score_test below, so that metric is likely optimistic.
        opt_cutoff = find_max_fscore(y_test, y_test_pred_proba)

        #calculate metrics
        # Gini coefficient = 2 * ROC AUC - 1
        gini_train = 2 * roc_auc_score(y_train, y_train_pred_proba) - 1
        gini_test = 2 * roc_auc_score(y_test, y_test_pred_proba) - 1
        f_score_train = f1_score(y_train, (y_train_pred_proba > opt_cutoff), pos_label=1, average='binary')
        f_score_test  = f1_score(y_test , (y_test_pred_proba  > opt_cutoff), pos_label=1, average='binary')

        #________________________________________________________________________

        # mlflow log params
        mlflow.log_params({
            "shift": shift,
            "anomaly_crtiretion": anomaly_crtiretion,
        })

        # mlflow log metrics
        mlflow.log_metrics({
            "gini_train": gini_train,
            "gini_test": gini_test,
            "f_score_train": f_score_train,
            "f_score_test": f_score_test,
        })

        #________________________________________________________________________

    print(f'success_run_{anomaly_crtiretion}')