In [None]:
# Master switch for the optional SHAP cells: the `shap` package is imported,
# and the explanation cells do any work, only when this is True.
SHAP_ENABLED = False

In [None]:
import os
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import gc
import matplotlib
from sklearn.metrics import accuracy_score,precision_score,recall_score, confusion_matrix,roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier

if SHAP_ENABLED:
    import shap

In [None]:
import platform

# Images are generated on Windows hosts, or anywhere the SHOULD_GENERATE_IMAGES
# environment variable is present (only its presence is checked, not its value).
SHOULD_GENERATE_IMAGES = (
    platform.system() == 'Windows'
    or "SHOULD_GENERATE_IMAGES" in os.environ
)

In [None]:
# Default run configuration. A later cell replaces any of these with the value
# of the same-named environment variable when one is set.
IS_HYPER_PARAMETER_SEARCH = False   # True: grid-search n_estimators; False: fit with the fixed values below
NUM_FOLDS_CROSSVALIDATION = 10      # n_splits of the StratifiedKFold used by the grid search
NUM_TREES = 120                     # n_estimators of the fixed-parameter models
TREE_DEPTH = 5                      # max_depth of the fixed-parameter models

# Both data directories are relative paths under ../00.data.
DATA_OUTPUT_DIR, DATA_HYPERPARAMETERS_DIR = (
    os.path.join('..', '00.data', leaf) for leaf in ('output', 'hyperparameters')
)

In [None]:
def parse_bool_flag(raw_value):
    """Interpret an environment-variable string as a boolean.

    A plain ``bool()`` cast cannot switch a flag off: ``bool("False")`` and
    ``bool("0")`` are both ``True`` because any non-empty string is truthy.
    Here the usual "off" spellings (case-insensitive, surrounding whitespace
    ignored) and the empty string map to ``False``; every other value maps to
    ``True``, which keeps arbitrary non-empty values enabling the flag.

    Parameters
    ----------
    raw_value : str
        The raw text read from the environment.

    Returns
    -------
    bool
    """
    falsy_spellings = ("", "0", "false", "f", "no", "n", "off")
    return str(raw_value).strip().lower() not in falsy_spellings


# Environment overrides for the configuration defaults. Each default is only
# replaced when the same-named variable is present in the environment.
if "IS_HYPER_PARAMETER_SEARCH" in os.environ:
    IS_HYPER_PARAMETER_SEARCH = parse_bool_flag(os.environ["IS_HYPER_PARAMETER_SEARCH"])

if "NUM_FOLDS_CROSSVALIDATION" in os.environ:
    NUM_FOLDS_CROSSVALIDATION = int(os.environ["NUM_FOLDS_CROSSVALIDATION"])

if "DATA_OUTPUT_DIR" in os.environ:
    DATA_OUTPUT_DIR = os.environ["DATA_OUTPUT_DIR"]

if "DATA_HYPERPARAMETERS_DIR" in os.environ:
    DATA_HYPERPARAMETERS_DIR = os.environ["DATA_HYPERPARAMETERS_DIR"]

# Converted to int here, like NUM_FOLDS_CROSSVALIDATION, so a malformed value
# fails at configuration time and the constants keep the type of their defaults.
if "NUM_TREES" in os.environ:
    NUM_TREES = int(os.environ["NUM_TREES"])

if "TREE_DEPTH" in os.environ:
    TREE_DEPTH = int(os.environ["TREE_DEPTH"])

In [None]:
# Index the pickles in DATA_OUTPUT_DIR as {model name: {artifact kind: path}}.
# A file name is split on '.': the last token is the 'pickle' extension, the
# token before it is the artifact kind, and everything before that (re-joined
# with '.') is the model name.
#
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make the dictionary order (and therefore the "first" model selected in the
# next cell) reproducible from run to run.
files_found = sorted(x for x in os.listdir(DATA_OUTPUT_DIR) if x.endswith('.pickle'))
files_found_tokens = [x.split('.') for x in files_found]

models_found = {}

for current_model in files_found_tokens:
    model_name = '.'.join(current_model[:-2])
    filename = os.path.join(DATA_OUTPUT_DIR, '.'.join(current_model))
    # Create the per-model dictionary on first sight, then record this artifact.
    models_found.setdefault(model_name, {})[current_model[-2]] = filename

print(models_found)

In [None]:
# Select the first discovered model and load its dataset and parameter pickles.
# Fail with an explicit message instead of a bare IndexError / KeyError when the
# output directory holds no usable model.
if not models_found:
    raise FileNotFoundError(f"No '.pickle' files found in {DATA_OUTPUT_DIR!r}")

first_model = next(iter(models_found))

missing_kinds = [kind for kind in ('total', 'parameters') if kind not in models_found[first_model]]
if missing_kinds:
    raise FileNotFoundError(
        f"Model {first_model!r} is missing the following pickle(s) in {DATA_OUTPUT_DIR!r}: {missing_kinds}"
    )

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only point DATA_OUTPUT_DIR at directories whose contents are trusted.
current_total_dataset = joblib.load(models_found[first_model]['total'])
current_parameters = joblib.load(models_found[first_model]['parameters'])

In [None]:
# Wrap the loaded dataset in a DataFrame, then slice out the feature columns and
# the two label columns named by the loaded parameters.
df_current_total_dataset = pd.DataFrame(current_total_dataset)
df_current_total_dataset_x = df_current_total_dataset[current_parameters['CURRENT_X_COLUMNS']]
df_current_total_dataset_y_short = df_current_total_dataset[current_parameters['CURRENT_Y_COLUMN_SHORT']]
df_current_total_dataset_y_long = df_current_total_dataset[current_parameters['CURRENT_Y_COLUMN_LONG']]

# Row count, and the number of rows whose label compares equal to True, per side.
total_total_count = len(df_current_total_dataset)
total_short_count = sum(1 for label in df_current_total_dataset_y_short if label == True)
total_long_count = sum(1 for label in df_current_total_dataset_y_long if label == True)

# Rows without a positive label divided by rows with one.
total_long_ratio = (total_total_count - total_long_count) / total_long_count
total_short_ratio = (total_total_count - total_short_count) / total_short_count

print(f'(train)Short Count:{total_short_count}/{total_total_count} {(total_short_count/total_total_count) * 100:.2f}%')
print(f'(train)Long Count:{total_long_count}/{total_total_count}  {(total_long_count/total_total_count) * 100:.2f}%')
print(f'(train)Long Ratio:{total_long_ratio:.2f}')
print(f'(train)Short Ratio:{total_short_ratio:.2f}')

In [None]:
# Copy the run settings out of the loaded parameters into module-level names.
# The first five are used below to build output file names.
(
    CURRENT_EXCHANGE,
    CURRENT_ASSET,
    CURRENT_TIMEFRAME,
    CURRENT_TARGET,
    CURRENT_STOP,
    DECISION_BOUNDARY,
) = (
    current_parameters[setting_key]
    for setting_key in (
        'CURRENT_EXCHANGE',
        'CURRENT_ASSET',
        'CURRENT_TIMEFRAME',
        'CURRENT_TARGET',
        'CURRENT_STOP',
        'DECISION_BOUNDARY',
    )
)

In [None]:
if IS_HYPER_PARAMETER_SEARCH:
    # Hyper-parameter search path for the SHORT model.
    folds = NUM_FOLDS_CROSSVALIDATION

    # Only n_estimators spans more than one candidate; every other entry is a
    # single fixed value.
    params_grid_search = {
            'objective' : ['binary:logitraw'],
            'eval_metric' : ['auc'],
            'tree_method' : ['exact'],
            'max_depth': [5],
            'gamma': [1],
            'eta': [0.3],
            'n_estimators' : list(range(50,200,5)),
            'scale_pos_weight' : [total_short_ratio]
    }

    # Fixed random_state so the fold assignment is the same on every run.
    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

    grid_search_short = GridSearchCV(XGBClassifier(use_label_encoder=False),
                                       param_grid=params_grid_search,
                                       scoring='roc_auc',
                                       n_jobs=-1, cv=skf.split(df_current_total_dataset_x,df_current_total_dataset_y_short),
                                       verbose=3 )

    grid_search_short=grid_search_short.fit(df_current_total_dataset_x, df_current_total_dataset_y_short)

    print('Short Best Params:')
    print('------------------')
    print(grid_search_short.best_params_)
    print('grid_search_short Best Estimator:')
    print('------------------')
    print(grid_search_short.best_estimator_)

    # Keep the fitted XGBClassifier itself, not its raw Booster: later cells call
    # best_short_booster.get_booster(), which a Booster does not provide, so this
    # matches the object type produced by the fixed-parameter path.
    best_short_booster = grid_search_short.best_estimator_

    # The score-writing cell reads this name in both modes; compute it the same
    # way the fixed-parameter path does.
    best_short_booster_score = best_short_booster.score(df_current_total_dataset_x, df_current_total_dataset_y_short.to_numpy())

    hyperparam_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.hyperparameters_short.xlsx"
    hyperparam_full_file_name = os.path.join(DATA_OUTPUT_DIR,hyperparam_file_name)

    # Export the full cross-validation table for offline inspection.
    results = pd.DataFrame(grid_search_short.cv_results_)
    results.to_excel(hyperparam_full_file_name, index=False)

In [None]:
if not IS_HYPER_PARAMETER_SEARCH:
    # Fixed-parameter path for the SHORT model: tree count and depth come from
    # the NUM_TREES / TREE_DEPTH configuration values.
    short_model_params = dict(
        objective="binary:logitraw",
        eval_metric="auc",
        tree_method="exact",
        max_depth=int(TREE_DEPTH),
        n_estimators=int(NUM_TREES),
        gamma=1,
        eta=0.3,
        subsample=1.0,
        colsample_bytree=1.0,
        colsample_bylevel=1.0,
        colsample_bynode=1.0,
        scale_pos_weight=total_short_ratio,
        use_label_encoder=False,
        silent=False,
    )
    best_short_booster = xgb.XGBClassifier(**short_model_params)

    # Fit on the full dataset, then score on that same data.
    short_labels = df_current_total_dataset_y_short.to_numpy()
    best_short_booster.fit(df_current_total_dataset_x, short_labels)
    best_short_booster_score = best_short_booster.score(df_current_total_dataset_x, short_labels)
    

In [None]:
if IS_HYPER_PARAMETER_SEARCH:
    # Hyper-parameter search path for the LONG model.
    folds = NUM_FOLDS_CROSSVALIDATION

    # Same grid as the short search, except that positives are weighted by the
    # LONG class ratio; reusing the short grid unchanged would weight the long
    # labels with the short labels' imbalance.
    params_grid_search_long = {**params_grid_search, 'scale_pos_weight': [total_long_ratio]}

    # Fixed random_state so the fold assignment is the same on every run.
    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

    grid_search_long = GridSearchCV(XGBClassifier(use_label_encoder=False),
                                       param_grid=params_grid_search_long,
                                       scoring='roc_auc',
                                       n_jobs=-1, cv=skf.split(df_current_total_dataset_x,df_current_total_dataset_y_long),
                                       verbose=3 )

    grid_search_long = grid_search_long.fit(df_current_total_dataset_x, df_current_total_dataset_y_long)

    print('Long Best Params:')
    print('------------------')
    print(grid_search_long.best_params_)
    print('Long Best Estimator:')
    print('------------------')
    print(grid_search_long.best_estimator_)

    # Keep the fitted XGBClassifier itself, not its raw Booster: later cells call
    # best_long_booster.get_booster(), which a Booster does not provide, so this
    # matches the object type produced by the fixed-parameter path.
    best_long_booster = grid_search_long.best_estimator_

    # The score-writing cell reads this name in both modes; compute it the same
    # way the fixed-parameter path does.
    best_long_booster_score = best_long_booster.score(df_current_total_dataset_x, df_current_total_dataset_y_long.to_numpy())

    hyperparam_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.hyperparameters_long.xlsx"
    hyperparam_full_file_name = os.path.join(DATA_OUTPUT_DIR,hyperparam_file_name)

    # Export the full cross-validation table for offline inspection.
    results = pd.DataFrame(grid_search_long.cv_results_)
    results.to_excel(hyperparam_full_file_name, index=False)

In [None]:
if not IS_HYPER_PARAMETER_SEARCH:
    # Fixed-parameter path for the LONG model: tree count and depth come from
    # the NUM_TREES / TREE_DEPTH configuration values.
    best_long_booster = xgb.XGBClassifier(objective="binary:logitraw",
                                  eval_metric="auc",
                                  tree_method="exact",
                                  max_depth=int(TREE_DEPTH),
                                  gamma=1,
                                  eta=0.3,
                                  subsample=1.0,
                                  colsample_bytree=1.0,
                                  colsample_bylevel=1.0,
                                  colsample_bynode=1.0,
                                  use_label_encoder=False,
                                  n_estimators=int(NUM_TREES),
                                  silent=False,
                                  # Weight positives by the LONG class ratio: this model is
                                  # fitted on the long labels, so the short ratio describes
                                  # the wrong label column.
                                  scale_pos_weight=total_long_ratio)

    # Fit on the full dataset, then score on that same data.
    best_long_booster.fit(df_current_total_dataset_x, df_current_total_dataset_y_long.to_numpy())

    best_long_booster_score = best_long_booster.score(df_current_total_dataset_x, df_current_total_dataset_y_long.to_numpy())

In [None]:
# Rank the SHORT model's features by their 'weight' importance score, export
# the table to Excel and show the top 20.
feature_importance = best_short_booster.get_booster().get_score(importance_type='weight')

features = sorted(
    ({'feature': name, 'value': score} for name, score in feature_importance.items()),
    key=lambda entry: entry['value'],
    reverse=True,
)

data = pd.DataFrame(features)

# NOTE(review): this is the same '...hyperparameters_short.xlsx' path the short
# grid-search cell writes its cv_results_ table to, so in hyper-parameter-search
# mode this export overwrites that table. Confirm whether a separate file name
# was intended.
feature_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.hyperparameters_short.xlsx"
feature_full_file_name = os.path.join(DATA_OUTPUT_DIR, feature_file_name)
data.to_excel(feature_full_file_name, index=False)

data.head(20)

In [None]:
# Rank the LONG model's features by their 'weight' importance score, export
# the table to Excel and show the top 20.
feature_importance = best_long_booster.get_booster().get_score(importance_type='weight')

features = sorted(
    ({'feature': name, 'value': score} for name, score in feature_importance.items()),
    key=lambda entry: entry['value'],
    reverse=True,
)

data = pd.DataFrame(features)

# NOTE(review): this is the same '...hyperparameters_long.xlsx' path the long
# grid-search cell writes its cv_results_ table to, so in hyper-parameter-search
# mode this export overwrites that table. Confirm whether a separate file name
# was intended.
feature_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.hyperparameters_long.xlsx"
feature_full_file_name = os.path.join(DATA_OUTPUT_DIR, feature_file_name)
data.to_excel(feature_full_file_name, index=False)

data.head(20)

In [None]:
if SHAP_ENABLED:
    # SHAP force plot for the LONG model, explaining the first dataset row.
    shap.initjs()
    explainer = shap.TreeExplainer(best_long_booster)
    shap_values = explainer.shap_values(df_current_total_dataset_x)

    first_row_shap_values = shap_values[0,:]
    first_row_features = df_current_total_dataset_x.iloc[0,:]
    shap.force_plot(explainer.expected_value, first_row_shap_values, first_row_features)

In [None]:
if SHAP_ENABLED:
    # SHAP summary bar plot for the LONG model over the whole feature frame.
    shap.initjs()
    long_tree_explainer = shap.TreeExplainer(best_long_booster)
    shap_values = long_tree_explainer.shap_values(df_current_total_dataset_x)
    shap.summary_plot(shap_values, df_current_total_dataset_x, plot_type="bar")

In [None]:
if SHAP_ENABLED:
    # SHAP force plot for the SHORT model, explaining the first dataset row.
    shap.initjs()
    explainer = shap.TreeExplainer(best_short_booster)
    shap_values = explainer.shap_values(df_current_total_dataset_x)

    first_row_shap_values = shap_values[0,:]
    first_row_features = df_current_total_dataset_x.iloc[0,:]
    shap.force_plot(explainer.expected_value, first_row_shap_values, first_row_features)

In [None]:
if SHAP_ENABLED:
    # SHAP summary bar plot for the SHORT model over the whole feature frame.
    shap.initjs()
    short_tree_explainer = shap.TreeExplainer(best_short_booster)
    shap_values = short_tree_explainer.shap_values(df_current_total_dataset_x)
    shap.summary_plot(shap_values, df_current_total_dataset_x, plot_type="bar")

In [None]:
# Re-read the run-identifying settings from the loaded parameters; these five
# values are used by the cells below to build output file names.
(
    CURRENT_EXCHANGE,
    CURRENT_ASSET,
    CURRENT_TIMEFRAME,
    CURRENT_TARGET,
    CURRENT_STOP,
) = (
    current_parameters[setting_key]
    for setting_key in (
        'CURRENT_EXCHANGE',
        'CURRENT_ASSET',
        'CURRENT_TIMEFRAME',
        'CURRENT_TARGET',
        'CURRENT_STOP',
    )
)

In [None]:
def predict_short(row):
    """Return the short model's prediction for a single dataset row.

    `row` is indexed with the configured CURRENT_X_COLUMNS, reshaped to a
    one-row 2-D array and passed to the booster's `inplace_predict`; the single
    resulting value is returned.
    """
    a = row[current_parameters['CURRENT_X_COLUMNS']].to_numpy().reshape(1,-1)
    return best_short_booster.get_booster().inplace_predict(a)[0]

def predict_long(row):
    """Return the long model's prediction for a single dataset row.

    `row` is indexed with the configured CURRENT_X_COLUMNS, reshaped to a
    one-row 2-D array and passed to the booster's `inplace_predict`; the single
    resulting value is returned.
    """
    a = row[current_parameters['CURRENT_X_COLUMNS']].to_numpy().reshape(1,-1)
    return best_long_booster.get_booster().inplace_predict(a)[0]

# Score every row with one batched call per model instead of a row-wise
# DataFrame.apply, which made one booster call (plus a Series -> array
# conversion) per row. The per-row helpers above are kept for single-row use.
# NOTE(review): assumes the feature columns convert to a numeric array the same
# way the per-row slices did -- confirm if any feature column is non-numeric.
feature_matrix = df_current_total_dataset[current_parameters['CURRENT_X_COLUMNS']].to_numpy()
df_current_total_dataset['short_predict'] = best_short_booster.get_booster().inplace_predict(feature_matrix)
df_current_total_dataset['long_predict'] = best_long_booster.get_booster().inplace_predict(feature_matrix)


In [None]:
# Release references to the raw dataset and to train/test intermediates so the
# garbage collector can reclaim them. Several of these names are not assigned
# anywhere else in the visible notebook; for those this simply defines them as None.
current_total_dataset = None

dataset_matrix_short_test = dataset_matrix_long_test = None
dataset_matrix_short_train = dataset_matrix_long_train = None

df_current_test_dataset = current_test_dataset = None
df_current_test_dataset_x = None
df_current_test_dataset_y_short = df_current_test_dataset_y_long = None

df_current_train_dataset = current_train_dataset = None
df_current_train_dataset_x = None
df_current_train_dataset_y_short = df_current_train_dataset_y_long = None

gc.collect()

In [None]:
# Path of an .xlsx workbook named after the run settings, inside DATA_OUTPUT_DIR.
# NOTE(review): no visible cell writes to this path -- confirm whether an export
# of the scored dataset is missing or the cell is a leftover.
total_output_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xlsx"
total_output_full_file_name = os.path.join(
    DATA_OUTPUT_DIR,
    total_output_file_name,
)

In [None]:
# Drop the reference to the full DataFrame, including the prediction columns
# added earlier, and ask the garbage collector to reclaim the memory.
df_current_total_dataset = None
gc.collect()

In [None]:
# Dump the SHORT model's booster to a '.xgboostshortmodel.txt' file in DATA_OUTPUT_DIR.
model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xgboostshortmodel.txt"
model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

short_booster = best_short_booster.get_booster()
short_booster.dump_model(model_full_file_name)

In [None]:
# Dump the LONG model's booster to a '.xgboostlongmodel.txt' file in DATA_OUTPUT_DIR.
model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xgboostlongmodel.txt"
model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

long_booster = best_long_booster.get_booster()
long_booster.dump_model(model_full_file_name)

In [None]:
# Serialize the SHORT model object with joblib (compression enabled) to a
# '.xgboostshortmodel.pickle' file in DATA_OUTPUT_DIR.
model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xgboostshortmodel.pickle"
model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

joblib.dump(best_short_booster, model_full_file_name, compress=True)

In [None]:
# Serialize the LONG model object with joblib (compression enabled) to a
# '.xgboostlongmodel.pickle' file in DATA_OUTPUT_DIR.
model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xgboostlongmodel.pickle"
model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

joblib.dump(best_long_booster, model_full_file_name, compress=True)

In [None]:
# NOTE(review): rebuilds the long-model pickle file name, but no later visible
# cell reads it before reassigning -- looks like a leftover cell; confirm.
model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xgboostlongmodel.pickle"

In [None]:
# Write each model's score (computed in the fitting cells) as plain text into
# its own '.train.score.txt' file in DATA_OUTPUT_DIR.
long_score = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.long.train.score.txt"
short_score = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.short.train.score.txt"

long_score_file_name = os.path.join(DATA_OUTPUT_DIR, long_score)
short_score_file_name = os.path.join(DATA_OUTPUT_DIR, short_score)

# Long first, then short -- the same order the files were originally written in.
for score_path, score_value in (
    (long_score_file_name, best_long_booster_score),
    (short_score_file_name, best_short_booster_score),
):
    with open(score_path, "w") as f:
        f.write(str(score_value))

In [None]:
if IS_HYPER_PARAMETER_SEARCH:
    # Search mode only: also persist the short grid search's best estimator
    # under the '.xgboostclassifiershortmodel.pickle' name (joblib, compressed).
    model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xgboostclassifiershortmodel.pickle"
    model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

    short_best_estimator = grid_search_short.best_estimator_
    joblib.dump(short_best_estimator, model_full_file_name, compress=True)

In [None]:
if IS_HYPER_PARAMETER_SEARCH:
    # Search mode only: also persist the long grid search's best estimator
    # under the '.xgboostclassifierlongmodel.pickle' name (joblib, compressed).
    model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xgboostclassifierlongmodel.pickle"
    model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

    long_best_estimator = grid_search_long.best_estimator_
    joblib.dump(long_best_estimator, model_full_file_name, compress=True)