In [None]:
import os
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import gc

from sklearn.metrics import accuracy_score,precision_score,recall_score, confusion_matrix

In [None]:
# Folder holding the pickled artifacts produced by the earlier pipeline stages.
data_folder = os.path.join('..','00.data','output')

# os.listdir() returns entries in arbitrary order; sort them so the registry below
# (and whichever model ends up first in it) is the same on every run and machine.
files_found = sorted(x for x in os.listdir(data_folder) if x.endswith('.pickle'))
files_found_tokens = [x.split('.') for x in files_found]

# Registry: model name -> {artifact kind -> full path of the pickle file}.
# A file name is split on '.', the last token is the 'pickle' extension, the one before
# it is the artifact kind, and everything before that (dots included) is the model name.
models_found = {}

for current_model in files_found_tokens:
    model_name = '.'.join(current_model[:-2])
    artifact_kind = current_model[-2]
    filename = os.path.join(data_folder, '.'.join(current_model))
    models_found.setdefault(model_name, {})[artifact_kind] = filename

In [None]:
# Work with the first model in the registry and load each of its artifacts.
# NOTE: joblib.load unpickles its input, so these files must come from a trusted source.
first_model = list(models_found)[0]
first_model_files = models_found[first_model]

current_train_dataset = joblib.load(first_model_files['train'])
current_test_dataset = joblib.load(first_model_files['test'])
current_total_dataset = joblib.load(first_model_files['total'])
current_parameters = joblib.load(first_model_files['parameters'])

In [None]:
# Build the training frame, then split it into the feature columns and the two
# target columns named in the loaded parameters.
df_current_train_dataset = pd.DataFrame(current_train_dataset)
df_current_train_dataset_x = df_current_train_dataset[current_parameters['CURRENT_X_COLUMNS']]
df_current_train_dataset_y_short = df_current_train_dataset[current_parameters['CURRENT_Y_COLUMN_SHORT']]
df_current_train_dataset_y_long = df_current_train_dataset[current_parameters['CURRENT_Y_COLUMN_LONG']]

# Class balance: number of rows, and how many target entries compare equal to True.
train_total_count = len(df_current_train_dataset_x)
train_short_count = sum(1 for flag in df_current_train_dataset_y_short if flag == True)
train_long_count = sum(1 for flag in df_current_train_dataset_y_long if flag == True)

# Ratio of non-positive rows to positive rows for each target.
train_long_negatives = train_total_count - train_long_count
train_long_ratio = train_long_negatives / train_long_count
train_short_negatives = train_total_count - train_short_count
train_short_ratio = train_short_negatives / train_short_count

print(f'(train)Short Count:{train_short_count}/{train_total_count} {(train_short_count/train_total_count) * 100:.2f}%')
print(f'(train)Long Count:{train_long_count}/{train_total_count}  {(train_long_count/train_total_count) * 100:.2f}%')
print(f'(train)Long Ratio:{train_long_ratio:.2f}')
print(f'(train)Short Ratio:{train_short_ratio:.2f}')

In [None]:
# Build the test frame, then split it into the feature columns and the two
# target columns named in the loaded parameters.
df_current_test_dataset = pd.DataFrame(current_test_dataset)
df_current_test_dataset_x = df_current_test_dataset[current_parameters['CURRENT_X_COLUMNS']]
df_current_test_dataset_y_short = df_current_test_dataset[current_parameters['CURRENT_Y_COLUMN_SHORT']]
df_current_test_dataset_y_long = df_current_test_dataset[current_parameters['CURRENT_Y_COLUMN_LONG']]

# Class balance: number of rows, and how many target entries compare equal to True.
test_total_count = len(df_current_test_dataset_x)
test_short_count = sum(1 for flag in df_current_test_dataset_y_short if flag == True)
test_long_count = sum(1 for flag in df_current_test_dataset_y_long if flag == True)

print(f'(test)Short Count:{test_short_count}/{test_total_count} {(test_short_count/test_total_count) * 100:.2f}%')
print(f'(test)Long Count:{test_long_count}/{test_total_count}  {(test_long_count/test_total_count) * 100:.2f}%')

In [None]:
# Feature names handed to XGBoost are the training feature column labels.
df_current_train_dataset_x_labels = df_current_train_dataset_x.columns.values

# Same feature matrix for both training DMatrix objects; only the label column
# differs (short target first, then long target).
dataset_matrix_short_train, dataset_matrix_long_train = (
    xgb.DMatrix(
        data=df_current_train_dataset_x,
        label=train_labels,
        feature_names=df_current_train_dataset_x_labels,
    )
    for train_labels in (df_current_train_dataset_y_short, df_current_train_dataset_y_long)
)

In [None]:
# Feature names handed to XGBoost are the test feature column labels.
df_current_test_dataset_x_labels = df_current_test_dataset_x.columns.values

# Same feature matrix for both test DMatrix objects; only the label column
# differs (short target first, then long target).
dataset_matrix_short_test, dataset_matrix_long_test = (
    xgb.DMatrix(
        data=df_current_test_dataset_x,
        label=test_labels,
        feature_names=df_current_test_dataset_x_labels,
    )
    for test_labels in (df_current_test_dataset_y_short, df_current_test_dataset_y_long)
)

In [None]:
# Train the "short" classifier for each candidate number of boosting rounds and keep
# the booster with the highest recall on the training set.

gpu_res = {}  # handed to xgb.train as evals_result; stays empty because evals is empty
param = {
    'eta': 0.3,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'exact',
    'verbosity': 0,
    'max_depth': 5,
}

NUM_ROUND_CANDIDATES = [100]  # boosting-round counts to try, in order
TARGET_TRAIN_RECALL = 0.8     # stop trying further candidates once train recall exceeds this

best_booster = None
best_num_rounds = 0
best_accuracy = 0
best_precision = 0
best_recall = 0.0

for num_round in NUM_ROUND_CANDIDATES:
    booster = xgb.train(param, dataset_matrix_short_train, num_round, evals=[], evals_result=gpu_res)

    # binary:logistic yields probabilities; round once to get hard 0/1 labels.
    train_y_pred = booster.predict(dataset_matrix_short_train)
    train_predictions = np.asarray(train_y_pred)
    train_predicted_labels = train_predictions.round()
    accuracy = accuracy_score(df_current_train_dataset_y_short, train_predicted_labels)
    precision = precision_score(df_current_train_dataset_y_short, train_predicted_labels)
    recall = recall_score(df_current_train_dataset_y_short, train_predicted_labels)

    # Always accept the first trained model: otherwise a train recall of exactly 0
    # would leave best_booster as None and break every later cell that uses it.
    if best_booster is None or recall > best_recall:
        best_booster = booster
        best_num_rounds = num_round
        best_accuracy = accuracy
        best_precision = precision
        best_recall = recall
        print(f'Train - Numbers of rounds:{best_num_rounds}')
        print("(Base Train)Accuracy Total:{}".format(best_accuracy))
        print("(Base Train)Precision:{}".format(best_precision))
        print("(Base Train)Recall:{}".format(best_recall))

        # Report how the newly selected booster does on the held-out test split.
        test_y_pred = best_booster.predict(dataset_matrix_short_test)
        test_predictions = np.asarray(test_y_pred)
        test_predicted_labels = test_predictions.round()
        test_accuracy = accuracy_score(df_current_test_dataset_y_short, test_predicted_labels)
        test_precision = precision_score(df_current_test_dataset_y_short, test_predicted_labels)
        test_recall = recall_score(df_current_test_dataset_y_short, test_predicted_labels)
        print("Test...")
        print("(Base Test)Accuracy Total:{}".format(test_accuracy))
        print("(Base Test)Precision:{}".format(test_precision))
        print("(Base Test)Recall:{}".format(test_recall))

    if best_recall > TARGET_TRAIN_RECALL:
        break

In [None]:
# Per-feature importance of the selected booster, using the 'weight' importance type.
feature_important = best_booster.get_score(importance_type='weight')
keys, values = list(feature_important), list(feature_important.values())

# One row per feature, highest score first; show the top 20.
unsorted_scores = pd.DataFrame(values, index=keys, columns=["score"])
data = unsorted_scores.sort_values("score", ascending=False)
data.head(20)

In [None]:
# Pull the run settings out of the loaded parameters; they are used below to build
# the output file names.
CURRENT_ASSET, CURRENT_TIMEFRAME, CURRENT_TARGET, CURRENT_STOP = (
    current_parameters[setting_name]
    for setting_name in ('CURRENT_ASSET', 'CURRENT_TIMEFRAME', 'CURRENT_TARGET', 'CURRENT_STOP')
)

In [None]:
# Score the complete dataset with the selected short model and export it to Excel.
gc.collect()
df_current_total_dataset = pd.DataFrame(current_total_dataset)

def predict_short(row):
    """Return the short-model prediction for a single dataset row.

    row: a pandas Series holding at least the CURRENT_X_COLUMNS features.
    The features are reshaped into a 1 x n_features matrix because the booster
    predicts on 2-D input; the single resulting prediction is returned.
    """
    a = row[current_parameters['CURRENT_X_COLUMNS']].to_numpy().reshape(1,-1)
    return best_booster.inplace_predict(a)[0]

total_output_file_name = f"{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xlsx"
total_output_full_file_name = os.path.join('..','00.data','output',total_output_file_name)

# Predict all rows in one batched call instead of one booster call per row through
# DataFrame.apply: same feature columns in the same order, without the per-row overhead.
total_feature_matrix = df_current_total_dataset[current_parameters['CURRENT_X_COLUMNS']].to_numpy()
df_current_total_dataset['short_predict'] = best_booster.inplace_predict(total_feature_matrix)
df_current_total_dataset.to_excel(total_output_full_file_name)

In [None]:
# Write a text dump of the selected model next to the other outputs.
# NOTE(review): int() truncates, so a float product such as 0.29 * 100 becomes 28;
# left unchanged here so the file name keeps matching the other output names.
model_file_name = f"{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.{best_num_rounds}.xgboostmodel.txt"
model_full_file_name = os.path.join('..','00.data','output',model_file_name)

# Dump best_booster, not booster: the file name carries best_num_rounds, and booster
# is merely the last model trained, which differs once several round counts are tried.
# dump_model output is meant for inspection; use save_model for a reloadable model.
best_booster.dump_model(model_full_file_name)