In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Setup requirements
[here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-environment) is the guide to the development environment setup.

1. install [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/windows.html) (miniconda is fine) 
2. create a new env 

   `conda env create -f environment.yml`
   `conda activate aml`
   
3. setup jupyter

   `conda install notebook ipykernel`
   `ipython kernel install --user --name aml --display-name "aml"`
   
   
4. install azure ml libraries 

   `pip install azureml-core`

In [2]:
from azureml.core.authentication import InteractiveLoginAuthentication
interactive_auth = InteractiveLoginAuthentication(tenant_id="dadbf9da-3f3b-44a8-8097-f3512ff34da8")
from azureml.core import Workspace
ws = Workspace.from_config(auth=interactive_auth)
print('Connected to Workspace',
  '-- name: ' + ws.name,
  '-- Azure region: ' + ws.location,
  '-- Resource group: ' + ws.resource_group,
  sep = '\n')
datastore = ws.get_default_datastore()

Connected to Workspace
-- name: fin-ws-wus2
-- Azure region: westus2
-- Resource group: fin-research


# Loading packages

In [6]:
import numpy as np
import os
import sys
import time
import pandas as pd 
from azureml.core.datastore import Datastore
from azureml.core.dataset import Dataset

# Loading datasets

In [4]:
train_dataset_x = 'train_dataset_x'
train_dataset_y = 'train_dataset_y'
train_dataset_w = 'train_dataset_w'

test_dataset_x = 'test_dataset_x'
test_dataset_y = 'test_dataset_y'
test_dataset_w = 'test_dataset_w'



In [10]:
ds_path = '../../data/datasets/'
ds_list = [train_dataset_x,train_dataset_y,train_dataset_w, test_dataset_x, test_dataset_y, test_dataset_w, ]
ds_dict = {}
for ds_name in ds_list:
    dataset = Dataset.get_by_name(ws, name=ds_name)
    r = dataset.download(target_path=ds_path, overwrite=True)
    ds_dict[ds_name]=r[0]

In [11]:
ds_dict

{'train_dataset_x': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/fi-d14/code/Users/ma.mahmoudzadeh/data/datasets/X_train_data.npy',
 'train_dataset_y': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/fi-d14/code/Users/ma.mahmoudzadeh/data/datasets/y_1_train_data.npy',
 'train_dataset_w': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/fi-d14/code/Users/ma.mahmoudzadeh/data/datasets/w_train_data.npy',
 'test_dataset_x': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/fi-d14/code/Users/ma.mahmoudzadeh/data/datasets/X_test_data.npy',
 'test_dataset_y': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/fi-d14/code/Users/ma.mahmoudzadeh/data/datasets/y_1_test_data.npy',
 'test_dataset_w': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/fi-d14/code/Users/ma.mahmoudzadeh/data/datasets/w_test_data.npy'}

# Simulation Functions

In [3]:
from enum import Enum
class BindState(Enum):
    CASH=1
    LONG=2
    SHORT=0
    
def single_unit_trade(y_pred, ret, hold_duration, bar_duration, labels=[0,1,2]):
    ret_sum = 0
    hold_time = 0
    bar_separation = 0
    number_of_trades=0
    state=BindState.CASH
    for i,pred in enumerate(y_pred):
        if state==BindState.LONG:
            bar_separation +=bar_duration[i]
            if bar_separation>= hold_time:
                state=BindState.CASH
                
        if state==BindState.CASH and pred==labels[2]:
                number_of_trades+=1
                state=BindState.LONG
                hold_time = hold_duration[i]
                ret_sum+=ret[i]
    return ret_sum, number_of_trades 

In [40]:
file_path_train_x = os.path.join('../../data/labels_with_weights/','X_train_data.npy')
file_path_train_y = os.path.join('../../data/labels_with_weights/','y_1_train_data.npy')
file_path_train_w = os.path.join('../../data/labels_with_weights/','w_train_data.npy')
file_path_test_x = os.path.join('../../data/labels_with_weights/','X_test_data.npy')
file_path_test_y = os.path.join('../../data/labels_with_weights/','y_1_test_data.npy')
file_path_test_w = os.path.join('../../data/labels_with_weights/','w_test_data.npy')


In [41]:
X_train = np.load(file_path_train_x)
y_train_all = np.load(file_path_train_y)
y_train = y_train_all[:,0].reshape(len(y_train_all))
y_train+=1
w_train = np.load(file_path_train_w)
w_train = w_train.reshape(len(w_train))


X_test = np.load(file_path_test_x)
y_test_all = np.load(file_path_test_y)
y_test = y_test_all[:,0].reshape(len(y_test_all))
ret_test = y_test_all[:,1].reshape(len(y_test_all))
hold_test = y_test_all[:,2].reshape(len(y_test_all))
y_test+=1
w_test = np.load(file_path_test_w)
w_test = w_test.reshape(len(w_test))

print(
    "shapes of X_train {}, y_train, {}, w_train {}, X_test {}, y_test {}, w_test {}".format(
        X_train.shape, y_train.shape, w_train.shape, X_test.shape, y_test.shape, w_test.shape
    )
)

shapes of X_train (826971, 62), y_train, (826971,), w_train (826971,), X_test (206743, 62), y_test (206743,), w_test (206743,)


In [42]:
time_steps = 10

In [43]:
X_train_3d, y_train_3d, w_train_2d = build_timeseries(X_train, y_train, steps = time_steps, weights=w_train) 
X_test_3d, y_test_3d, w_test_2d = build_timeseries(X_test, y_test, steps = time_steps, weights=w_test) 
ret_test_2d = ret_test[time_steps:]
w_train_2d = w_train[time_steps:]
w_test_2d = w_test[time_steps:]
hold_test_2d = hold_test[time_steps:]

dim_0 826961
length of time-series i/o (826961, 10, 62) (826961,)
dim_0 206733
length of time-series i/o (206733, 10, 62) (206733,)


In [44]:
X_train_2d = X_train_3d.reshape(X_train_3d.shape[0],X_train_3d.shape[1]*X_train_3d.shape[2])
X_test_2d = X_test_3d.reshape(X_test_3d.shape[0],X_test_3d.shape[1]*X_test_3d.shape[2])
y_train_2d = y_train_3d
y_test_2d = y_test_3d

print(
    "shapes of X_train_2d {}, y_train, {}, X_test_2d {}, y_test_2d {}".format(
        X_train_2d.shape, y_train_2d.shape, X_test_2d.shape, y_test_2d.shape
    )
)

shapes of X_train_2d (826961, 620), y_train, (826961,), X_test_2d (206733, 620), y_test_2d (206733,)


In [45]:
name = "LR"
model = LogisticRegression()
model.fit(X_train_2d, y_train_2d, sample_weight=w_train_2d)
y_pred_flat = model.predict(X_test_2d)
cm = confusion_matrix(y_test_2d, y_pred_flat)

print("{} gain precision: {:.2f}".format(name, cm[2,2]/sum(cm[:,2])))
print(classification_report(y_test_2d, y_pred_flat, target_names=['loss','no hit','gain']))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LR gain precision: 0.41
              precision    recall  f1-score   support

        loss       0.53      0.85      0.65    107359
      no hit       0.18      0.30      0.23     18058
        gain       0.41      0.02      0.04     81316

    accuracy                           0.48    206733
   macro avg       0.37      0.39      0.31    206733
weighted avg       0.45      0.48      0.37    206733



In [50]:
single_unit_trade(y_pred_flat, ret_test_2d, hold_test_2d, w_test_2d, labels=[0,1,2]) 

(-1041.25, 4264)

In [51]:
train_data = lightgbm.Dataset(X_train_2d, label=y_train_2d, weight = w_train_2d)
# val_data = lightgbm.Dataset(X_val_2d, label=y_val, reference=train_data)
test_data = lightgbm.Dataset(X_test_2d, label=y_test_2d)#, weight = w_test_2d)

In [52]:
parameters = {
    'objective': 'multiclass',
#     'metric': 'multi_logloss,auc_mu',
    'metric': 'multi_logloss',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_class':3,
    'metric_freq': 1,
    'is_training_metric':'true',
    'learning_rate':0.05,
    'num_leaves': 5,
    'num_trees':100,
    'feature_fraction': 0.43,
    'bagging_fraction': 0.48,
    'bagging_freq': 4,
    'min_data_in_leaf':50,
    'learning_rate': 0.05,
    'verbose': 1,
    'early_stopping_round':10
}

In [53]:
print('Starting training...')
# train
gbm = lightgbm.train(params=parameters,
                     train_set = train_data,
                     valid_sets=[train_data,test_data])

Starting training...
[1]	training's multi_logloss: 1.02084	valid_1's multi_logloss: 0.97141
Training until validation scores don't improve for 10 rounds
[2]	training's multi_logloss: 1.01692	valid_1's multi_logloss: 0.970125
[3]	training's multi_logloss: 1.0133	valid_1's multi_logloss: 0.968926
[4]	training's multi_logloss: 1.00994	valid_1's multi_logloss: 0.967815
[5]	training's multi_logloss: 1.00688	valid_1's multi_logloss: 0.966767
[6]	training's multi_logloss: 1.00402	valid_1's multi_logloss: 0.966139
[7]	training's multi_logloss: 1.00125	valid_1's multi_logloss: 0.965594
[8]	training's multi_logloss: 0.998749	valid_1's multi_logloss: 0.964827
[9]	training's multi_logloss: 0.99641	valid_1's multi_logloss: 0.963975
[10]	training's multi_logloss: 0.994219	valid_1's multi_logloss: 0.963591
[11]	training's multi_logloss: 0.992062	valid_1's multi_logloss: 0.963248
[12]	training's multi_logloss: 0.990122	valid_1's multi_logloss: 0.962873
[13]	training's multi_logloss: 0.988188	valid_1's

In [54]:
y_pred = gbm.predict(X_test_2d)
y_pred_flat = np.argmax(y_pred, axis=1)
print(classification_report(y_test_2d, y_pred_flat, target_names=['loss','no hit','gain']))


              precision    recall  f1-score   support

        loss       0.52      0.98      0.68    107359
      no hit       0.36      0.01      0.02     18058
        gain       0.42      0.01      0.03     81316

    accuracy                           0.52    206733
   macro avg       0.43      0.34      0.24    206733
weighted avg       0.46      0.52      0.36    206733



In [37]:
single_unit_trade(y_pred_flat, ret_test_2d, hold_test_2d, w_test_2d, labels=[0,1,2]) 

(-129.0, 1535)

In [39]:
np.mean(w_test_2d)

16.571799253626658

In [None]:
w_test_2d.