## Model : marketdirection
### Description :
This model uses a TensorFlow neural network to predict the direction of a market over the next Y periods, based on the values of the previous X periods.

### Model Attributes :
- FFNN
- Boosting
- Re-training of entire network for each additional period

### USP :
- Normalised market data (between 0 and 1) to highlight common patterns at any time scale.
- Utilises similar markets to increase the size of the training set


In [4]:
# Identifier of the model definition fetched from the MarketInsights API.
MODEL_ID = "fdbe5895-0327-49d9-83e9-2246dbe1858b"

# Identifiers of the two market datasets that are combined for training.
DATASET_ID1 = "4234f0f1b6fcc17f6458696a6cdf5101"  # DOW
DATASET_ID2 = "3231bbe5eb2ab84eb54c9b64a8dcea55"  # SPY

# Training run descriptor; an "id" entry is added to it by the setup cell.
TRAINING_RUN = dict(
    model_id=MODEL_ID,
    datasets=[DATASET_ID1, DATASET_ID2],
)


In [5]:
#
# Get dataset from MI API #
#

import pandas
import sys
import gc
import uuid
import numpy as np

#!pip install --upgrade git+https://github.com/cwilko/quantutils.git

import quantutils.dataset.pipeline as ppl
from quantutils.api.auth import CredentialsStore
from quantutils.api.bluemix import CloudObjectStore, ObjectStore, Metrics, Logger
from quantutils.api.marketinsights import MarketInsights
from quantutils.api.assembly import MIAssembly
from quantutils.api.functions import Functions
import quantutils.model.utils as mlutils
from quantutils.model.ml import Model

# Build the quantutils service clients used throughout the notebook.
# Every client below is constructed from the same credentials store.
cred = CredentialsStore()
metrics = Metrics(cred)         # receives the precision/recall values sent during training
mi = MarketInsights(cred)       # source of the model config and the datasets
objStore = ObjectStore(cred)    # target of the results.csv backups
cos = CloudObjectStore(cred)    # holds the per-run weights csv
log = Logger('MarketInsights-ML', cred)
fun = Functions(cred)
miassembly = MIAssembly(mi, fun)  # used later to fetch stored predictions per dataset

# Logging helper function
def tag(x, y):
    """Return the log prefix "(<x>:<y+1>) " for thread id `x` and zero-based position `y`."""
    return "(" + x + ":" + str(y + 1) + ") "

# Fetch the model definition; its 'training' section drives the rest of the notebook.
CONFIG = mi.get_model(MODEL_ID)
TRN_CNF = CONFIG['training']
# The run id is a key generated from the dataset list and the model id; the run is then
# registered with the MarketInsights API.
TRAINING_RUN["id"] = cos.generateKey([str(TRAINING_RUN["datasets"]), str(TRAINING_RUN["model_id"])])
COS_BUCKET = "marketinsights-weights"
mi.put_training_run(TRAINING_RUN)

# Each call returns the market data together with a descriptor; the descriptor's
# "features" and "labels" entries are read further down.
mkt1, mkt1_desc = mi.get_dataset_by_id(DATASET_ID1)
mkt2, mkt2_desc = mi.get_dataset_by_id(DATASET_ID2)

# Crop training dates
# NOTE(review): both slicing statements are commented out, so this branch only prints the
# message and the datasets are used uncropped. Confirm whether that is intended.
if "training_end_date" in TRN_CNF:
    print("Cropping datasets...")
    #mkt1 = mkt1[TRN_CNF["training_start_date"]:TRN_CNF["training_end_date"]]
    #mkt2 = mkt2[TRN_CNF["training_start_date"]:TRN_CNF["training_end_date"]]

# Interleave (part of the "added insight" for this model)
# `isect` is kept because the training cells index into it to obtain timestamps.
MK1, MK2, isect = ppl.intersect(mkt1,mkt2)
dataset = ppl.interleave(MK1,MK2)

# The first TRAINING_SET_SIZE interleaved rows form the initial training window; every row
# after that is treated as test data. WINDOW_SIZE is the length of the rolling training
# window used by the leave-one-out cell.
# NOTE(review): TEST_SET_SIZE becomes negative if the dataset is shorter than the
# configured window — there is no guard for that here.
TRAINING_SET_SIZE = TRN_CNF["training_window_size"]
TEST_SET_SIZE = len(dataset) - TRAINING_SET_SIZE
WINDOW_SIZE = TRAINING_SET_SIZE

# Keep only the second output of splitCol for the test rows (used as the evaluation target
# in the training cells).
_, test_y = ppl.splitCol(dataset[TRAINING_SET_SIZE:], mkt1_desc["features"])

# Create ML model
# Sized from the first market's descriptor only — assumes both markets share the same
# feature/label layout (TODO confirm).
ffnn = Model(mkt1_desc["features"], mkt1_desc["labels"], CONFIG)

print("Done - Training ID: " + TRAINING_RUN["id"])

Cropping datasets...
Done - Training ID: 078df5a1afbaa2290ee93b4a562e3898


In [None]:
##
## BOOTSTRAP/BOOSTING TRAINING WITH LOO
##
# For every test timestamp that has no stored weights yet: retrain on the WINDOW_SIZE
# interleaved rows that precede it, predict the two rows at that timestamp, append the
# resulting weights to the run's csv in the object store and report running
# precision/recall. Timestamps already present in the stored csv are skipped, so the cell
# can be re-run to resume an interrupted training run.

# Train thread id
train_id = str(uuid.uuid1())[:8]
#train_id = "0b4045ec"

log.info("".join(["(", train_id, ")", " Training model: ", CONFIG['model_desc'], "(",MODEL_ID,") , Training Run: ", TRAINING_RUN["id"]]))

# Position of the sample currently being trained. It stays None until the loop starts so
# that the error handler at the bottom can still log failures raised by the set-up code
# (previously such a failure was masked by a NameError on `i`).
i = None

try:

    # Two interleaved rows share one timestamp, hence the halving.
    testSetIndex = isect[-(TEST_SET_SIZE//2):]
    predictions = np.array([]).reshape(0,mkt1_desc["labels"])

    if (cos.keyExists(COS_BUCKET, TRAINING_RUN["id"])):
        # Resume: the stored timestamps are epoch seconds, convert back to nanoseconds.
        weights = cos.get_csv(COS_BUCKET, TRAINING_RUN["id"])
        existing_predictions = pandas.DatetimeIndex(np.unique(weights["timestamp"]) * 10**9).tz_localize("UTC")
        resultIndex = testSetIndex.difference(existing_predictions)
    else:
        weights = pandas.DataFrame()
        resultIndex = testSetIndex

    # Positions (within testSetIndex) of the timestamps that still need training.
    prediction_idx = np.array([testSetIndex.get_loc(idx) for idx in resultIndex])

    # Matching row positions in test_y: rows 2*p and 2*p+1 for each timestamp position p.
    labels_idx = ppl.interleave(pandas.DataFrame(prediction_idx*2), pandas.DataFrame(prediction_idx*2+1)).values.flatten()

    for i in prediction_idx:
        print("Training", end='')
        dataIdx = i * 2 + TRAINING_SET_SIZE
        training_set = dataset[dataIdx-WINDOW_SIZE:dataIdx]
        test_set = dataset[dataIdx:dataIdx+2]
        success = False
        prediction = [-1, -1]
        retry = 0
        while (not success) and (retry < TRN_CNF['training_retries']):
            try:
                ## CHOOSE BOOTSTRAP OR BOOST
                # TODO : Separate the train from the evaluation of the test set,i.e. go back into the model to evaluate the test set on the current weights. This is to be certain
                # that the test set is not affecting the training process.
                #results = mlutils.boostingTrain(ffnn, training_set, test_set, TRN_CNF['lamda'], TRN_CNF['iterations'], CONFIG['debug'])
                results = mlutils.bootstrapTrain(ffnn, training_set, test_set, TRN_CNF['lamda'], TRN_CNF['iterations'], TRN_CNF['threshold'], CONFIG['debug'])
                prediction = np.nanmean(results["test_predictions"], axis=0) # TODO Plug in other aggregation method, e.g. voting
                predictions = np.concatenate([predictions, prediction])
                success = True
            except ValueError:
                print("Value error")
                log.error("".join([tag(train_id, i), "ValueError - Retrying..."]))
                retry = retry + 1

        if (not success):
            log.error("Failed to train after several retries")
            break

        print(".")

        # Extract predictions and store them (deprecated)
        p1, p2 = [pandas.DataFrame([mkt], index=testSetIndex[i:i+1]) for mkt in prediction]
        #mi.put_predictions(p1, DATASET_ID1, TRAINING_RUN["id"], update=True)
        #mi.put_predictions(p2, DATASET_ID1, TRAINING_RUN["id"], update=True)

        # Extract weights and store them
        newWeights = pandas.DataFrame(results["weights"])
        newWeights.insert(0,'timestamp', [testSetIndex[i].value // 10**9] * len(newWeights))
        if (len(weights.columns)>0):
            # Align the labels of the accumulated frame with the new rows before stacking.
            weights.columns = newWeights.columns
        # pandas.concat replaces DataFrame.append, which was removed in pandas 2.0.
        weights = pandas.concat([weights, newWeights])
        print("Storing Weights...")
        cos.put_csv(COS_BUCKET, TRAINING_RUN["id"], weights) # Re-Write entire csv (TODO : to parquet)

        log.debug("".join([tag(train_id, i), testSetIndex[i].isoformat(), " ", DATASET_ID1, ": ", str(p1.values[0])]))
        log.debug("".join([tag(train_id, i), testSetIndex[i].isoformat(), " ", DATASET_ID2, ": ", str(p2.values[0])]))

        # Progress statistics
        res = mlutils.evaluate(ppl.onehot(predictions), ppl.onehot(test_y[labels_idx][:len(predictions)]), TRN_CNF['threshold'])
        log.info("".join([tag(train_id, i), str("Results after %d iterations, %.2f precision, %.2f recall at %.2f threshold" % (i+1, res[0], res[1], TRN_CNF['threshold']))]))
        metrics.send([{'name':'MI.precision', 'value':res[0].tolist()},{'name':'MI.recall', 'value':res[1].tolist()}])

        # Backup predictions to filestore (deprecated)
        # One csv per de-interleaved market, numbered from 1.
        for x, mkt in enumerate(ppl.deinterleave(pandas.DataFrame(predictions)), start=1):
            mkt.index = resultIndex[:len(mkt)]
            mkt.to_csv("results.csv", header=False)
            objStore.put_file('Experiment2', "results.csv", "".join([TRAINING_RUN["id"], "_", str(x), ".csv"]) )

        # Try to free memory
        gc.collect()
except BaseException:
    # Log anything that aborts the run (interrupts included), then re-raise it unchanged.
    # Before the first iteration there is no position to report, so only the thread id is used.
    prefix = tag(train_id, i) if i is not None else "".join(["(", train_id, ") "])
    log.error("".join([prefix, str("Unexpected error: %s" % sys.exc_info()[0])]))
    raise
    

In [None]:
##
## BOOSTING TRAINING
##
# Single pass: the first TRAINING_SET_SIZE rows are passed as the training set and all
# remaining rows as the test set.
print("Training")
train_rows = dataset[:TRAINING_SET_SIZE]
holdout_rows = dataset[TRAINING_SET_SIZE:]
results = mlutils.boostingTrain(
    ffnn,
    train_rows,
    holdout_rows,
    TRN_CNF['lamda'],
    TRN_CNF['iterations'],
    CONFIG['debug'],
)
predictions = np.nanmean(results["test_predictions"], axis=0)
print(mlutils.evaluate(ppl.onehot(predictions), ppl.onehot(test_y), 0.0))

# Save weights to Cloud Object Store
# Every weight row is stamped with the epoch-second timestamp found at position
# TRAINING_SET_SIZE//2 of the intersected index.
newWeights = pandas.DataFrame(results["weights"])
stamp_seconds = isect[TRAINING_SET_SIZE//2].value // 10**9
newWeights.insert(0, 'timestamp', [stamp_seconds] * len(newWeights))
cos.put_csv(COS_BUCKET, TRAINING_RUN["id"], newWeights)

In [3]:
##
## BOOTSTRAP TRAINING
##
# Single pass: the first TRAINING_SET_SIZE rows are passed as the training set and all
# remaining rows as the test set.

print("Training")
train_rows = dataset[:TRAINING_SET_SIZE]
holdout_rows = dataset[TRAINING_SET_SIZE:]
results = mlutils.bootstrapTrain(
    ffnn,
    train_rows,
    holdout_rows,
    TRN_CNF['lamda'],
    TRN_CNF['iterations'],
    TRN_CNF['threshold'],
    CONFIG['debug'],
)
predictions = np.nanmean(results["test_predictions"], axis=0)
print(mlutils.evaluate(ppl.onehot(predictions), ppl.onehot(test_y), 0.0))

# Save weights to Cloud Object Store
# The frame is prepared here, but the upload itself is currently disabled.
newWeights = pandas.DataFrame(results["weights"])
stamp_seconds = isect[TRAINING_SET_SIZE//2].value // 10**9
newWeights.insert(0, 'timestamp', [stamp_seconds] * len(newWeights))
#cos.put_csv(COS_BUCKET, TRAINING_RUN["id"], newWeights)

Training
............................................................(0.52083331, 1.0, 0.68493148966921835)


In [None]:
# Assess the individual market performance
# Fetch the stored predictions of this training run for each dataset and keep only the
# part returned first by ppl.intersect against the corresponding market frame.
PREDICTIONS_START = "2016-07-06"

fetched1 = miassembly.get_predictions_with_dataset_id(DATASET_ID1, TRAINING_RUN["id"], start=PREDICTIONS_START)
scores1 = ppl.intersect(fetched1, MK1)[0]

fetched2 = miassembly.get_predictions_with_dataset_id(DATASET_ID2, TRAINING_RUN["id"], start=PREDICTIONS_START)
scores2 = ppl.intersect(fetched2, MK2)[0]

In [15]:
# Evaluate each market's own predictions ('mean_all' aggregation) against that market's
# rows at the same index.
for scores, market in ((scores1, MK1), (scores2, MK2)):
    a = mlutils.aggregatePredictions([scores], method='mean_all')
    predicted = ppl.onehot(a.values)
    actual = ppl.splitCol(market.loc[a.index], mkt1_desc["features"])[1]
    display(mlutils.evaluate(predicted, ppl.onehot(actual)))

#display(evaluate(ppl.onehot(predictions), ppl.onehot(test_y)))

Won : 265.0
Lost : 250.0
Total : 515.0
Diff : 15.0
Edge : 2.91262135922%
IR : 0.660979


0.51456308

Won : 273.0
Lost : 242.0
Total : 515.0
Diff : 31.0
Edge : 6.01941747573%
IR : 1.36602


0.53009707

In [17]:
# Combine both markets' predictions with 'mean_all', then evaluate the combined signal
# against each market in turn.
a = mlutils.aggregatePredictions([scores1, scores2], method='mean_all')
for market in (MK1, MK2):
    predicted = ppl.onehot(a.values)
    actual = ppl.splitCol(market.loc[a.index], mkt1_desc["features"])[1]
    display(mlutils.evaluate(predicted, ppl.onehot(actual)))

Won : 267.0
Lost : 248.0
Total : 515.0
Diff : 19.0
Edge : 3.68932038835%
IR : 0.83724


0.51844662

Won : 260.0
Lost : 255.0
Total : 515.0
Diff : 5.0
Edge : 0.970873786408%
IR : 0.220326


0.50485438

In [18]:
# Combine both markets' predictions with 'vote_majority', then evaluate the combined
# signal against each market in turn.
a = mlutils.aggregatePredictions([scores1, scores2], method='vote_majority')
for market in (MK1, MK2):
    predicted = ppl.onehot(a.values)
    actual = ppl.splitCol(market.loc[a.index], mkt1_desc["features"])[1]
    display(mlutils.evaluate(predicted, ppl.onehot(actual)))

Won : 269.0
Lost : 246.0
Total : 515.0
Diff : 23.0
Edge : 4.46601941748%
IR : 1.0135


0.52233011

Won : 264.0
Lost : 251.0
Total : 515.0
Diff : 13.0
Edge : 2.52427184466%
IR : 0.572848


0.51262134

In [19]:
# Combine both markets' predictions with 'vote_unanimous_markets', then evaluate the
# combined signal against each market in turn.
a = mlutils.aggregatePredictions([scores1, scores2], method='vote_unanimous_markets')
for market in (MK1, MK2):
    predicted = ppl.onehot(a.values)
    actual = ppl.splitCol(market.loc[a.index], mkt1_desc["features"])[1]
    display(mlutils.evaluate(predicted, ppl.onehot(actual)))

Won : 261.0
Lost : 237.0
Total : 498.0
Diff : 24.0
Edge : 4.81927710843%
IR : 1.07547


0.52409637

Won : 255.0
Lost : 243.0
Total : 498.0
Diff : 12.0
Edge : 2.40963855422%
IR : 0.537733


0.51204818

In [20]:
# Combine both markets' predictions with 'vote_unanimous_pred', then evaluate the
# combined signal against each market in turn.
a = mlutils.aggregatePredictions([scores1, scores2], method='vote_unanimous_pred')
for market in (MK1, MK2):
    predicted = ppl.onehot(a.values)
    actual = ppl.splitCol(market.loc[a.index], mkt1_desc["features"])[1]
    display(mlutils.evaluate(predicted, ppl.onehot(actual)))

Won : 276.0
Lost : 239.0
Total : 515.0
Diff : 37.0
Edge : 7.18446601942%
IR : 1.63041


0.53592235

Won : 273.0
Lost : 242.0
Total : 515.0
Diff : 31.0
Edge : 6.01941747573%
IR : 1.36602


0.53009707

In [21]:
# Combine both markets' predictions with 'vote_unanimous_all', then evaluate the combined
# signal against each market in turn.
a = mlutils.aggregatePredictions([scores1, scores2], method='vote_unanimous_all')
for market in (MK1, MK2):
    predicted = ppl.onehot(a.values)
    actual = ppl.splitCol(market.loc[a.index], mkt1_desc["features"])[1]
    display(mlutils.evaluate(predicted, ppl.onehot(actual)))

Won : 206.0
Lost : 173.0
Total : 379.0
Diff : 33.0
Edge : 8.70712401055%
IR : 1.6951


0.54353565

Won : 202.0
Lost : 177.0
Total : 379.0
Diff : 25.0
Edge : 6.5963060686%
IR : 1.28416


0.53298151

In [3]:
# Maintenance: delete this training run's entry from the weights bucket.
# NOTE(review): destructive, and it sits on the Run All path — presumably meant to be run
# by hand only; confirm before executing the whole notebook.
cos.delete(COS_BUCKET, TRAINING_RUN["id"])

In [None]:
# Maintenance: write `newWeights` (set by one of the single-pass training cells or by the
# last iteration of the leave-one-out loop) under this training run's id.
cos.put_csv(COS_BUCKET, TRAINING_RUN["id"], newWeights)

In [None]:
# Maintenance: read back the csv stored for this training run; as the last expression of
# the cell, the result is displayed.
cos.get_csv(COS_BUCKET, TRAINING_RUN["id"])