## Model : marketdirection
### Description :
This model uses a TensorFlow neural network to predict the direction of a market over the next Y periods, based on the values of the previous X periods.

### Model Attributes :
- FFNN
- Boosting
- Re-training of entire network for each additional period

### USP :
- Normalised market data (between 0 and 1) to highlight common patterns at any time scale.
- Utilises similar markets to increase the size of the training set


In [18]:
# Identifier of the model definition fetched from the MI API.
MODEL_ID = "28da8212-32f3-4680-bacd-0cd1c51bfe02"

# Identifiers of the two market datasets that are combined for training.
DATASET_ID1 = "4234f0f1b6fcc17f6458696a6cdf5101" # DOW
DATASET_ID2 = "3231bbe5eb2ab84eb54c9b64a8dcea55" # SPY

# Description of this training run: which model, trained on which datasets.
TRAINING_RUN = dict(
    model_id=MODEL_ID,
    datasets=[DATASET_ID1, DATASET_ID2],
)


In [19]:
#
# Get dataset from MI API #
#

import pandas
import sys
import gc
import uuid
import numpy as np

#!pip install --upgrade git+https://github.com/cwilko/quantutils.git

import quantutils.dataset.pipeline as ppl
from quantutils.api.auth import CredentialsStore
from quantutils.api.bluemix import CloudObjectStore, ObjectStore, Metrics, Logger
from quantutils.api.marketinsights import MarketInsights
from quantutils.api.assembly import MIAssembly
from quantutils.api.functions import Functions
import quantutils.model.utils as mlutils
from quantutils.model.ml import Model

# Build the service clients used throughout the notebook. All of them are
# constructed from the same credentials store.
cred = CredentialsStore()
# Receives the running precision/recall figures (metrics.send) during training.
metrics = Metrics(cred)
# MarketInsights API client: model config, datasets and training runs.
mi = MarketInsights(cred)
# File store used for the (deprecated) results.csv backups.
objStore = ObjectStore(cred)
# Cloud Object Store client: holds the trained weights as CSV.
cos = CloudObjectStore(cred)
log = Logger('MarketInsights-ML', cred)
fun = Functions(cred)
# Used by the scoring cells to fetch stored predictions for a dataset.
miassembly = MIAssembly(mi, fun)

# Logging helper function
def tag(x, y):
    """Return a log-line prefix of the form ``"(<x>:<y + 1>) "``.

    x -- string identifying the training thread (joined as-is, so it must
         already be a ``str``).
    y -- zero-based iteration index; it is rendered one-based.
    """
    return "".join(["(", x, ":", str(y + 1), ") "])

# Load the model definition; its 'training' section holds the training
# settings (retries, lamda, iterations, threshold) read by the cells below.
CONFIG = mi.get_model(MODEL_ID)
TRN_CNF = CONFIG['training']

# The training-run id is a key generated from the dataset list and the model id.
run_key_parts = [str(TRAINING_RUN[field]) for field in ("datasets", "model_id")]
TRAINING_RUN["id"] = cos.generateKey(run_key_parts)
COS_BUCKET = "marketinsights-weights"
mi.put_training_run(TRAINING_RUN)

# Fetch both markets together with their descriptors.
mkt1, mkt1_desc = mi.get_dataset_by_id(DATASET_ID1)
mkt2, mkt2_desc = mi.get_dataset_by_id(DATASET_ID2)

# Interleave (part of the "added insight" for this model)
MK1, MK2, isect = ppl.intersect(mkt1, mkt2)
dataset = ppl.interleave(MK1, MK2)

# TODO : This should be config, probably as a proportion
TEST_SET_SIZE = 432
WINDOW_SIZE = TRAINING_SET_SIZE = len(dataset) - TEST_SET_SIZE

# Labels of the held-out tail of the interleaved dataset.
mkt1_features = mkt1_desc["features"]
_, test_y = ppl.splitCol(dataset[TRAINING_SET_SIZE:], mkt1_features)

# Create ML model
ffnn = Model(mkt1_features, mkt1_desc["labels"], CONFIG)

print("".join(["Done - Training ID: ", TRAINING_RUN["id"]]))

Done - Training ID: 83c0ff9eeb24c87dbb0af766ab59ef75


In [None]:
##
## BOOTSTRAP/BOOSTING TRAINING WITH LOO
##
# For every test period that has no stored weights yet: retrain the network on
# the WINDOW_SIZE rows preceding it, predict that period (two interleaved rows,
# one per market), store the weights in Cloud Object Store and report running
# precision/recall. Periods whose timestamps are already present in the stored
# weights are skipped, so a previous partial run is continued rather than redone.

# Train thread id
train_id = str(uuid.uuid1())[:8]
#train_id = "0b4045ec"

log.info("".join(["(", train_id, ")", " Training model: ", CONFIG['model_desc'], "(",MODEL_ID,") , Training Run: ", TRAINING_RUN["id"]]))

# Bound before the try block so the error handler can always build its log
# tag, even if the failure happens before the loop assigns `i`
# (the tag then shows index 0).
i = -1

try:

    # One timestamp per test period (the dataset holds two rows per period).
    testSetIndex = isect[-(TEST_SET_SIZE//2):]
    predictions = np.array([]).reshape(0,mkt1_desc["labels"])

    if (cos.keyExists(COS_BUCKET, TRAINING_RUN["id"])):
        weights = cos.get_csv(COS_BUCKET, TRAINING_RUN["id"])
        # Stored timestamps are epoch seconds; scale back to nanoseconds.
        existing_predictions = pandas.DatetimeIndex(np.unique(weights["timestamp"]) * 10**9).tz_localize("UTC")
        resultIndex = testSetIndex.difference(existing_predictions)
    else:
        weights = pandas.DataFrame()
        resultIndex = testSetIndex

    # Positions (within the test set) of the periods still to be trained.
    prediction_idx = np.array([testSetIndex.get_loc(idx) for idx in resultIndex])

    # Row positions in test_y for both markets of each remaining period
    # (2*k and 2*k+1, interleaved in the same order as the predictions).
    labels_idx = ppl.interleave(pandas.DataFrame(prediction_idx*2), pandas.DataFrame(prediction_idx*2+1)).values.flatten()

    for i in prediction_idx:
        print("Training", end='')
        dataIdx = i * 2 + TRAINING_SET_SIZE
        training_set = dataset[dataIdx-WINDOW_SIZE:dataIdx]
        test_set = dataset[dataIdx:dataIdx+2]
        success = False
        prediction = [-1, -1]
        retry = 0
        while (not success) and (retry < TRN_CNF['training_retries']):
            try:
                ## CHOOSE BOOTSTRAP OR BOOST
                results = mlutils.boostingTrain(ffnn, training_set, test_set, TRN_CNF['lamda'], TRN_CNF['iterations'], CONFIG['debug'])
                #results = mlutils.bootstrapTrain(ffnn, training_set, test_set, TRN_CNF['lamda'], TRN_CNF['iterations'], TRN_CNF['threshold'], CONFIG['debug'])
                prediction = np.nanmean(results["test_predictions"], axis=0)
                predictions =  np.concatenate([predictions, prediction])
                success = True
            except ValueError:
                print("Value error")
                log.error("".join([tag(train_id, i), "ValueError - Retrying..."]))
                retry = retry + 1

        if (not success):
            log.error("Failed to train after several retries")
            break

        print(".")

        # Extract predictions and store them (deprecated)
        p1, p2 = [pandas.DataFrame([mkt], index=testSetIndex[i:i+1]) for mkt in prediction]
        #mi.put_predictions(p1, DATASET_ID1, TRAINING_RUN["id"], update=True)
        #mi.put_predictions(p2, DATASET_ID1, TRAINING_RUN["id"], update=True)

        # Extract weights and store them
        newWeights = pandas.DataFrame(results["weights"])
        # Tag every weight row with the period's timestamp in epoch seconds.
        newWeights.insert(0,'timestamp', [testSetIndex[i].value // 10**9] * len(newWeights))
        if (len(weights.columns)>0):
            weights.columns = newWeights.columns
        # pandas.concat replaces DataFrame.append, which was removed in pandas 2.0.
        weights = pandas.concat([weights, newWeights])
        print("Storing Weights...")
        cos.put_csv(COS_BUCKET, TRAINING_RUN["id"], weights) # Re-Write entire csv (TODO : to parquet)

        log.debug("".join([tag(train_id, i), testSetIndex[i].isoformat(), " ", DATASET_ID1, ": ", str(p1.values[0])]))
        log.debug("".join([tag(train_id, i), testSetIndex[i].isoformat(), " ", DATASET_ID2, ": ", str(p2.values[0])]))

        # Progress statistics
        res = mlutils.evaluate(ppl.onehot(predictions), ppl.onehot(test_y[labels_idx][:len(predictions)]), TRN_CNF['threshold'])
        log.info("".join([tag(train_id, i), str("Results after %d iterations, %.2f precision, %.2f recall at %.2f threshold" % (i+1, res[0], res[1], TRN_CNF['threshold']))]))
        metrics.send([{'name':'MI.precision', 'value':res[0].tolist()},{'name':'MI.recall', 'value':res[1].tolist()}])

        # Backup predictions to filestore (deprecated)
        x = 1
        for mkt in ppl.deinterleave(pandas.DataFrame(predictions)):
            mkt.index = resultIndex[:len(mkt)]
            mkt.to_csv("results.csv", header=False)
            objStore.put_file('Experiment2', "results.csv", "".join([TRAINING_RUN["id"], "_", str(x), ".csv"]) )
            x = x + 1

        # Try to free memory
        gc.collect()
except BaseException:
    # Log-and-re-raise boundary: everything (including an interrupt) is
    # reported to the remote log and then propagated unchanged.
    log.error("".join([tag(train_id, i), str("Unexpected error: %s" % sys.exc_info()[0])]))
    raise
    

In [20]:
##
## BOOSTING TRAINING
##
# Single split: train on the first TRAINING_SET_SIZE rows, predict the rest,
# print the evaluation at a 0 threshold, then store the weights.
print("Training")
train_rows = dataset[:TRAINING_SET_SIZE]
holdout_rows = dataset[TRAINING_SET_SIZE:]
results = mlutils.boostingTrain(
    ffnn,
    train_rows,
    holdout_rows,
    TRN_CNF['lamda'],
    TRN_CNF['iterations'],
    CONFIG['debug']
)
# Average the per-iteration test predictions, ignoring NaNs.
predictions = np.nanmean(results["test_predictions"], axis=0)
boost_score = mlutils.evaluate(ppl.onehot(predictions), ppl.onehot(test_y), .0)
print(boost_score)

# Save weights to Cloud Object Store
newWeights = pandas.DataFrame(results["weights"])
# Every weight row carries the timestamp (epoch seconds) of the split point.
split_epoch = isect[TRAINING_SET_SIZE//2].value // 10**9
newWeights.insert(0, 'timestamp', [split_epoch] * len(newWeights))
cos.put_csv(COS_BUCKET, TRAINING_RUN["id"], newWeights)

Training
............................................................(0.5300926, 1.0, 0.69288956504253452)


In [17]:
##
## BOOTSTRAP TRAINING
##
# Single split: train on the first TRAINING_SET_SIZE rows, predict the rest,
# print the evaluation at a 0 threshold, then store the weights.

print("Training")
train_rows = dataset[:TRAINING_SET_SIZE]
holdout_rows = dataset[TRAINING_SET_SIZE:]
results = mlutils.bootstrapTrain(
    ffnn,
    train_rows,
    holdout_rows,
    TRN_CNF['lamda'],
    TRN_CNF['iterations'],
    TRN_CNF['threshold'],
    CONFIG['debug']
)
# Average the per-iteration test predictions, ignoring NaNs.
predictions = np.nanmean(results["test_predictions"], axis=0)
bootstrap_score = mlutils.evaluate(ppl.onehot(predictions), ppl.onehot(test_y), .0)
print(bootstrap_score)

# Save weights to Cloud Object Store
newWeights = pandas.DataFrame(results["weights"])
# Every weight row carries the timestamp (epoch seconds) of the split point.
split_epoch = isect[TRAINING_SET_SIZE//2].value // 10**9
newWeights.insert(0, 'timestamp', [split_epoch] * len(newWeights))
cos.put_csv(COS_BUCKET, TRAINING_RUN["id"], newWeights)

Training
............................................................(0.52083331, 1.0, 0.68493148966921835)


In [44]:
def evaluate(predictions, data_y):
    """Print a won/lost summary and return the fraction of matching rows.

    A row is "won" when the arg-max column of ``predictions`` equals the
    arg-max column of ``data_y``. Four lines are printed (Won, Lost, Diff,
    Edge) and the hit rate ``won / rows`` is returned as ``numpy.float32``.
    """
    predicted_cls = np.argmax(predictions, axis=1)
    actual_cls = np.argmax(data_y, axis=1)

    won = np.float32(np.sum(predicted_cls == actual_cls))
    total = np.float32(actual_cls.shape[0])
    lost = total - won
    diff = won - lost
    # Net wins as a percentage of all rows.
    edge = 100 * diff / total

    print("Won :", won)
    print("Lost :", lost)
    print("Diff :", diff)
    print("Edge :", str(edge) + "%")

    return won / total

In [37]:
# Score the stored DATASET_ID2 (SPY) predictions from 2016-07-06 onwards
# against the labels of the matching MK2 rows, at a 0 threshold.
raw_scores = miassembly.get_predictions_with_dataset_id(
    DATASET_ID2,
    TRAINING_RUN["id"],
    start="2016-07-06"
)
scores = ppl.intersect(raw_scores, MK2)[0]
mk2_labels = ppl.splitCol(MK2["2016-07-06":], mkt1_desc["features"])[1]
print(mlutils.evaluate(ppl.onehot(scores.values), ppl.onehot(mk2_labels), .0))

(0.50925928, 1.0, 0.67484664708805508)


In [26]:
# Assess the individual market performance, TODO : Compare results with locally calculated performance
# Same scoring as for DATASET_ID2, here for DATASET_ID1 (DOW) against MK1.
raw_scores1 = miassembly.get_predictions_with_dataset_id(
    DATASET_ID1,
    TRAINING_RUN["id"],
    start="2016-07-06"
)
scores1 = ppl.intersect(raw_scores1, MK1)[0]
mk1_labels = ppl.splitCol(MK1["2016-07-06":], mkt1_desc["features"])[1]
print(mlutils.evaluate(ppl.onehot(scores1.values), ppl.onehot(mk1_labels), .0))

(0.55092591, 1.0, 0.71044774834523872)


In [45]:
# Won/lost breakdown for `scores` (the DATASET_ID2 predictions) against the MK2 labels from 2016-07-06.
evaluate(ppl.onehot(scores.values), ppl.onehot(ppl.splitCol(MK2["2016-07-06":], mkt1_desc["features"])[1]))

Won : 110.0
Lost : 106.0
Diff : 4.0
Edge : 1.85185185185%


0.50925928

In [46]:
# Won/lost breakdown for `scores1` (the DATASET_ID1 predictions) against the MK1 labels from 2016-07-06.
evaluate(ppl.onehot(scores1.values), ppl.onehot(ppl.splitCol(MK1["2016-07-06":], mkt1_desc["features"])[1]))

Won : 119.0
Lost : 97.0
Diff : 22.0
Edge : 10.1851851852%


0.55092591

In [47]:
# Won/lost breakdown for the locally computed `predictions` against the held-out labels `test_y`.
evaluate(ppl.onehot(predictions), ppl.onehot(test_y))

Won : 229.0
Lost : 203.0
Diff : 26.0
Edge : 6.01851851852%


0.5300926

In [5]:
# Delete the object stored under this training run's id from the weights bucket.
cos.delete(COS_BUCKET, TRAINING_RUN["id"])

In [134]:
# Upload the current `newWeights` frame as the CSV for this training run.
cos.put_csv(COS_BUCKET, TRAINING_RUN["id"], newWeights)

In [11]:
# Read back the stored weights CSV for this training run (displayed as the cell output).
cos.get_csv(COS_BUCKET, TRAINING_RUN["id"])

Unnamed: 0,timestamp,0,1,2,3,4,5,6,7,8,...,311,312,313,314,315,316,317,318,319,320
0,1467831600,3.312466,10.847818,-7.284418,12.761642,-1.591486,-12.120102,2.441128,5.913448,-5.073502,...,-18.903788,7.679415,13.185543,9.852283,-13.333435,-13.124993,17.278887,13.236541,-15.443013,1.400760
1,1467831600,-12.883047,-4.462502,11.792727,-3.287359,10.205512,5.590566,2.353325,0.141848,-14.445578,...,-19.162395,-17.597233,13.200068,-17.779320,11.068201,18.850004,15.635494,-16.071606,9.952607,16.906479
2,1467831600,3.133189,-0.116194,13.688684,7.583829,6.626789,-12.117444,10.507116,6.906179,-1.270432,...,14.541806,9.086310,-12.401361,22.728838,-9.483019,-20.009363,11.686844,9.689362,-11.075061,-9.469032
3,1467831600,-0.119519,-5.850684,9.357331,16.522095,-5.597649,-5.785295,-3.874425,6.556691,-1.583138,...,24.581789,-8.012128,17.507839,27.246597,16.734694,-22.078583,-13.090500,23.303617,21.534615,18.069738
4,1467831600,0.259647,3.096508,5.945527,3.997061,-9.602504,-3.347848,6.683752,-2.203965,6.507709,...,-22.295023,18.210531,-12.539247,-23.902081,-12.295306,24.981703,16.492365,11.920698,-17.567837,8.410412
5,1467831600,5.184369,-16.216311,-8.933708,8.512650,0.264156,25.821863,0.836845,4.690401,-5.427589,...,12.375577,21.356722,-15.936946,21.181139,9.876498,14.785489,-17.155067,-12.108348,10.530119,6.343624
6,1467831600,-20.036074,2.425073,5.072659,1.621127,10.662550,-16.336599,15.554777,-5.905240,9.782196,...,12.164492,-15.399787,-24.025648,15.092271,20.198137,-26.724188,8.491100,14.125527,21.393322,15.485247
7,1467831600,3.914138,-9.469058,10.307562,-10.121191,-0.854137,-4.355204,4.122284,13.458215,9.064775,...,16.428587,11.435040,-14.036438,-11.942908,15.470530,15.352645,-13.972067,-24.157564,8.110953,-8.149076
8,1467831600,-4.129739,3.443668,11.510314,-9.609694,-0.472901,-11.473576,9.997120,13.616072,-6.046878,...,27.120955,-20.533623,12.400377,-14.992278,-25.248348,14.748981,-6.994195,21.829988,12.213026,-7.276963
9,1467831600,17.870043,2.928711,-2.062353,-4.627346,-5.461296,-17.829369,-1.027926,-0.912719,12.150795,...,18.273010,-14.949856,18.455843,-20.901192,15.588398,11.600235,-13.380913,16.971762,-17.876619,-22.729334
