In [None]:
from datetime import datetime, timedelta
import time
from collections import deque
import numpy as np
import tensorflow as tf
import random
from save_and_load import *
from Candle import Candle
from MultiTimeframeCandleManager import MultiTimeframeCandleManager

# Load the candle series that step() later feeds, one element at a time,
# into the candle manager as 1-minute candles.
# NOTE(review): obj_load presumably comes from the star import of
# save_and_load — confirm.
candles = obj_load("NQ_1")
# Last expression of the cell: displays how many candles were loaded.
len(candles)

In [None]:
# --- Q-learning hyper-parameters -------------------------------------------
# Discount factor applied to the bootstrapped next-state Q-value.
gamma = 0.99
# Maximum number of transitions kept in the replay deque.
memory_len = 50_000
# Number of environment/training steps per pass of the training loop.
ep_len = 1_000

# --- Model bookkeeping -----------------------------------------------------
# Number of separate input components in a state (and in the model's input list).
num_model_inputs = 7
# 2x2 identity matrix: indexing it with action ids (0 or 1) yields one-hot
# masks that select the Q-value of the action that was actually taken.
m1 = np.eye(2, dtype=np.float32)


In [None]:
# --- Model inputs ----------------------------------------------------------
# Three price charts, each 60 candles x 4 values (open, high, low, close).
chart_m15 = tf.keras.layers.Input(shape=(60, 4))
chart_m5 = tf.keras.layers.Input(shape=(60, 4))
chart_m1 = tf.keras.layers.Input(shape=(60, 4))

# Ten scaled reference price levels.
pdas = tf.keras.layers.Input(shape=(10,))

# Current position and scaled open profit of the agent.
# These tensors deliberately do NOT use the names `current_position` /
# `scaled_open_profit`: the environment keeps a module-level
# `current_position` number, and binding a Keras tensor to that same name
# would corrupt the environment state whenever this cell is re-run after it.
position_in = tf.keras.layers.Input(shape=(1,))
open_profit_in = tf.keras.layers.Input(shape=(1,))

# Minute of the day (0 .. 60*24-1), embedded into 8 learned dimensions.
minutes = tf.keras.layers.Input(shape=(1,))
minutes_embed = tf.keras.layers.Embedding(input_dim=60 * 24, output_dim=8)(minutes)
minutes_embed_flat = tf.keras.layers.Flatten()(minutes_embed)

# --- Network body ----------------------------------------------------------
# Flatten the charts and concatenate every feature into one vector.
f15 = tf.keras.layers.Flatten()(chart_m15)
f5 = tf.keras.layers.Flatten()(chart_m5)
f1 = tf.keras.layers.Flatten()(chart_m1)

features = tf.keras.layers.Concatenate()(
    [f15, f5, f1, pdas, minutes_embed_flat, position_in, open_profit_in]
)

# Three fully connected ReLU layers of 1024 units each.
hidden = tf.keras.layers.Dense(1024, activation="relu")(features)
hidden = tf.keras.layers.Dense(1024, activation="relu")(hidden)
hidden = tf.keras.layers.Dense(1024, activation="relu")(hidden)

# One linear output per action (two actions).
q_values = tf.keras.layers.Dense(2, activation="linear")(hidden)

# The order of `inputs` is the order in which state components must be
# passed whenever the model is called.
model = tf.keras.Model(
    inputs=[chart_m15, chart_m5, chart_m1, pdas, minutes, position_in, open_profit_in],
    outputs=q_values,
)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

In [None]:

def relative(value, center, r):
    """Express ``value`` as a signed distance from ``center`` in units of ``r``."""
    offset = value - center
    return offset / r
        
def ret_to_scaled_inputs(ret):
    """Convert a candle-manager result into scaled model inputs.

    ``ret`` is a 7-element sequence: midnight open, midnight opening range
    high, midnight opening range low, an iterable of price levels, the
    current close (unused here), the current time (needs ``.hour`` and
    ``.minute``) and a sequence of at least three candle lists whose items
    expose ``.o``, ``.h``, ``.l`` and ``.c``.

    Every price is rescaled with ``relative`` around the midpoint of the
    midnight opening range, using half the range width as the unit.

    Returns a list ``[m15_np, m5_np, m1_np, pda_np, current_minutes]`` where
    the first three entries are arrays built from the first three charts,
    ``pda_np`` holds the scaled midnight open followed by the scaled price
    levels, and ``current_minutes`` is the minute of the day.
    """
    midnight_open, range_high, range_low, pdas, _current_close, current_time, charts = ret

    # Midpoint and half-width of the midnight opening range define the scale.
    center = (range_high + range_low) / 2
    r = (range_high - range_low) / 2

    # Midnight open first, then every price level, all in range units.
    pda_np = np.array([relative(level, center, r) for level in [midnight_open, *pdas]])

    current_minutes = current_time.hour * 60 + current_time.minute

    # One list of [o, h, l, c] rows per chart.
    scaled_charts = [
        [
            [relative(price, center, r) for price in (candle.o, candle.h, candle.l, candle.c)]
            for candle in candlesticks
        ]
        for candlesticks in charts
    ]

    m15_np = np.array(scaled_charts[0])
    m5_np = np.array(scaled_charts[1])
    m1_np = np.array(scaled_charts[2])

    return [m15_np, m5_np, m1_np, pda_np, current_minutes]

In [None]:
# Candle manager that step() pushes every 1-minute candle into.
m = MultiTimeframeCandleManager()

# Open position (0 until step() opens a short, -1, or a long, +1) and the
# close price at which that position was entered.
current_position, entry_price = 0, 0

# Running equity and the history of its values (one entry per emitted transition).
equity = 0
equity_L = [equity]

# Amount subtracted from equity every time step() changes the position.
cmm = 0.5

# Close price, state and action remembered from the previous decision step.
last_close, last_state, last_action = 0, None, 0

# Position of the next candle to read from `candles`.
index = 0

def step():
    """Advance the environment until one complete transition is available.

    Candles from the global ``candles`` list are pushed one at a time into the
    candle manager ``m``. Whenever the manager holds 60 fifteen-minute candles,
    a state is built, the model picks the greedy action (0 = be short,
    1 = be long) and the position is switched if necessary, paying ``cmm``
    per switch. As soon as a previous state exists, the transition from that
    state to the current one is returned.

    Equity is marked to market: on every decision step it changes by the
    price move since the previous decision step times the position held,
    minus any commission. The reward of a transition is the equity change
    since the last recorded equity, divided by half the midnight opening
    range.

    When the end of ``candles`` is reached the read index and the position
    bookkeeping are reset and "env reset" is printed; ``equity`` and
    ``equity_L`` keep accumulating.

    Returns
    -------
    tuple
        ``(last_state, last_action, reward, terminal, state)`` where the
        states are 7-element lists in model-input order and ``terminal`` is 1
        only when the candle just processed is the last one in ``candles``.
    """
    global index, last_close, last_state, last_action, current_position, entry_price, equity

    sarts = None
    while sarts is None:

        ret = m.push_m1_candle(candles[index])
        _, range_high, range_low, _, current_close, _, _ = ret
        # Half-width of the midnight opening range: the unit for scaled values.
        # NOTE(review): a zero-width range would make the divisions below fail
        # or produce non-finite values — confirm the manager never yields one.
        r = (range_high - range_low) / 2

        if len(m.m15_candles) == 60:

            # Mark-to-market: only the move since the previous decision step
            # is booked. Adding the full open profit since entry on every bar
            # would count the same gain again on each step the position is held.
            equity += (current_close - last_close) * current_position

            # Total unrealised profit of the open position, used as a feature.
            open_profit = (current_close - entry_price) * current_position
            scaled_open_profit = open_profit / r

            state = ret_to_scaled_inputs(ret) + [current_position, scaled_open_profit]

            if last_state is not None:
                reward = (equity - equity_L[-1]) / r
                equity_L.append(equity)
                terminal = 1 if index + 1 == len(candles) else 0
                sarts = last_state, last_action, reward, terminal, state

            # Greedy action from the model; each component gets a batch axis.
            output = model([tf.expand_dims(component, 0) for component in state])

            last_action = np.argmax(output)
            last_close = current_close
            last_state = state

            # Action 0 means "be short", action 1 means "be long"; switching
            # re-enters at the current close and costs one commission.
            target_position = -1 if last_action == 0 else 1
            if current_position != target_position:
                current_position = target_position
                entry_price = current_close
                equity -= cmm

        index += 1
        if index == len(candles):
            # Wrap around to the start of the data and flatten the position.
            # NOTE(review): the candle manager `m` is not reset here, so it
            # still holds candles from the end of the series — confirm intended.
            index = 0
            current_position = 0
            entry_price = 0
            last_close = 0
            last_state = None
            last_action = 0
            print("env reset")

    return sarts

In [None]:
@tf.function(reduce_retracing=True)
def get_target_q(next_states, rewards, terminals):
    """Compute one-step Q-learning targets for a batch.

    The global ``model`` is evaluated on ``next_states``; the largest Q-value
    per sample is discounted by ``gamma``, zeroed where ``terminals`` is 1,
    and added to ``rewards``.
    """
    next_q_values = model(next_states)
    best_next_q = tf.math.reduce_max(next_q_values, axis=1)
    # (1 - terminals) removes the bootstrap term for terminal transitions.
    bootstrap = best_next_q * gamma * (1 - terminals)
    return bootstrap + rewards

@tf.function(reduce_retracing=True)
def tstep(states, masks, rewards, terminals, next_states):
    """Run one gradient step on the squared TD error of a batch.

    ``masks`` holds one row per sample and is multiplied with the model
    output so that only the Q-value of the taken action contributes.
    Targets come from ``get_target_q`` and are computed outside the gradient
    tape, so no gradient flows through them.

    Returns the mean loss and the mean estimated Q-value of the taken actions.
    """
    targets = get_target_q(next_states, rewards, terminals)

    with tf.GradientTape() as tape:
        all_q_values = model(states, training=True)
        # Keep only the Q-value belonging to the action that was taken.
        estimated_q_values = tf.math.reduce_sum(all_q_values * masks, axis=1)
        squared_error = tf.math.square(targets - estimated_q_values)
        loss = tf.reduce_mean(squared_error)

    weights = model.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    return loss, tf.reduce_mean(estimated_q_values)

In [None]:
def run():
    """Take one environment step, store it, and train on a replay sample.

    The new transition from ``step()`` is appended to ``sarts_memory``; up to
    32 stored transitions are then drawn at random and passed to ``tstep``.

    Returns ``(loss, q, reward, action)`` where ``loss`` and ``q`` come from
    ``tstep`` and ``reward`` / ``action`` belong to the transition just taken.
    """
    def stack_components(samples):
        # One float32 array per model input, stacked along the batch axis.
        return [
            np.array([sample[i] for sample in samples], dtype="float32")
            for i in range(num_model_inputs)
        ]

    sarts = step()
    sarts_memory.append(sarts)

    batch = random.sample(sarts_memory, min(32, len(sarts_memory)))
    # Transpose the batch of (s, a, r, t, s') tuples into per-field lists.
    states, actions, rewards, terminals, next_states = map(list, zip(*batch))

    rewards = np.array(rewards, dtype="float32")
    terminals = np.array(terminals, dtype="float32")

    next_states_array = stack_components(next_states)
    states_array = stack_components(states)

    # Row lookup in the identity matrix gives one-hot masks of the taken actions.
    masks = np.array(m1[actions], dtype="float32")

    loss, q = tstep(states_array, masks, rewards, terminals, next_states_array)

    return loss, q, sarts[2], sarts[1]

In [None]:
# Replay memory shared with run(); the oldest transitions drop out once full.
sarts_memory = deque(maxlen=memory_len)

# Per-pass averages of loss, estimated Q-value and reward.
loss_mean, q_mean, rewards_mean = [], [], []

for _ in range(100):
    # Metrics collected during this pass only.
    loss, q, rewards = [], [], []
    progbar = tf.keras.utils.Progbar(ep_len)

    for i in range(ep_len):
        c_loss, c_q, c_rewards, c_action = run()

        loss.append(c_loss)
        q.append(c_q)
        rewards.append(c_rewards)

        # Progbar displays the running average of each named value.
        progbar.update(
            i + 1,
            values=[
                ("loss", c_loss),
                ("qv", c_q),
                ("reward", c_rewards),
                ("avg_action", c_action),
            ],
        )

    loss_mean.append(np.mean(loss))
    q_mean.append(np.mean(q))
    rewards_mean.append(np.mean(rewards))

In [None]:
# NOTE(review): this import sits in the last cell; notebook convention is to
# keep all imports together in the first cell.
import matplotlib.pyplot as plt
# Plot the equity history recorded by step() (one point per emitted transition).
plt.plot(equity_L)
# Last expression of the cell: displays the current value of the running equity.
equity