In [None]:
# check requirements
# !pip install cmake 'gym[atari]' scipy

import gym 
from IPython.display import clear_output,display
from time import sleep
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import os

# directory (relative to the notebook's working directory) that the CSV files
# used below are read from and written to
data_path = '../data_offline'

# draw matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Build the Taxi environment and keep the object stored on its `.env` attribute,
# then start an episode and draw the initial board.
taxi_made = gym.make("Taxi-v3")
env = taxi_made.env
env.reset()
# rules: https://gym.openai.com/envs/Taxi-v3/
env.render()

In [None]:
# Let a purely random policy drive the taxi for ten steps and show each experience.
# `done` is recorded but never checked: the loop always runs all ten steps.
state = env.reset()
done = False

for step_idx in range(10):

    # sample an action uniformly from the action space and apply it
    action = env.action_space.sample()
    newstate, reward, done, _ = env.step(action)

    # redraw the board in place, then print the (state, action, reward, next state) tuple
    clear_output(wait=True)
    env.render()
    experience = (
        ('state : %d', state),
        ('action: %d', action),
        ('reward: %d', reward),
        ('next state: %d', newstate),
    )
    print('Experience:')
    for template, value in experience:
        print(template % value)
    sleep(5)

    # the next iteration starts from where this one ended
    state = newstate

In [None]:
# Tabular Q-learning on Taxi with an epsilon-greedy behaviour policy.
env = gym.make("Taxi-v3").env

# env.seed alone does not control the exploration draws below: the coin flip
# comes from numpy's global RNG and the random action from the action space's
# own RNG, so seed all three to make the run repeatable.
env.seed(34)
env.action_space.seed(34)
np.random.seed(34)

# one row per state, one column per action
Q = np.zeros((env.observation_space.n, env.action_space.n))

epsilon = 0.1  # probability of taking a random (exploratory) action
gamma = 0.9    # discount factor
alpha = 0.1    # learning rate

for episode in range(10000):

    if (episode+1) % 500 == 0:
        print('episode %d' % (episode+1))

    state, done = env.reset(), False

    while not done:

        if np.random.uniform() < epsilon:
            # explore: uniformly random action
            action = env.action_space.sample()
        else:
            # exploit: best action under the current estimate
            action = np.argmax(Q[state])

        # get new state and reward
        newstate, reward, done, _ = env.step(action)

        # target: reward plus discounted value of the next state; nothing
        # follows a terminal transition, so do not bootstrap there
        y = reward if done else reward + gamma * np.max(Q[newstate])

        # move the estimate a fraction alpha towards the target
        Q[state, action] -= alpha * (Q[state, action] - y)

        state = newstate

In [None]:
# Drive one episode greedily with respect to the learned Q table.
env.seed(484)
state = env.reset()
done = False
while True:
    # act greedily: highest-valued action in the current state
    action = Q[state].argmax()

    # apply it; only the new state and the termination flag are used afterwards
    state, reward, done, _ = env.step(action)

    # redraw the board in place and pause so the animation is watchable
    clear_output(wait=True)
    env.render()
    sleep(.5)

    if done:
        break

# Portfolio problem

In [None]:
# Load the return / risk-free / dividend-price data and derive the inputs
# for the portfolio environment.
df = pd.read_csv(f'{data_path}/mkt_rf_dp.csv')

# return in excess of the risk-free rate
df['RetEx'] = df['Ret'].sub(df['Rfree'])

# dividend-price ratio lagged by 12 rows (a year, per the data's frequency)
df['D/P lag'] = df['D/P'].shift(12)

# Merton myopic weight from a 240-row rolling window of log returns; the
# shift(1) means the window ends at the previous row
logret_window = np.log(1 + df['Ret']).shift(1).rolling(240)
mu = logret_window.mean()
sigma2 = logret_window.var()
log_rfree = np.log(1 + df['Rfree'])
df['merton'] = (mu + 0.5 * sigma2 - log_rfree) / sigma2

# binary signal: 1 when the lagged D/P is above its own 240-row rolling median.
# NOTE(review): while the median is still NaN the comparison is False, so the
# signal is 0 (not missing) during that warm-up and is not removed by dropna().
dp_median = df['D/P lag'].rolling(240).median()
df['signal'] = df['D/P lag'].gt(dp_median).astype('int')

In [None]:
# line plot of the 0/1 signal against the frame's index
df.loc[:, 'signal'].plot()

In [None]:
# mean excess return for each signal value, drawn without error bars
sns.barplot(x='signal', y='RetEx', data=df, ci=None)

In [None]:
# keep only rows with no missing values and save them for the environment below
complete_rows = df.dropna()
complete_rows.to_csv(data_path + '/merton_signal.csv')

In [None]:
import gym
import numpy as np
from gym import error, spaces, utils
from gym.utils import seeding
import pandas as pd
import os

class MertonLogSignal(gym.Env):
    """Portfolio-choice environment replaying rows of ``merton_signal.csv``.

    * Observation: the ``signal`` column at the current row (2 discrete values).
    * Action: 0, 1 or 2, which tilts the row's ``merton`` weight by
      -0.5, 0 or +0.5 respectively.
    * Reward: the log portfolio return ``log(1 + Rfree + weight * RetEx)``.
    * Episode: starts at a randomly drawn row and lasts ``horizon`` steps.

    The CSV is read from the notebook-level ``data_path`` directory.
    """

    def __init__(self, horizon=12):
        """Set up spaces and load the data.

        Parameters
        ----------
        horizon : int
            Number of steps in one episode.
        """
        # economic parameters
        self.horizon = horizon

        # markov problem
        self.observation_space = spaces.Discrete(2)
        self.action_space = spaces.Discrete(3)

        # data
        self.data = pd.read_csv(data_path + '/merton_signal.csv')

        # private generator for start dates; None means "use numpy's global
        # RNG", which is what happens until seed() is called
        self._rng = None

    def seed(self, seed=None):
        """Seed the generator that draws episode start dates.

        Without this override, calling ``env.seed(...)`` does not influence
        ``reset``, which would keep drawing from numpy's global RNG.

        Returns
        -------
        list
            ``[seed]``, following the gym convention.
        """
        self._rng = np.random.RandomState(seed)
        return [seed]

    def step(self, action):
        """Apply ``action`` for one period.

        Returns
        -------
        tuple
            ``(signal at the next row, log portfolio return, done, {})`` where
            ``done`` is True once ``horizon`` steps have been taken.
        """
        # portfolio: Merton weight tilted by -0.5 / 0 / +0.5
        self.merton = self.data['merton'].iloc[self.date]
        self.port = self.merton + (action - 1) * 0.5

        # risk-free and excess return of the current row
        Rf = self.data['Rfree'].iloc[self.date]
        Re = self.data['RetEx'].iloc[self.date]

        # log return
        self.rp = np.log(1 + Rf + self.port * Re)

        # housekeeping: advance one row, one period less to live
        self.date += 1
        self.life -= 1
        self.dp = self.data['signal'].iloc[self.date]

        # output: state, reward, done, info
        return self.dp, self.rp, self.life == 0, {}

    def reset(self):
        """Start a new episode at a random row and return its signal."""
        # the start row must leave `horizon` further rows to step through
        n_starts = len(self.data) - self.horizon
        if n_starts <= 0:
            raise ValueError(
                'horizon (%d) must be smaller than the number of data rows (%d)'
                % (self.horizon, len(self.data))
            )

        # wealth (set here but not read by step or render)
        self.wealth = 100

        # time
        rng = np.random if self._rng is None else self._rng
        self.date = rng.choice(n_starts)  # birthday
        self.life = self.horizon  # periods of life left

        # dp
        self.dp = self.data['signal'].iloc[self.date]

        # make render() usable before the first step: show the current Merton
        # weight and no chosen portfolio yet
        self.merton = self.data['merton'].iloc[self.date]
        self.port = np.nan

        return self.dp

    def render(self):
        """Print the current date, the Merton weight and the chosen weight.

        After a ``step`` the date has already advanced, so the printed date is
        the row following the one the two weights were applied to.
        """
        print('Date: %s\n' % self.data['yyyymm'].iloc[self.date])
        print('Merton: %.2f\n' % self.merton)
        print('RoboMerton: %.2f\n' % self.port)

In [None]:
# Let a random policy invest for ten periods and display each decision.
env = MertonLogSignal()
state = env.reset()
done = False
for period in range(10):

    # pick one of the available actions at random and apply it
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # replace the previous output with the environment's current summary
    clear_output(wait=True)
    env.render()
    sleep(5)

In [None]:
%matplotlib inline
# Q learning
env = MertonLogSignal()
env.seed(34)
Q = np.zeros([env.observation_space.n, env.action_space.n])

for ep in range(50000):
    
    if (ep+1) % 10000 == 0:
        print('episode %d' % (ep+1))
        plt.figure()
        plt.plot(Q.T)
        plt.legend(['signal = 0','signal = 1'])
        plt.title('Q after Episode %d\n' % (ep+1))

    
    # initialise episode
    state , done = env.reset(), False
    
    # set learning rate and exploration parameters
    alpha = 1e-2 if ep < 10000 else 1e-3
    epsilon = 1 if ep < 50000 else 0.1
    
    while not done:
        
        # choose action using epsilon greedy
        if np.random.uniform() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])
            
        # draw new state and reward
        newstate, reward, done, info = env.step(action) 
        
        # target / update
        y = reward + np.max(Q[newstate])
        Q[state,action] -=  alpha * (Q[state,action]-y)
        
        state = newstate