In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import torch
import gpytorch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random 
from collections import namedtuple, deque 

In [2]:
# --- DQN hyperparameters -----------------------------------------------
BUFFER_SIZE = 500_000   # maximum number of experiences kept in the replay buffer
BATCH_SIZE = 4          # experiences drawn per learning step
GAMMA = 0.99            # discount factor
TAU = 0.001             # interpolation factor for the soft target-network update
LR = 0.0001             # learning rate handed to the Adam optimizer
UPDATE_EVERY = 4        # agent steps between two learning updates

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
class GPModelAlgorithm(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and a composite covariance.

    The covariance evaluated in ``forward`` is
    ``ScaleKernel(RBF + Matern(nu=0.5)) + Linear``.
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()

        # The RBF and Matern(nu=0.5) kernels are summed and share one output scale.
        rbf_kernel = gpytorch.kernels.RBFKernel()
        matern_kernel = gpytorch.kernels.MaternKernel(nu=0.5)
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.AdditiveKernel(rbf_kernel, matern_kernel)
        )

        # The linear kernel sits outside the ScaleKernel and is added in forward().
        self.linear_module = gpytorch.kernels.LinearKernel()

    def forward(self, x):
        """Return the multivariate normal over the inputs ``x``."""
        prior_mean = self.mean_module(x)
        prior_covar = self.covar_module(x) + self.linear_module(x)
        return gpytorch.distributions.MultivariateNormal(prior_mean, prior_covar)

In [4]:
class GPModel():
    """Fits a fresh ``GPModelAlgorithm`` on demand and serves point predictions.

    Params
    ======
        num_epochs (int): number of Adam steps taken by each ``train`` call
        lr (float): learning rate of the Adam optimizer used by ``train``

    Attributes
    ==========
        model: the most recently trained ``GPModelAlgorithm`` (``None`` until ``train`` runs)
        likelihood: the Gaussian likelihood trained with it (``None`` until ``train`` runs)
        predictions_lst (list): every predictive-mean array returned so far
    """

    def __init__(self, num_epochs=10, lr=0.1):
        self.model = None
        self.likelihood = None
        self.predictions_lst = []
        self.num_epochs = num_epochs
        self.lr = lr

    def train(self, X, y): #expects two tensors
        """Build a new GP on ``(X, y)`` and fit it, replacing any earlier model.

        The loss is the negative exact marginal log likelihood, minimized
        with Adam for ``self.num_epochs`` full-batch steps.
        """
        likelihood = gpytorch.likelihoods.GaussianLikelihood()
        model = GPModelAlgorithm(X, y, likelihood)

        # Training mode for both modules before optimizing.
        model.train()
        likelihood.train()

        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)

        for _ in range(self.num_epochs):
            optimizer.zero_grad()
            output = model(X)
            loss = -mll(output, y)
            loss.backward()
            optimizer.step()

        # Publish the model only once training has finished.
        self.model = model
        self.likelihood = likelihood

    def predict(self, X): #expects a tensor
        """Return the first element of the predictive mean at ``X``.

        The full predictive-mean array is also appended to ``predictions_lst``.

        Raises
        ======
            RuntimeError: if called before ``train``
        """
        if self.model is None or self.likelihood is None:
            raise RuntimeError("GPModel.predict() called before GPModel.train()")

        # Evaluation mode for both modules before predicting.
        self.model.eval()
        self.likelihood.eval()

        with torch.no_grad(), gpytorch.settings.fast_pred_var():
            predictions = self.likelihood(self.model(X))

        # detach()/cpu() are no-ops for plain CPU tensors but keep .numpy()
        # valid when the data lives on another device or still tracks gradients.
        mean = predictions.mean.detach().cpu().numpy()
        self.predictions_lst.append(mean)
        return mean[0]

In [5]:
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        # A bounded deque drops the oldest experience once buffer_size is reached.
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experiences = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # random.seed() returns None, so seed the global RNG (which sample()
        # draws from) and keep the seed value itself on the instance.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experiences(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory.

        Returns a tuple ``(states, actions, rewards, next_states, dones)`` of
        tensors on ``device``; each field is row-stacked with ``np.vstack``.
        ``actions`` is cast to long, every other field to float.
        ``random.sample`` raises ``ValueError`` when fewer than ``batch_size``
        experiences are stored.
        """
        batch = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        # Booleans go through uint8 so they become 0.0 / 1.0 floats.
        dones = torch.from_numpy(
            np.vstack([e.done for e in batch]).astype(np.uint8)
        ).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [6]:
class QNetwork(nn.Module):
    """Three-layer fully connected network mapping a state to one value per action."""

    def __init__(self, state_size, action_size, seed, fc1_unit=1024, fc2_unit = 1024):
        """
        Seed torch's global RNG and create the three linear layers.
        Params
        =======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_unit (int): Number of nodes in first hidden layer
            fc2_unit (int): Number of nodes in second hidden layer
        """
        super().__init__()
        # torch.manual_seed seeds the global generator and returns it.
        self.seed = torch.manual_seed(seed)

        # Layers are created input-to-output directly after seeding.
        self.fc1 = nn.Linear(state_size, fc1_unit)
        self.fc2 = nn.Linear(fc1_unit, fc2_unit)
        self.fc3 = nn.Linear(fc2_unit, action_size)

    def forward(self, x):
        """
        Apply the two ReLU hidden layers, then the linear output layer.
        """
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

In [7]:
class StockEnv():
    """Single-asset environment whose observation pairs a price with a GP prediction.

    The observation is ``np.array([stock_data[current_period], gp_prediction])``.
    ``available_money`` and ``invested`` are plain attributes: ``step`` only
    rescales ``invested`` by the price ratio and never modifies
    ``available_money``.
    """

    def __init__(self, trading_days, current_period, dates, stock_data, gp_model, available_money = 10000):
        """Store the episode data, fit the GP once and build the first observation."""
        self.n_period = trading_days
        self.current_period = current_period
        self.stock_data = stock_data
        self.dates = dates
        self.available_money = available_money
        self.gp_model = gp_model
        self.invested = 0

        self.state = self._observe()

        self.state_list = [self.state]
        self.action_list = []
        self.reward_list = []

    def _observe(self):
        """Refit the GP on data up to the current period and return the observation."""
        # NOTE(review): the GP is fitted on data up to and including the current
        # period and then queried at that same period; an earlier comment hinted
        # the training slice may have been meant to differ -- confirm.
        upto = self.current_period + 1
        self.gp_model.train(self.dates[:upto], self.stock_data[:upto])
        prediction_price = self.gp_model.predict(self.dates[self.current_period:upto])
        return np.array([self.stock_data[self.current_period], prediction_price])

    def reset(self, trading_days, dates, stock_data, gp_model, available_money = 10000, current_period = 0):
        """Re-initialize every attribute for a new episode and return the first observation."""
        self.n_period = trading_days
        self.current_period = current_period
        self.stock_data = stock_data
        self.dates = dates
        self.available_money = available_money
        self.gp_model = gp_model
        self.invested = 0

        self.state = self._observe()

        self.state_list = [self.state]
        self.action_list = []
        self.reward_list = []
        return self.state

    def step(self, action):
        """Advance one period and return ``(state, reward, terminate)``.

        While a next period exists, the reward is the relative price change
        times ``invested`` (forced to 0 when ``action == 0``) and ``invested``
        is rescaled by the price ratio. Past the last period the observation
        is ``np.array([0, 0])`` and the reward is 0.
        """
        now = self.current_period
        nxt = now + 1

        if nxt < self.n_period:
            price_now = self.stock_data[now]
            price_next = self.stock_data[nxt]
            if action == 0:
                reward = 0
            else:
                reward = ((price_next - price_now) / price_now) * self.invested
            self.invested = (price_next / price_now) * self.invested
        else:
            reward = 0

        # The invested amount is floored at zero.
        if self.invested < 0:
            self.invested = 0

        self.current_period = nxt
        if self.current_period < self.n_period:
            self.state = self._observe()
        else:
            self.state = np.array([0, 0])

        self.state_list.append(self.state)
        self.action_list.append(action)
        self.reward_list.append(reward)

        # The episode ends when the data is exhausted ...
        terminate = bool(self.current_period >= self.n_period)
        # ... or when there is neither cash nor a position left.
        if self.available_money == 0 and self.invested == 0:
            terminate = True
        return self.state, reward, terminate

In [8]:
def dqn(env, dates_tensor, data_tensor, gp_model, n_episodes= 10, max_t = 1000, eps_start=1.0, eps_end = 0.01, eps_decay=0.99, agent=None):
    """Deep Q-Learning

    Params
    ======
        env: environment exposing ``reset``, ``step``, ``available_money`` and ``invested``
        dates_tensor: inputs handed to ``env.reset`` as ``dates``; its length is the episode length
        data_tensor: prices handed to ``env.reset`` as ``stock_data``
        gp_model: model handed to ``env.reset`` as ``gp_model``
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        agent: object exposing ``act``, ``step`` and ``action_size``; when omitted,
            the module-level ``agent`` is used, as before

    Returns
    =======
        list with one accumulated reward per episode, including episodes cut off by ``max_t``
    """
    if agent is None:
        # Backward compatible: earlier versions always read the notebook-level `agent`.
        agent = globals().get("agent")
        if agent is None:
            raise NameError("dqn() needs an agent: pass agent=... or define a module-level `agent`")

    scores = []  # one accumulated reward per episode
    eps = eps_start
    # Action indices are centered so that index action_size/2 means "trade nothing".
    half_range = agent.action_size / 2

    for i_episode in range(1, n_episodes + 1):
        state = env.reset(trading_days=len(dates_tensor), dates=dates_tensor,
                          stock_data=data_tensor, gp_model=gp_model, current_period=0)
        score = 0
        for t in range(max_t):
            # Signed amount: positive moves cash into the position, negative moves it back.
            action = agent.act(state, eps) - half_range

            # Cannot put in more cash than is available ...
            if action > 0 and action > env.available_money:
                action = env.available_money
            # ... nor take out more than is currently invested.
            if action < 0 and np.abs(action) > env.invested:
                action = -env.invested

            env.available_money = env.available_money - action
            env.invested = env.invested + action

            next_state, reward, done = env.step(action)

            # The agent stores the transition and decides itself whether to learn.
            agent.step(state, action, reward, next_state, done)

            state = next_state
            score += reward
            if done:
                print('episode '+str(i_episode)+' : ', score)
                break

        # Record the score even when max_t ended the episode before `done`.
        scores.append(score)

        print('available money')
        print(env.available_money)
        print('invested money')
        print(env.invested)
        eps = max(eps*eps_decay,eps_end)
    return scores

In [9]:
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        =======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the global RNG (used by act())
        # and keep the seed value itself on the instance.
        random.seed(seed)
        self.seed = seed

        # Q-networks: the local one is trained, the target one tracks it via soft updates.
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Step counter modulo UPDATE_EVERY; a learning update is attempted when it wraps to 0.
        self.t_step = 0

    def step(self, state, action, reward, next_step, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from a sampled batch."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_step, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Only learn once more than BATCH_SIZE experiences are stored.
            if len(self.memory) > BATCH_SIZE:
                experience = self.memory.sample()
                self.learn(experience, GAMMA)

    def act(self, state, eps = 0):
        """Returns action for given state as per current policy
        Params
        =======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate without gradients, then put the network back in training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        =======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        criterion = torch.nn.MSELoss()

        # The local network is optimized; the target network only provides targets.
        self.qnetwork_local.train()
        self.qnetwork_target.eval()

        # Stored actions are shifted by action_size/2 here before being used as
        # column indices into the network output.
        # NOTE(review): action_size/2 is fractional for an odd action_size and the
        # replay buffer casts stored actions with .long(), so the recovered index
        # may be off by one in that case -- confirm action_size is always even.
        actions = (actions + self.action_size/2).to(dtype=torch.int64)
        # Q(s, a) of the actions actually taken, shape (batch_size, 1).
        predicted_targets = self.qnetwork_local(states).gather(1, actions)

        # Targets must not carry gradients; no_grad already guarantees that.
        with torch.no_grad():
            labels_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
        # (1 - dones) removes the bootstrap term for terminal transitions.
        labels = rewards + (gamma * labels_next * (1 - dones))

        loss = criterion(predicted_targets, labels)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        =======
            local model (PyTorch model): weights will be copied from
            target model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        # In-place parameter updates are done under no_grad instead of via `.data`.
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(),
                                                 local_model.parameters()):
                target_param.copy_(tau*local_param + (1-tau)*target_param)
            


In [10]:
# Function to fetch stock data
def get_stock_data(symbol, start_date, end_date):
    """Download price history for ``symbol`` and return weekly mean opening prices.

    The frame returned by ``yf.download`` is resampled into weekly ('W') bins
    and averaged per bin.

    Returns
    =======
        tuple of (the 'Open' column of the weekly frame, the weekly frame's index)
    """
    weekly = (
        yf.download(symbol, start=start_date, end=end_date)
        .resample('W')
        .mean()
    )
    return weekly['Open'], weekly.index

In [11]:
# Ticker and date window that are passed to get_stock_data.
btc_symbol = "BTC-USD"
start_date = "2014-09-17"   # start of the requested history
end_date = "2023-12-06"     # end of the requested history

In [12]:
# Weekly mean opening prices and the matching weekly index for the configured ticker.
btc_data, btc_dates = get_stock_data(
    btc_symbol,
    start_date,
    end_date,
)
# Integer positions 0..N-1, one per weekly entry, used instead of the timestamps.
dates = np.arange(len(btc_dates))

[*********************100%%**********************]  1 of 1 completed


In [13]:
# 1-D float32 tensor of the integer week positions.
dates_tensor = torch.tensor(dates, dtype=torch.float32).view(-1)
# Convert through NumPy instead of handing the pandas object to torch.tensor
# directly, which depends on how torch iterates or indexes it. Flatten to 1-D
# the same way as dates_tensor so both tensors always have matching shapes.
data_tensor = torch.tensor(btc_data.to_numpy(), dtype=torch.float32).view(-1)

In [14]:
# The learner must be bound to the module-level name `agent`, because dqn()
# looks it up there. state_size=2 matches the environment's
# [price, GP prediction] observation.
agent = Agent(state_size=2, action_size=20000, seed=0)
gp_model = GPModel()

env = StockEnv(
    trading_days=len(dates),
    current_period=0,
    dates=dates_tensor,
    stock_data=data_tensor,
    gp_model=gp_model,
    available_money=10000,
)

# Train and collect one accumulated reward per episode.
scores = dqn(
    env,
    dates_tensor=dates_tensor,
    data_tensor=data_tensor,
    gp_model=gp_model,
)

# Reward per episode.
plt.plot(np.arange(len(scores)), scores)
plt.xlabel('Episode #')
plt.ylabel('Reward')
plt.show()

# Persist the trained local Q-network weights to the file named 'path'.
torch.save(agent.qnetwork_local.state_dict(), 'path')



episode1: tensor(221238.8594)
available money
tensor(197067.)
invested money
tensor(52162.2422)
episode2: tensor(566036.8125)
available money
tensor(28820.)
invested money
tensor(527778.4375)
episode3: tensor(55036.1367)
available money
tensor(7662.)
invested money
tensor(67964.3125)
episode4: tensor(614549.5000)
available money
tensor(123017.)
invested money
tensor(473222.9062)
episode5: tensor(122152.2422)
available money
tensor(92607.7109)
invested money
tensor(3729.4622)
episode6: tensor(152192.9062)
available money
tensor(7758.)
invested money
tensor(182199.8906)
episode7: tensor(104858.1484)
available money
tensor(65305.4609)
invested money
tensor(50523.5000)
episode8: tensor(88991.1172)
available money
tensor(33347.2344)
invested money
tensor(70906.8516)
episode9: tensor(72164.6641)
available money
tensor(52609.0469)
invested money
tensor(29412.0742)
episode10: tensor(91905.2812)
available money
tensor(30147.0312)
invested money
tensor(78241.2969)
episode11: tensor(191441.0625)


KeyboardInterrupt: 