In [86]:
from numba import njit
import numpy as np
import sys

In [87]:
# Numba-based indexing function from StackOverflow
# https://stackoverflow.com/a/41578614

@njit
def index(array, item):
    """
    Find the index of the first occurrence of a given item in a numpy array.

        Parameters:
                array (Array): The array to search in.
                item (float): The item to search for. Elements are compared
                with `==`, so the match must be exact.

        Returns:
                Tuple[int, ...]: The index of the first occurrence of `item` in `array`.
                The returned index is a tuple of integers representing the position
                of the item along each dimension of the array

        Raises:
            ValueError: If the item is not found in the array.
    """
    for idx, val in np.ndenumerate(array):
        if val == item:
            return idx
    # Without an explicit raise the function would fall through and return
    # None, which NumPy interprets as np.newaxis when used as a subscript,
    # so a failed lookup would silently index the wrong thing.
    raise ValueError("item not found in array")

In [88]:
class Player2D:
    def __init__(self):
        """
        Set up a player for the 2 dimensional game.

            Parameters:
                    None
            Returns:
                    Nothing, but creates the (initially empty) list of profits.
        """
        self.profits = []

    def BR(self):
        """
        Best response placeholder for the player.

            Parameters:
                    None
            Returns:
                    None; no best response is implemented here.
        """
        return None

    def demand(self, p1:float, p2:float) -> float:
        """
        Demand faced by the player charging p1 against a rival charging p2.

            Parameters:
                    p1 (float): the price for player 0
                    p2 (float): the price for player 1
            Returns:
                    0 when p1 is the higher price, half of (1 - p1) when the
                    prices are equal, and (1 - p1) when p1 is the lower price.
                    None when the prices cannot be ordered (e.g. NaN).
        """
        # Undercut by the rival: no demand at all.
        if p1 > p2:
            return 0
        # Tie: the market is split evenly.
        if p1 == p2:
            return 0.5 * (1 - p1)
        # Cheapest price: the whole market.
        if p1 < p2:
            return 1 - p1
        # None of the comparisons held (e.g. a NaN price).
        return None

    def profit(self, p1:float, p2:float):
        """
        Profit of the player charging p1 against a rival charging p2.

            Parameters:
                    p1 (float): the price for player 0
                    p2 (float): the price for player 1
            Returns:
                    The price p1 multiplied by the demand at (p1, p2).
        """
        quantity = self.demand(p1, p2)
        return p1 * quantity

In [89]:
class Qlearner2D(Player2D):
    # ONLY WORKS IN 2 DIMENSIONS FOR NOW
    def __init__(self, k, T):
        """
        Set up a Q-learner on a grid of k prices for T periods.

            Parameters:
                    k (integer): Number of prices.
                    T (integer): Number of runs
            Returns:
                    Nothing, but creates the Q-table, the price grid, the price
                    history and the learning parameters.
        """
        super().__init__()

        # Learning rate, discount factor and internal period counter.
        self.alpha = 0.3
        self.delta = 0.95
        self.t = 0

        # k equally spaced prices on [0, 1], both endpoints included.
        self.prices = np.linspace(0, 1, k)

        # Q-values: rows are own prices, columns are opponent prices.
        self.Q_table = np.zeros((k, k))

        # Price history, NaN until a period has been played.
        self.ps = np.full(T, np.nan)

        # theta is chosen so that (1 - theta)**T equals 1/1000000.
        self.theta = 1 - (1 / 1000000) ** (1 / T)
        self.epsilon = (1 - self.theta) ** self.t

    def initialize_start(self):
        """
        Draw one random price from the grid and use it for the first two
        periods, then move the internal counter to t = 2.

            Parameters:
                    None
            Returns:
                    Nothing, but fills ps[0] and ps[1] and sets t to 2.
        """
        start_price = np.random.choice(self.prices)
        self.ps[:2] = start_price
        self.t = 2

    def update(self, t, s, s_next):
        """
        Part of the exploration module.
        Performs one Q-table update for the price this learner set in period t.

            Parameters:
                    t (int): Period whose own price ps[t] is evaluated
                    s (float): Our state (opponent price) for t
                    s_next (float): Our state (opponent price) for t+1

            Returns:
                    Nothing, but overwrites one Q-table entry with a new Q-value.
        """
        own_price = self.ps[t]

        # Locate the grid positions once and reuse them below.
        row = index(self.prices, own_price)
        col = index(self.prices, s)
        next_col = index(self.prices, s_next)

        old_value = self.Q_table[row, col]
        best_next = max(self.Q_table[:, next_col])

        # Profit now, discounted profit next period, and the twice-discounted
        # best Q-value reachable from the next state.
        target = (
            self.profit(own_price, s)
            + self.delta * self.profit(own_price, s_next)
            + self.delta ** 2 * best_next
        )
        self.Q_table[row, col] = (1 - self.alpha) * old_value + self.alpha * target

    def set_price(self, s):
        """
        Part of the action module.
        With probability epsilon a random grid price is chosen; otherwise the
        price with the largest Q-value for state s is chosen.

            Parameters:
                s (float): Our state (opponent price) for t
            Returns:
                Nothing, but stores the chosen price in ps at position self.t
        """
        explore = self.epsilon >= np.random.uniform(0, 1)
        if explore:
            chosen = np.random.choice(self.prices)
        else:
            state_col = index(self.prices, s)
            chosen = self.prices[np.argmax(self.Q_table[:, state_col])]
        self.ps[self.t] = chosen

In [90]:
class Game:
    # Average profitability is sampled every REPORT_INTERVAL periods, as the
    # mean profit over the most recent WINDOW recorded periods.
    REPORT_INTERVAL = 50000
    WINDOW = 1000

    def __init__(self, k, T, N):
        """
        Initialize a new instance of the Game class.

        Parameters:
                k (int): The number of prices.
                T (int): The number of periods to simulate.
                N (int): Stored on the instance; not used by the methods of
                this class.
        Returns:
                Nothing
        """

        self.k = k
        self.t = 0
        self.T = T
        self.N = N

    def update_profit(self, player : Player2D, s):
        """
        Update the profit of a player based on the current price and state.

        This method calculates the profit of the player using the price stored
        at the player's current period (player.ps[player.t]) and the state,
        and appends it to the player's list of profits.

        Parameters:
            player (Player2D): The player whose profit should be updated.
            Must expose `ps` and `t` in addition to `profit`/`profits`.
            s (float): The current state (the opponent's price).

        Returns:
            Nothing but updates the profit array.
        """
        price = player.ps[player.t]
        player.profits.append(player.profit(price, s))

    def simulate(self):
        """
        Run the pricing game between two Q-learners for periods 2..T-1.

        In every period one player (i) updates its Q-table and sets a new
        price while the other player (j) repeats its previous price; the two
        roles are swapped after each period.

        Parameters:
            None

        Returns:
            Tuple[list, list]: the sampled average profits of player 0 and
            player 1, one entry per REPORT_INTERVAL periods.
        """
        # TODO: figure out difference in t between Q-learner and Game classes

        players = [Qlearner2D(self.k, self.T), Qlearner2D(self.k, self.T)]

        i, j = 0, 1

        profitabilities0 = []
        profitabilities1 = []

        for player in players:
            player.initialize_start()
        self.t = 2  # after initialization, t = 2
        while self.t < self.T:
            t = self.t
            players[i].t = t
            players[j].t = t

            # exploration module
            players[i].update(t - 2, players[j].ps[t - 2], players[j].ps[t - 1])

            # action module
            # set price according to state (player j's current price)
            players[i].set_price(players[j].ps[t - 1])
            players[j].ps[t] = players[j].ps[t - 1]

            # write profits for firm i and j
            self.update_profit(players[i], players[j].ps[t])
            self.update_profit(players[j], players[i].ps[t])

            # compute avg profitability of the last WINDOW runs
            if t % self.REPORT_INTERVAL == 0:
                # The profit lists only start filling at t = 2, so they are
                # shorter than t; slicing from the end guarantees the window
                # really holds the most recent WINDOW entries.
                profitabilities0.append(np.mean(players[0].profits[-self.WINDOW:]))
                profitabilities1.append(np.mean(players[1].profits[-self.WINDOW:]))

            # calculate new epsilon using decay parameter
            players[i].epsilon = (1 - players[i].theta) ** t
            players[j].epsilon = (1 - players[j].theta) ** t

            # swap roles for the next period
            i, j = j, i
            self.t += 1
        return profitabilities0, profitabilities1

In [91]:
# Build a game with 6 grid prices and 250000 periods, then run the simulation.
# prof0 / prof1 receive the sampled average profits of player 0 and player 1.

game = Game(k=6, T=250000, N=1)
prof0, prof1 = game.simulate()