# Libraries

In [None]:
import gymnasium as gym
from gymnasium import spaces, vector
import ray
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import numpy as np
import csv
import random
import logging

# Parameters

In [None]:
# Total number of customers split between the firms every period.
CUSTOMERS = 3_000

# Period count used by the environment to decide when an episode ends.
PERIODS = 100

# Opening price of each firm; the length of this list sets the number of firms.
STARTING_PRICES = [300] * 3

# Largest amount a firm may lower / raise its price within a single period.
LARGEST_DISCOUNT = 30
LARGEST_INCREASE = 30

# 1/INERTIA is added to every firm's inverse-price weight when customers are
# allocated, so a very large value makes that term negligible.
# Alternative considered: INERTIA = min(prices) / 10
INERTIA = 10 ** 10

# Functions

In [None]:
def calculate_market_share(prices, customers=None, inertia=None):
    """Split the customer base between firms according to their prices.

    Each firm gets a weight of ``1 / inertia + 1 / (price + 0.01)``, so a
    lower price earns a larger share. Shares are floored to whole customers
    and any customers lost to flooring are handed out one at a time, starting
    from the first firm, so the result always sums to ``customers``.

    Args:
        prices: Sequence with one price per firm. Entries may be plain
            numbers or single-element NumPy arrays.
        customers: Total customers to distribute. Defaults to ``CUSTOMERS``.
        inertia: Non-zero inertia constant. Defaults to ``INERTIA``.

    Returns:
        A list of ints, one customer count per firm, in the order of
        ``prices``. An empty ``prices`` yields an empty list.

    Raises:
        ValueError: If an entry of ``prices`` holds more than one value.
    """
    total_customers = CUSTOMERS if customers is None else customers
    inertia_value = INERTIA if inertia is None else inertia

    # Unwrap single-element arrays into plain floats. Calling int() on a
    # shape-(1,) array is deprecated in recent NumPy and emits a warning.
    price_values = [np.asarray(price, dtype=float).item() for price in prices]
    num_companies = len(price_values)
    if num_companies == 0:
        # No firms: nothing to allocate (and avoids a modulo-by-zero below).
        return []

    # Inverse-price weights; the 0.01 offset keeps a zero price finite.
    weights = [(1 / inertia_value) + (1 / (price + 0.01)) for price in price_values]
    total_weight = sum(weights)

    # Floor each proportional share to a whole number of customers.
    market_shares = [int((weight / total_weight) * total_customers) for weight in weights]
    difference = total_customers - sum(market_shares)

    # Hand the rounding remainder out round-robin from the first firm.
    adjustment = 1 if difference > 0 else -1
    for i in range(abs(difference)):
        market_shares[i % num_companies] += adjustment

    return market_shares

In [None]:
# Sanity check: identical starting prices should split the customers evenly.
print(calculate_market_share(STARTING_PRICES))

[1000, 1000, 1000]


In [None]:
# Scratch calculation: displays the value of 10**8; the result is not stored.
10**8

100000000

In [None]:
class InertiaEnv(MultiAgentEnv):
    """Multi-agent pricing environment with one agent per firm.

    Every period each firm moves its price by an amount within
    ``[-LARGEST_DISCOUNT, LARGEST_INCREASE]``. Customers are then divided
    between the firms by ``calculate_market_share`` and each firm is rewarded
    with its revenue (price * quantity) for that period.
    """

    def __init__(self, seed=None):
        """Build the per-agent spaces and put the market in its start state.

        Args:
            seed: Currently unused; kept so existing callers that pass a
                positional argument keep working.
        """
        super().__init__()
        # logging.basicConfig(level=logging.DEBUG)
        self.t_steps = 0  # steps taken over the lifetime of this instance
        self.num_agents = len(STARTING_PRICES)
        self._agent_ids = [f'agent_{i}' for i in range(self.num_agents)]
        # Position of every agent in the price/quantity lists, so lookups do
        # not depend on the iteration order of an incoming action dict.
        self._agent_index = {
            agent_id: i for i, agent_id in enumerate(self._agent_ids)
        }
        # Prices are clipped to [0, max_price] after every step.
        self.max_price = max(STARTING_PRICES) + 30 * LARGEST_INCREASE

        self.action_space = spaces.Dict({
            agent: spaces.Box(
                low=-LARGEST_DISCOUNT,
                high=LARGEST_INCREASE,
                shape=(1,),
                dtype=np.int32,
            )
            for agent in self._agent_ids
        })

        self.observation_space = spaces.Dict({
            agent: spaces.Dict({
                'price': spaces.Box(
                    low=0, high=self.max_price, shape=(1,), dtype=np.int32),
                'market_prices': spaces.Box(
                    low=0, high=self.max_price,
                    shape=(self.num_agents,), dtype=np.int32),
                # Quantities are customer counts, so they are bounded by the
                # customer base rather than by the price cap.
                'market_quantities': spaces.Box(
                    low=0, high=CUSTOMERS,
                    shape=(self.num_agents,), dtype=np.int32),
            })
            for agent in self._agent_ids
        })
        self.reset()

    def step(self, actions):
        """Apply each agent's price change and advance one period.

        Args:
            actions: Dict mapping agent id to its price change. Values may be
                scalars or single-element arrays (as sampled from the Box
                action space).

        Returns:
            Tuple ``(obs, rewards, terminateds, truncateds, info)`` of dicts
            keyed by the agent ids present in ``actions``; ``terminateds``
            and ``truncateds`` additionally carry the ``'__all__'`` key.

        Raises:
            KeyError: If ``actions`` contains an unknown agent id.
        """
        self.t_steps += 1
        self.current_period += 1

        # Work on a copy so observations handed out earlier are not mutated.
        prices = list(self.prices)
        for agent_id, action in actions.items():
            idx = self._agent_index[agent_id]
            # Reduce the (possibly array-valued) action to a plain int and
            # keep it inside the permitted per-period change.
            raw_change = np.asarray(action, dtype=np.float64).reshape(-1)[0]
            change = int(np.clip(
                np.rint(raw_change), -LARGEST_DISCOUNT, LARGEST_INCREASE))
            prices[idx] = min(max(prices[idx] + change, 0), self.max_price)

        # This formula will be updated with the mathematical model generated
        # from the literature review.
        self.prices = prices
        self.quantities = calculate_market_share(prices)

        # The episode ends once PERIODS steps have been played. (Using ">"
        # here would run PERIODS + 1 steps.)
        episode_over = self.current_period >= PERIODS

        obs = {}
        rewards = {}
        terminateds = {}
        truncateds = {}
        for agent_id in actions:
            idx = self._agent_index[agent_id]
            obs[agent_id] = self._get_obs(idx)
            rewards[agent_id] = float(self.quantities[idx] * prices[idx])
            terminateds[agent_id] = episode_over
            truncateds[agent_id] = False
        terminateds['__all__'] = episode_over
        truncateds['__all__'] = False

        return obs, rewards, terminateds, truncateds, {}

    def reset(self, *, seed=None, options=None):
        """Restore starting prices/quantities and return initial observations.

        Args:
            seed: Accepted for API compatibility; the environment has no
                randomness of its own, so it is not used.
            options: Accepted for API compatibility; not used.

        Returns:
            Tuple ``(obs, info)`` where ``obs`` maps every agent id to its
            observation and ``info`` is an empty dict.
        """
        self.current_period = 0
        self.prices = list(STARTING_PRICES)
        self.quantities = calculate_market_share(self.prices)
        # Snapshot of the initial state per agent; step() does not update it.
        self.states = {
            agent_id: {
                'price': STARTING_PRICES[i],
                'market_prices': np.array(STARTING_PRICES),
                'market_quantities': np.array(self.quantities),
            }
            for i, agent_id in enumerate(self._agent_ids)
        }
        obs = {
            agent_id: self._get_obs(i)
            for i, agent_id in enumerate(self._agent_ids)
        }
        return obs, {}

    def _get_obs(self, agent_id):
        """Build the observation for one agent.

        Args:
            agent_id: Integer position of the agent in the price list
                (despite the name, not the ``'agent_i'`` string).

        Returns:
            Dict with the agent's own price, all market prices and all market
            quantities, each as a fresh int32 array matching the declared
            observation space.
        """
        return {
            'price': np.array([self.prices[agent_id]], dtype=np.int32),
            'market_prices': np.array(self.prices, dtype=np.int32),
            'market_quantities': np.array(self.quantities, dtype=np.int32),
        }


In [None]:
# Smoke test: build the environment and inspect the initial observations.
env = InertiaEnv()
obs, info =env.reset()
obs

{'agent_0': {'price': 300,
  'market_prices': [300, 300, 300],
  'market_quantities': [1000, 1000, 1000]},
 'agent_1': {'price': 300,
  'market_prices': [300, 300, 300],
  'market_quantities': [1000, 1000, 1000]},
 'agent_2': {'price': 300,
  'market_prices': [300, 300, 300],
  'market_quantities': [1000, 1000, 1000]}}

In [None]:
# Take one step with actions from env.action_space_sample() and display the
# (obs, rewards, terminateds, truncateds, info) tuple that step() returns.
env.step(env.action_space_sample())

  market_shares = [int(share) for share in market_shares]


({'agent_0': {'price': array([303]),
   'market_prices': [array([303]), array([301]), array([298])],
   'market_quantities': [993, 999, 1008]},
  'agent_1': {'price': array([301]),
   'market_prices': [array([303]), array([301]), array([298])],
   'market_quantities': [993, 999, 1008]},
  'agent_2': {'price': array([298]),
   'market_prices': [array([303]), array([301]), array([298])],
   'market_quantities': [993, 999, 1008]}},
 {'agent_0': array([300879]),
  'agent_1': array([300699]),
  'agent_2': array([300384])},
 {'agent_0': False, 'agent_1': False, 'agent_2': False, '__all__': False},
 {'agent_0': False, 'agent_1': False, 'agent_2': False, '__all__': False},
 {})

In [None]:
# Reset the environment and display the starting observations again.
env.reset()

({'agent_0': {'price': 300,
   'market_prices': [300, 300, 300],
   'market_quantities': [1000, 1000, 1000]},
  'agent_1': {'price': 300,
   'market_prices': [300, 300, 300],
   'market_quantities': [1000, 1000, 1000]},
  'agent_2': {'price': 300,
   'market_prices': [300, 300, 300],
   'market_quantities': [1000, 1000, 1000]}},
 {})

# Training

In [None]:
# Shut down any Ray runtime left over from a previous run of this cell, then
# start a fresh one.
if ray.is_initialized():
  ray.shutdown()
ray.init(ignore_reinit_error=True)

In [None]:
# Display the resources Ray currently reports as available.
ray.available_resources()

In [None]:
import os
# Show the current working directory as a reference point for choosing a path.
print(os.getcwd())
# Change this to a directory on your machine; it is passed to the Tune run
# configuration below as `storage_path`.
save_dir = ""

In [None]:
from ray import air, tune
from ray.rllib.algorithms.ppo import PPOConfig


from ray.rllib.models import ModelCatalog
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check_learning_achieved
from functools import partial

# Stopping limits for the Tune run.
timesteps_total = 500000  # 1000000
max_training_iteration = 10000

# Take the agent ids from the environment defined in this notebook so the
# policy setup always matches the number of firms.
agent_ids = InertiaEnv()._agent_ids
num_agents = len(agent_ids)

# Symmetric: every agent shares one policy. Asymmetric: one policy per agent.
sym_policies = {agent_id: "policy_agent_0" for agent_id in agent_ids}
asym_policies = {agent_id: f"policy_{agent_id}" for agent_id in agent_ids}


def policy_mapping_fn(agent_id, episode, worker, *, policies=None, **kwargs):
    """Return the policy id assigned to ``agent_id`` in ``policies``."""
    return policies[agent_id]


policies = asym_policies  # switch to sym_policies for a shared policy
num_policies = len(set(policies.values()))
policy_mapping = partial(policy_mapping_fn, policies=policies)

config = (
    PPOConfig()
    .environment(InertiaEnv)
    .framework('torch')
    .training(train_batch_size=5000)
    .resources(num_gpus=0)
    .debugging(seed=0)
    .rollouts(
        num_rollout_workers=10,
        num_envs_per_worker=5,
        rollout_fragment_length="auto",
    )
    # De-duplicate the ids: the symmetric setup maps every agent to the same
    # policy, which would otherwise be listed once per agent.
    .multi_agent(
        policies=sorted(set(policies.values())),
        policy_mapping_fn=policy_mapping,
    )
)

# PPO optimisation settings.
config.num_sgd_iter = 10
config.sgd_minibatch_size = 1000
config.entropy_coeff = 0.01

stop = {
    "timesteps_total": timesteps_total,
    "training_iteration": max_training_iteration,
}


# The setup here differs slightly from the single-agent version (the config
# and the results object are built differently). It can be made identical to
# the single-agent version if this approach does not work.
results = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(
        # An empty save_dir is not a usable path; fall back to Ray's default
        # results location in that case.
        storage_path=save_dir or None,
        stop=stop,
        # Checkpoint every 5 training iterations and once more at the end.
        checkpoint_config=air.CheckpointConfig(
            checkpoint_frequency=5,
            checkpoint_at_end=True,
        ),
    ),
).fit()


###if args.as_test:
 ###   check_learning_achieved(results, args.stop_reward)
ray.shutdown()