In [7]:
import pandas as pd
import numpy as np

from stable_baselines import A2C, DQN
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

import sys, os
sys.path.append('/home/jovyan/work/biddingStrategy/src')

from environments import SingleAgentTrainingEnv
from agents import UniformRandomAgent, GymRLAgent
from info_settings import OfferInformationSetting
from engine import MarketEngine

In [8]:
fixed_agents = [
    UniformRandomAgent('seller', 90),
    UniformRandomAgent('seller', 90),
    UniformRandomAgent('seller', 90),
    UniformRandomAgent('seller', 90),
    UniformRandomAgent('seller', 90),
    UniformRandomAgent('buyer', 110),
    UniformRandomAgent('buyer', 110),
    UniformRandomAgent('buyer', 110),
    UniformRandomAgent('buyer', 110),
    UniformRandomAgent('buyer', 110),
]

rl_agent = GymRLAgent('buyer', 110, discretization=20)
setting = OfferInformationSetting(5)

def get_env(rl_agent, fixed_agents, setting):
    return SingleAgentTrainingEnv(rl_agent, fixed_agents, setting)

env = DummyVecEnv([lambda: get_env(rl_agent, fixed_agents, setting)]) # wrap it for baselines

In [9]:
model = DQN("MlpPolicy", env, verbose=1, learning_rate=0.05)







Instructions for updating:
Use keras.layers.flatten instead.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where








In [10]:
%%time
model.learn(total_timesteps=10000)

--------------------------------------
| % time spent exploring  | 41       |
| episodes                | 100      |
| mean 100 episode reward | 7.4      |
| steps                   | 596      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 4        |
| episodes                | 200      |
| mean 100 episode reward | 6.2      |
| steps                   | 971      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 300      |
| mean 100 episode reward | 7.2      |
| steps                   | 1099     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 400      |
| mean 100 episode reward | 8.4      |
| steps                   | 1222     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 3700     |
| mean 100 episode reward | 12.9     |
| steps                   | 8748     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 3800     |
| mean 100 episode reward | 12.8     |
| steps                   | 9105     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 3900     |
| mean 100 episode reward | 13.3     |
| steps                   | 9483     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 4000     |
| mean 100 episode reward | 12.8     |
| steps                   | 9887     |
--------------------------------------
CPU times: user 1min 7s, sys: 9.9 s, total: 1min 17s
Wall time: 

<stable_baselines.deepq.dqn.DQN at 0x7f0de363fd30>

In [11]:
rl_agent.model = model

In [12]:
def get_reward(agent, deals):
    if not agent.name in deals:
        return 0

    deal_price = deals[agent.name]
    if agent.role == 'buyer':
        return agent.reservation_price - deal_price
    else:
        return deal_price - agent.reservation_price

In [13]:
def play_games(agents, setting, n_games=100, max_steps=30):
    buyer_ids =  [
        agent.name
        for agent in agents
        if agent.role == 'buyer'
    ]
    seller_ids =  [
        agent.name
        for agent in agents
        if agent.role == 'seller'
    ]
    ids = set(buyer_ids + seller_ids)
    market = MarketEngine(buyer_ids, seller_ids, max_steps=max_steps)
    
    rewards = pd.DataFrame(0, index=np.arange(n_games), columns=ids)
    for game_idx in range(n_games):
        while market.done != ids:
            observations = setting.get_states(ids, market)
            unmatched_agents = [
                agent for agent in agents
                if agent.name not in market.done
            ]
            offers = {
                agent.name: agent.get_offer(observations[agent.name])
                for agent in unmatched_agents
            }
            deals = market.step(offers)
            for agent in unmatched_agents:
                rewards[agent.name][game_idx] = get_reward(agent, deals)
        market.reset()
    return rewards

In [14]:
play_games(fixed_agents + [rl_agent], setting, 10)

Unnamed: 0,Unif_S90_06ac,Unif_S90_b5bc,Unif_S90_7678,Unif_B110_59bb,Unif_B110_df1c,Unif_B110_fbfd,Unif_S90_a41f,GymR_B110_0005,Unif_S90_e677,Unif_B110_ad39,Unif_B110_6e33
0,7,10,14,5,9,0,6,12,16,13,3
1,8,12,12,0,7,9,10,11,8,11,7
2,7,5,13,9,14,6,18,12,10,0,1
3,15,11,11,4,8,8,16,0,15,4,3
4,15,4,14,0,5,6,13,15,12,7,4
5,6,8,7,11,0,18,8,13,1,12,11
6,13,2,17,10,6,0,5,14,9,17,2
7,15,13,15,4,4,0,8,11,14,6,5
8,7,8,12,14,11,7,5,12,17,0,2
9,8,8,11,8,11,11,8,11,13,6,0


In [15]:
play_games(fixed_agents + [rl_agent], setting, 100).describe()

Unnamed: 0,Unif_S90_06ac,Unif_S90_b5bc,Unif_S90_7678,Unif_B110_59bb,Unif_B110_df1c,Unif_B110_fbfd,Unif_S90_a41f,GymR_B110_0005,Unif_S90_e677,Unif_B110_ad39,Unif_B110_6e33
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,9.23,9.65,9.01,6.92,7.33,6.98,8.92,12.61,9.19,7.62,6.78
std,3.821404,3.602398,3.691363,4.996322,4.594804,4.888618,4.0444,1.797276,3.404379,4.931408,4.986001
min,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
25%,6.75,7.0,7.0,3.0,4.75,2.0,6.0,12.0,6.0,5.0,0.0
50%,9.0,9.0,8.0,7.0,7.0,8.0,8.0,13.0,9.0,8.0,7.0
75%,12.0,13.0,11.0,11.0,11.0,11.0,11.0,14.0,12.0,11.0,11.0
max,19.0,19.0,19.0,18.0,17.0,17.0,18.0,15.0,18.0,18.0,17.0


In [16]:
rl_seller = GymRLAgent('seller', 90, model=model, name='myseller')

In [17]:
play_games(fixed_agents + [rl_seller], setting, 100).describe()

Unnamed: 0,Unif_S90_06ac,myseller,Unif_S90_b5bc,Unif_S90_7678,Unif_B110_59bb,Unif_B110_df1c,Unif_B110_fbfd,Unif_S90_a41f,Unif_S90_e677,Unif_B110_ad39,Unif_B110_6e33
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,7.11,9.94,7.18,6.45,9.84,10.17,9.41,7.38,7.1,10.18,9.48
std,4.756484,4.009635,4.806099,4.606484,3.645171,3.749896,4.087663,4.46033,4.926736,3.082633,3.546174
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,2.0
25%,3.75,8.75,4.0,2.75,8.0,7.75,7.0,4.0,3.0,8.0,7.0
50%,7.5,11.0,7.5,7.0,9.0,10.0,9.0,8.0,7.5,10.0,9.0
75%,10.0,13.0,10.0,10.0,13.0,13.0,13.0,10.25,11.0,12.0,12.0
max,17.0,15.0,17.0,17.0,18.0,18.0,17.0,18.0,18.0,17.0,17.0
