In [None]:
import gym
from gym.spaces import Discrete, Box

import numpy as np

In [None]:
def next_state(items, state, action):
    """Apply one action to ``state`` in place and return that same object.

    The action encodes an item slot and a direction: ``action // 2`` picks
    the slot and the parity picks the direction (odd adds one unit, even
    removes one). An action whose slot is not below ``len(items)`` leaves
    ``state`` untouched.
    """
    slot, parity = divmod(action, 2)

    # Out-of-range slot: nothing to change.
    if not slot < len(items):
        return state

    delta = 1 if parity == 1 else -1
    state[slot] += delta
    return state

def calc_value(items, state, max_weight, burst_value):
    """Return ``(reward, weight)`` for the item counts held in ``state``.

    Args:
        items: Sequence indexed like ``state``; ``items[i][0]`` is the value
            and ``items[i][1]`` the weight of one unit of item ``i``.
        state: Sequence of per-item counts.
        max_weight: Largest total weight that is still allowed.
        burst_value: Reward returned instead of the summed value when the
            state is invalid.

    Returns:
        A ``(reward, weight)`` tuple. ``weight`` is always the raw total
        weight. ``reward`` is the total value, or ``burst_value`` when the
        total weight exceeds ``max_weight`` or any count is negative.
        An empty ``state`` gives ``(0, 0)`` (``(burst_value, 0)`` if
        ``max_weight`` is negative).
    """
    reward = 0
    weight = 0

    for i, count in enumerate(state):
        reward += items[i][0] * count
        weight += items[i][1] * count

    # any() instead of min(): min() raises ValueError on an empty state.
    if weight > max_weight or any(count < 0 for count in state):
        reward = burst_value

    return reward, weight

# Env

In [None]:
class Knapsack(gym.Env):
    """Gym environment whose state is a list of per-item counts.

    Each step applies one action through ``next_state`` and scores the
    resulting counts with ``calc_value``. An episode lasts a fixed number
    of steps.
    """

    def __init__(self, config):
        """Read the settings from ``config`` and build the gym spaces.

        ``config`` must provide the keys ``"items"``, ``"max_weight"``,
        ``"episode_steps"``, ``"burst_reward"`` and ``"bonus_rules"``
        (an iterable of ``(threshold, bonus)`` pairs).
        """
        self.items = config["items"]
        self.max_weight = config["max_weight"]
        self.episode_steps = config["episode_steps"]
        self.burst_reward = config["burst_reward"]
        self.bonus_rules = config["bonus_rules"]

        n = self.episode_steps

        # Two actions per item (remove / add one unit) plus one extra index
        # that next_state leaves as a no-op.
        self.action_space = Discrete(len(self.items) * 2 + 1)
        # A count changes by at most one per step, so it stays within [-n, n].
        self.observation_space = Box(low = -n, high = n, shape = (len(self.items), ))

        self.reset()

    def reset(self):
        """Start a new episode with all counts at zero; return the observation."""
        self.current_steps = 0
        self.state = [0 for _ in self.items]

        # Return a copy: next_state mutates self.state in place, which would
        # otherwise silently change an observation the caller still holds.
        return list(self.state)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        The reward is the value from ``calc_value`` plus every bonus whose
        threshold that value strictly exceeds. ``done`` becomes true once
        ``episode_steps`` steps have been taken. ``info`` is always empty.
        """
        self.state = next_state(self.items, self.state, action)

        r, _ = calc_value(self.items, self.state, self.max_weight, self.burst_reward)
        reward = r

        for (v, b) in self.bonus_rules:
            if r > v:
                reward += b

        self.current_steps += 1
        done = self.current_steps >= self.episode_steps

        # Copy for the same reason as in reset(): the next step mutates
        # self.state in place.
        return list(self.state), reward, done, {}

In [None]:
# Item catalogue as [value, weight] pairs: calc_value reads index 0 as the
# value and index 1 as the weight of one unit.
items = [[105, 10], [74, 7], [164, 15], [32, 3], [235, 22]]


In [None]:
# Trainer configuration. The "env_config" keys match the ones that
# Knapsack.__init__ reads.
config = dict(
    env=Knapsack,
    vf_clip_param=60,
    env_config=dict(
        items=items,
        episode_steps=10,
        max_weight=35,
        burst_reward=-100,
        # (threshold, bonus) pairs: Knapsack.step adds the bonus when the
        # raw value strictly exceeds the threshold.
        bonus_rules=[(375, 200)],
    ),
)


In [None]:
import ray

# ignore_reinit_error keeps this cell re-runnable: a plain ray.init() raises
# when Ray is already initialized in the current kernel. Call ray.shutdown()
# first if a completely fresh Ray instance is wanted.
ray.init(ignore_reinit_error=True)


# PPO

In [None]:
from ray.rllib.agents.ppo import PPOTrainer

# Build the PPO trainer from the notebook's config dict (environment class,
# vf_clip_param and env_config).
trainer = PPOTrainer(config = config)

# Train

In [None]:
# Per-iteration reward statistics, filled in by the training loop.
r_max, r_min, r_mean = [], [], []


In [None]:
from ray.tune.logger import pretty_print

# Run 30 training iterations; print each result and record its
# episode-reward statistics in the matching history list.
for _ in range(30):
    r = trainer.train()
    print(pretty_print(r))

    for history, key in (
        (r_max, "episode_reward_max"),
        (r_min, "episode_reward_min"),
        (r_mean, "episode_reward_mean"),
    ):
        history.append(r[key])


In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

plt.plot(r_max, label = "reward_max", color = "red")
plt.plot(r_min, label = "reward_min", color = "green")
plt.plot(r_mean, label = "reward_mean", color = "blue")

plt.legend(loc = "upper left")
plt.ylabel("reward")

plt.show()

# Evaluate

In [None]:
# Roll out one episode by hand with the trained policy, printing the action,
# the resulting counts, and the value / weight from calc_value after each
# step. The value shown is calc_value's result, without Knapsack.step's bonus.
env_cfg = config["env_config"]
s = [0] * len(items)

for _ in range(env_cfg["episode_steps"]):
    a = trainer.compute_action(s)
    s = next_state(items, s, a)
    r, w = calc_value(items, s, env_cfg["max_weight"], env_cfg["burst_reward"])

    print(f"{a}, {s}, {r}, {w}")
    

In [None]:
import collections

# Roll out 1000 episodes by hand and tally the best calc_value result seen
# within each episode.
env_cfg = config["env_config"]
rs = []

for _ in range(1000):
    s = [0] * len(items)
    # Start from the burst reward so any other value replaces it.
    r_tmp = env_cfg["burst_reward"]

    for _ in range(env_cfg["episode_steps"]):
        a = trainer.compute_action(s)
        s = next_state(items, s, a)
        r, w = calc_value(items, s, env_cfg["max_weight"], env_cfg["burst_reward"])
        r_tmp = max(r, r_tmp)

    rs.append(r_tmp)

collections.Counter(rs)

# Save

In [None]:
# Save the trainer and keep the returned value so it can be passed to
# trainer.restore(); the bare name below shows it as the cell output.
# NOTE(review): presumably a checkpoint path — confirm for the RLlib version in use.
checkpoint = trainer.save()
checkpoint

# Load

In [None]:
# Restore the trainer from the value returned by trainer.save().
trainer.restore(checkpoint)
