In [1]:
!pip install tensorflow==2.4.1
!pip install gym
!pip install keras
!pip install keras-rl2

Collecting keras-rl2
[?25l  Downloading https://files.pythonhosted.org/packages/dd/34/94ffeab44eef43e22a01d82aa0ca062a97392c2c2415ba8b210e72053285/keras_rl2-1.0.4-py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 3.4MB/s 
Installing collected packages: keras-rl2
Successfully installed keras-rl2-1.0.4


In [2]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random

In [3]:
class ShowerEnv(Env):
    def __init__(self, resources=[90, 90], num_tasks=5):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(num_tasks)
        # Temperature array
        self.observation_space = Box(low=0, high=90, shape=(num_tasks, len(resources)))
        # Set state
        self.state_ = np.zeros((num_tasks, 1), dtype=int)
        self.limit_ = 0
        for lim in resources:
            self.state_ = np.append(self.state_, np.random.randint(lim, size=(num_tasks, 1)), axis=1)
            
        self.reward_ = 0
        for lim in resources:
            self.limit_+=lim   #Set the total resource limit as sum of resources
       
        self.num_tasks_ = num_tasks
        self.resources_ = resources
        
    def getSum(self, x ):
        if x[0] == 1:
            return sum(x)-1
        else:
            return 0    
    
    def step(self, actionIdx):
        ### update reward
        self.reward_ = 0
        reward = 0
        done = False
        #update the state based on action
        #case 1, if same task selected then penalize the agent
        if self.state_[actionIdx][0] == 1:
          # get the sum for selected task
            sum_res = np.sum(self.state_[actionIdx], axis=0)
            reward = -sum_res/10;
            return self.state_,reward,done,{}
        else:
            self.state_[actionIdx][0] = 1
       
        #collect all the resources for this batch until now.
        totReward = sum(np.apply_along_axis( self.getSum, axis=1, arr=self.state_ ))
        # get the sum for selected task
        sum_res = np.sum(self.state_[actionIdx], axis=0) 
        if totReward <= self.limit_:
            reward = sum_res
        else:
            reward = -sum_res/10;
            done = True
            
        return self.state_,reward,done,{}

    def render(self):
        # Implement viz
        print(self.state_)
    
    def reset(self):
        state = np.zeros((self.num_tasks_, 1), dtype=int)
        for lim in self.resources_: 
            state = np.append(state, np.random.randint(lim, size=(self.num_tasks_,1)), axis=1)
        self.state_ = state
        self.reward_ = 0
        return self.state_

In [4]:
env = ShowerEnv()

In [5]:
env.observation_space.sample()

array([[33.92447 , 14.744231],
       [79.3372  , 53.220406],
       [13.458334, 28.118078],
       [80.5857  , 57.969727],
       [83.571045, 22.232521]], dtype=float32)

In [6]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        # print(n_state)
        # print(action, " ", dqn.forward(n_state))
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:135.00000000000003
Episode:2 Score:166.2
Episode:3 Score:98.50000000000001
Episode:4 Score:96.5
Episode:5 Score:110.9
Episode:6 Score:102.10000000000001
Episode:7 Score:73.3
Episode:8 Score:163.79999999999998
Episode:9 Score:131.5
Episode:10 Score:121.80000000000001


In [7]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras.optimizers import Adam
from tensorflow import reshape

In [8]:
states = (1, 5, 3)
actions = env.action_space.n

print(states)
print(actions)

(1, 5, 3)
5


In [20]:
def build_model(states, actions, l1=32, l2=32, l3=32):
    model = Sequential()
    model.add(Flatten(input_shape=states))
    model.add(Dense(l1, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(l2, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(l3, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(actions, activation='linear'))
    return model

In [10]:
model = build_model(states, actions)

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 15)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                512       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 165       
Total params: 2,789
Trainable params: 2,789
Non-trainable params: 0
_________________________________________________________________


In [21]:
from rl.agents import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [22]:
def build_agent(model, actions):
    policy = EpsGreedyQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=200, target_model_update=1e-2)
    return dqn

In [83]:
dqn = build_agent(model, actions)

In [84]:
dqn.compile(Adam(lr=1e-4), metrics=['mae'])
dqn.fit(env, nb_steps=1000, visualize=False, verbose=1)

Training for 1000 steps ...
Interval 1 (0 steps performed)




 1000/10000 [==>...........................] - ETA: 2:15 - reward: -1.6812done, took 15.705 seconds


<tensorflow.python.keras.callbacks.History at 0x7f61d14c5b10>

In [85]:
dqn.compile(Adam(lr=1e-2), metrics=['mae'])
dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)

Training for 10000 steps ...
Interval 1 (0 steps performed)




done, took 166.658 seconds


<tensorflow.python.keras.callbacks.History at 0x7f61d259b850>

In [86]:
dqn.compile(Adam(lr=0.5*1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)

Training for 10000 steps ...
Interval 1 (0 steps performed)






<tensorflow.python.keras.callbacks.History at 0x7f61d101a5d0>

In [55]:
episodes = 1000
scores = 0
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        # print(n_state)
        # print(action, " ", dqn.forward(n_state))
        score+=reward
    scores += score
print('Episodes:{} Score:{}'.format(episodes, scores/episodes))

Episodes:1000 Score:113.23490000000008


In [23]:
def calc_score(dqn, episodes = 1000):
  scores = 0
  for episode in range(1, episodes+1):
      state = env.reset()
      done = False
      score = 0 
      i = 0
      
      while not done:
          #env.render()
          action = dqn.forward(state)
          state, reward, done, info = env.step(action)
          # print(n_state)
          # print(action, " ", dqn.forward(n_state))
          score+=reward
      scores += score
  print('Episodes:{} Score:{}'.format(episodes, scores/episodes))
  return scores/episodes

In [57]:
episodes = 1000
scores = 0
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    i = 0
    
    while not done:
        #env.render()
        action = i
        i = i+1
        state, reward, done, info = env.step(action)
        # print(n_state)
        # print(action, " ", dqn.forward(n_state))
        score+=reward
    scores += score
print('Episodes:{} Score:{}'.format(episodes, scores/episodes))

Episodes:1000 Score:121.14489999999995


In [None]:
nodes = [256, 128, 64, 32, 16]
max_reward = 0
l1 = l2 = l3 = 0

for i in nodes:
  for j in nodes:
    for k in nodes:
      model = build_model(states, actions, i, j, k)
      dqn = build_agent(model, actions)
      dqn.compile(Adam(lr=1e-4), metrics=['mae'])
      dqn.fit(env, nb_steps=2000, visualize=False, verbose=1)
      dqn.compile(Adam(lr=1e-2), metrics=['mae'])
      dqn.fit(env, nb_steps=15000, visualize=False, verbose=1)
      dqn.compile(Adam(lr=0.5*1e-3), metrics=['mae'])
      dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)
      score = calc_score(dqn)
      if (score > max_reward):
        max_reward = score
        l1 = i
        l2 = j
        l3 = k

print(max_reward, ": ", l1, " ", l2, " ", l3)

Training for 2000 steps ...
Interval 1 (0 steps performed)




  187/10000 [..............................] - ETA: 16s - reward: 3.3037

In [19]:
print(max_reward, ": ", l1, " ", l2, " ", l3)

91.29059999999991 :  256   256   256
