In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Flatten, InputLayer, Conv1D
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent, CEMAgent, DDPGAgent, SARSAAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

import numpy as np

from gym import Env
from gym.spaces import Box, Discrete

from os.path import exists

rng = np.random.default_rng()

In [3]:
class TakeItEasyEnv(Env):
    def __init__(self):
        self.action_space = Discrete(20, start=1)
        self.observation_space = Box(low=-100, high=96100, shape=(1,1,20,3), dtype=np.int32)
        self.state = np.zeros(shape=(20, 3), dtype=np.int32)
        self.episode_length = 19
        self.round_no = 0
        self.pieces = np.array([
            [2,1,3], [2,1,4], [2,1,8], [6,1,3], [6,1,4], [6,1,8], [7,1,3], [7,1,4], [7,1,8], 
            [2,5,3], [2,5,4], [2,5,8], [6,5,3], [6,5,4], [6,5,8], [7,5,3], [7,5,4], [7,5,8],
            [2,9,3], [2,9,4], [2,9,8], [6,9,3], [6,9,4], [6,9,8], [7,9,3], [7,9,4], [7,9,8]])
        self.cases = np.array([
            [[1,4,8,0,0], [2,5,9,13,0], [3,6,10,14,17], [7,11,15,18,0], [12,16,19,0,0]],
            [[1,2,3,0,0], [4,5,6,7,0], [8,9,10,11,12], [13,14,15,16,0], [17,18,19,0,0]], 
            [[3,7,12,0,0], [2,6,11,16,0], [1,5,10,15,19], [4,9,14,18,0], [8,13,17,0,0]]       
            ], np.int32)
        self.last_reward = 0
        self.selected_pieces = rng.choice(self.pieces, size=self.episode_length, replace=False)


        self.state[0] = self.selected_pieces[0]

    def count_points(self, state):
        points=0
        for i in range(3):
            for j in range(5):
                first_value = state[self.cases[i][j][0],i]
                for k in range(5):
                    if(self.cases[i][j][k] == 0):
                        points += first_value*k
                        break
                    if(self.state[self.cases[i][j][k], i] != first_value):
                        break
                    elif(k == 4):
                        points += first_value*(k+1)
                        break
        return points

    def reset(self):
        self.round_no = 0
        self.last_reward = 0
        self.state=np.zeros(shape=(20, 3), dtype=np.int32)
        self.selected_pieces = rng.choice(self.pieces, size=self.episode_length, replace=False)
        self.state[0] = self.selected_pieces[0]
        return self.state
    
    def step(self, action):
        self.round_no += 1
        reward = 0

        if(np.all(self.state[action] != 0) and self.round_no <= self.episode_length):
            reward = -100
            done = True
        else:
            if(self.round_no == self.episode_length):
                self.state[action] = self.state[0]
                reward = 2*self.count_points(self.state) - self.last_reward
                done = True
            else:
                self.state[action] = self.state[0]
                self.state[0] = self.selected_pieces[self.round_no]
                reward = self.count_points(self.state) - self.last_reward
                self.last_reward = self.count_points(self.state)
                done = False

        info = {}

        return self.state, reward, done, info


In [4]:
env=TakeItEasyEnv()

states = env.observation_space.shape
actions = env.action_space.n


In [5]:
def build_model(states, actions):
    model = Sequential()
    model.add(InputLayer(input_shape=(1,20,3)))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Flatten())
    model.add(Dense(actions, activation='linear'))
    return model

model = build_model(states, actions)

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1, 20, 256)        1024      
                                                                 
 dense_1 (Dense)             (None, 1, 20, 512)        131584    
                                                                 
 dense_2 (Dense)             (None, 1, 20, 256)        131328    
                                                                 
 flatten (Flatten)           (None, 5120)              0         
                                                                 
 dense_3 (Dense)             (None, 20)                102420    
                                                                 
Total params: 366,356
Trainable params: 366,356
Non-trainable params: 0
_________________________________________________________________


In [17]:
def build_agent(model, actions):
    memory = SequentialMemory(limit=20000000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=policy)
    
    return dqn

In [19]:
dqn = build_agent(model, actions)

dqn.compile(Adam(learning_rate=1e-2), metrics=['mae'])
if(exists('dqn_take-it-easy_weights.hdf5')):
    try:
        dqn.load_weights('dqn_take-it-easy_weights.hdf5')
    except:
        print('Could not load weights')
dqn.fit(env, nb_steps=100000, visualize=False, verbose=1)
dqn.save_weights('dqn_{}_weights.hdf5'.format('take-it-easy'), overwrite=True)

Training for 100000 steps ...
Interval 1 (0 steps performed)
1768 episodes - episode_reward: -99.890 [-100.000, -76.000] - loss: 323.566 - mae: 45.049 - mean_q: -45.828

Interval 2 (10000 steps performed)
1622 episodes - episode_reward: -99.757 [-100.000, -64.000] - loss: 5.282 - mae: 88.823 - mean_q: -92.859

Interval 3 (20000 steps performed)
1637 episodes - episode_reward: -99.780 [-100.000, -73.000] - loss: 3.594 - mae: 89.943 - mean_q: -94.085

Interval 4 (30000 steps performed)
1628 episodes - episode_reward: -99.768 [-100.000, -73.000] - loss: 3.596 - mae: 89.900 - mean_q: -94.034

Interval 5 (40000 steps performed)
1679 episodes - episode_reward: -99.832 [-100.000, -49.000] - loss: 3.560 - mae: 89.925 - mean_q: -94.065

Interval 6 (50000 steps performed)
1616 episodes - episode_reward: -99.830 [-100.000, -72.000] - loss: 3.538 - mae: 89.952 - mean_q: -94.104

Interval 7 (60000 steps performed)
1634 episodes - episode_reward: -99.747 [-100.000, -73.000] - loss: 3.623 - mae: 89.8

In [20]:
results = dqn.test(env, nb_episodes=150, visualize=False)
print(np.max(results.history['episode_reward']))

Testing for 150 episodes ...
Episode 1: reward: -100.000, steps: 2
Episode 2: reward: -100.000, steps: 2
Episode 3: reward: -100.000, steps: 2
Episode 4: reward: -100.000, steps: 2
Episode 5: reward: -100.000, steps: 2
Episode 6: reward: -100.000, steps: 2
Episode 7: reward: -100.000, steps: 2
Episode 8: reward: -100.000, steps: 2
Episode 9: reward: -100.000, steps: 2
Episode 10: reward: -100.000, steps: 2
Episode 11: reward: -100.000, steps: 2
Episode 12: reward: -100.000, steps: 2
Episode 13: reward: -100.000, steps: 2
Episode 14: reward: -100.000, steps: 2
Episode 15: reward: -100.000, steps: 2
Episode 16: reward: -100.000, steps: 2
Episode 17: reward: -100.000, steps: 2
Episode 18: reward: -100.000, steps: 2
Episode 19: reward: -100.000, steps: 2
Episode 20: reward: -100.000, steps: 2
Episode 21: reward: -100.000, steps: 2
Episode 22: reward: -100.000, steps: 2
Episode 23: reward: -100.000, steps: 2
Episode 24: reward: -100.000, steps: 2
Episode 25: reward: -100.000, steps: 2
Episo

In [10]:
env=TakeItEasyEnv()

episodes = 100
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    action_list = np.arange(1,20)

    len = []

    while not done:
        action = env.action_space.sample()
        print(action)
        n_state, done, reward, info = env.step(action)
        if done:
            len.append(env.round_no)
            print("Episode {} finished after {} timesteps. Reward {}".format(episode, env.round_no, reward))
            break
print(np.average(len))

15
13
13
Episode 1 finished after 3 timesteps. Reward True
5
9
19
18
19
Episode 2 finished after 5 timesteps. Reward True
17
11
1
14
3
9
9
Episode 3 finished after 7 timesteps. Reward True
11
14
17
4
15
2
17
Episode 4 finished after 7 timesteps. Reward True
15
12
15
Episode 5 finished after 3 timesteps. Reward True
15
5
6
19
7
12
12
Episode 6 finished after 7 timesteps. Reward True
16
20


IndexError: index 20 is out of bounds for axis 0 with size 20

In [None]:
pieces = np.array([
    [2,1,3], [2,1,4], [2,1,8], [6,1,3], [6,1,4], [6,1,8], [7,1,3], [7,1,4], [7,1,8], 
    [2,5,3], [2,5,4], [2,5,8], [6,5,3], [6,5,4], [6,5,8], [7,5,3], [7,5,4], [7,5,8],
    [2,9,3], [2,9,4], [2,9,8], [6,9,3], [6,9,4], [6,9,8], [7,9,3], [7,9,4], [7,9,8]])
selected_pieces = rng.choice(pieces, size=19, replace=False)
selected_pieces

array([[2, 5, 4],
       [6, 1, 4],
       [7, 5, 8],
       [2, 9, 8],
       [7, 1, 3],
       [6, 5, 3],
       [6, 1, 3],
       [2, 9, 4],
       [7, 1, 8],
       [6, 9, 8],
       [2, 9, 3],
       [2, 1, 4],
       [6, 9, 4],
       [6, 5, 8],
       [7, 9, 3],
       [6, 9, 3],
       [2, 5, 8],
       [7, 1, 4],
       [6, 5, 4]])

In [None]:
cases = np.array([
    [[1,4,8,0,0], [2,5,9,13,0], [3,6,10,14,17], [7,11,15,18,0], [12,16,19,0,0]],
    [[1,2,3,0,0], [4,5,6,7,0], [8,9,10,11,12], [13,14,15,16,0], [17,18,19,0,0]], 
    [[3,7,12,0,0], [2,6,11,16,0], [1,5,10,15,19], [4,9,14,18,0], [8,13,17,0,0]]       
    ], np.int32)


#state = np.full(shape=(19,3), fill_value=1, dtype=np.int32)
state = np.array([
            [0,9,0], [0,9,0], [0,0,0],
        [0,0,0], [0,0,0], [0,0,0], [0,0,0],
    [0,9,0], [0,9,0], [0,9,0], [0,9,0], [0,0,0],
        [0,0,0], [0,9,0], [0,9,0], [0,9,0],
            [0,0,0], [0,0,0], [0,0,0]
    ])

#state = rng.choice(pieces,19, replace=False)

visualize = f'''
                ____
               /    \                               
          ____/      \____                                    
              {state[7]}
         /    \      /    \                                         
    ____/      \____/      \____                                 
        {state[3]}     {state[12]}                                      
   /    \      /    \      /    \                                       
  /      \____/      \____/      \                                  
  {state[0]}     {state[8]}     {state[16]}                                       
  \      /    \      /    \      /                                                      
   \____/      \____/      \____/                       
        {state[4]}     {state[13]}                                 
   /    \      /    \      /    \                                       
  /      \____/      \____/      \                                      
  {state[1]}     {state[9]}     {state[17]}                                           
  \      /    \      /    \      /                                      
   \____/      \____/      \____/                                       
        {state[5]}     {state[14]}                                             
   /    \      /    \      /    \                                       
  /      \____/      \____/      \                                      
  {state[2]}     {state[10]}    {state[18]}                                           
  \      /    \      /    \      /                                                  
   \____/      \____/      \____/
        {state[6]}     {state[15]}                                                                 
        \      /    \      /                                                        
         \____/      \____/                                                     
              {state[11]}                                                   
              \      /                                                      
               \____/                                                   
''' 

#print(state)

points=0
for i in range(3):
    for j in range(5):
        first_value = state[cases[i][j][0]-1,i]
        for k in range(5):
            if(cases[i][j][k] == 0):
                points += first_value*k
                break
            if(state[cases[i][j][k]-1, i] != first_value):
                break
            elif(k == 4):
                points += first_value*(k+1)
                break

print(points)
print(visualize)

                

In [None]:
state = np.zeros(shape=(20, 3), dtype=np.int32)
state

Box(low=1,high=9,shape=(20,3))

In [None]:

class CardGameEnv(py_environment.PyEnvironment):

  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(1,), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(3,), dtype=np.int32, name='observation')
    self._state = array_spec.BoundedArraySpec(
        shape=(3,), dtype=np.int32, name='state')
    self._pieces = np.array([
            [2,1,3], [2,1,4], [2,1,8], [6,1,3], [6,1,4], [6,1,8], [7,1,3], [7,1,4], [7,1,8], 
            [2,5,3], [2,5,4], [2,5,8], [6,5,3], [6,5,4], [6,5,8], [7,5,3], [7,5,4], [7,5,8],
            [2,9,3], [2,9,4], [2,9,8], [6,9,3], [6,9,4], [6,9,8], [7,9,3], [7,9,4], [7,9,8]])
    self._episode_ended = False

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    self._state = 0
    self._episode_ended = False
    return ts.restart(np.array([self._state], dtype=np.int32))
  
  def _count_points(self, state):
    return 1 #pisteitä

  def _step(self, action):

    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      return self.reset()

    # Make sure episodes don't go on forever.
    if action == 1:
      self._episode_ended = True
    elif action == 0:
      new_card = np.random.randint(1, 11)
      self._state += new_card
    else:
      raise ValueError('`action` should be 0 or 1.')

    if self._episode_ended or self._state >= 21:
      reward = self._count_points(self._state)**2
      return ts.termination(np.array([self._state], dtype=np.int32), reward)
    else:
      return ts.transition(
          np.array([self._state], dtype=np.int32), reward=0.0, discount=0.0)
        
        


In [None]:
environment = CardGameEnv()
utils.validate_py_environment(environment)