In [None]:
import gym.spaces
import numpy as np
import gym
import tensorflow as tf
from gym.wrappers import Monitor
import cv2
import random
import collections

In [None]:
# Headless-rendering setup: install Xvfb (plus OpenGL/ffmpeg packages) and the
# pyvirtualdisplay wrapper, then start a 1400x900 invisible virtual display.
# NOTE(review): presumably required so the environment can render frames for
# video recording on a machine without a screen (e.g. Colab) — confirm.
# The `if True:` guard acts as a manual on/off switch for this cell.
if True: 
  
    # Shell commands; their output is discarded.
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !pip install pyvirtualdisplay > /dev/null 2>&1

    from pyvirtualdisplay import Display
    # visible=0 keeps the display off-screen.
    display = Display(visible=0, size=(1400, 900))
    display.start()

In [None]:
def OurModel(input_shape, n_actions):
    """Build the convolutional Q-network.

    Args:
        input_shape: Shape of a single observation, passed to the first
            convolution as `input_shape`.
        n_actions: Number of units in the final dense layer (one output
            per action, no activation).

    Returns:
        An uncompiled `tf.keras.Sequential` model.
    """
    layers = tf.keras.layers

    # Three ReLU convolutions with shrinking kernels and strides.
    feature_extractor = [
        layers.Conv2D(32, 8, strides=4, input_shape=input_shape, activation='relu'),
        layers.Conv2D(64, 4, strides=2, activation='relu'),
        layers.Conv2D(64, 3, strides=1, activation='relu'),
    ]
    # Flattened features -> 512 hidden units -> one linear output per action.
    q_head = [
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.Dense(n_actions),
    ]
    return tf.keras.Sequential(feature_extractor + q_head)

Объявление классов для подготовительных операций (уменьшение размеров кадра до 84x84 и перевод в оттенки серого для ускорения обучения). В классе BufferWrapper пространство наблюдений формируется из четырех последовательных кадров, благодаря чему нейросеть в состоянии отследить динамику игрового процесса.

In [None]:
class FireResetEnv(gym.Wrapper):
    """Wrapper that performs two warm-up actions (1, then 2) after every reset.

    The constructor requires that the unwrapped environment exposes at least
    three actions and that action 1 is named 'FIRE'.
    """

    def __init__(self, env=None):
        super().__init__(env)
        action_meanings = env.unwrapped.get_action_meanings()
        # Check the length first so a too-short action set fails the assert
        # instead of raising IndexError on the lookup below.
        assert len(action_meanings) >= 3
        assert action_meanings[1] == 'FIRE'

    def step(self, action):
        """Forward the action to the wrapped environment unchanged."""
        return self.env.step(action)

    def reset(self):
        """Reset the environment, then take action 1 followed by action 2.

        Returns:
            The observation after the warm-up actions. If the second warm-up
            step ends the episode, the environment is reset again and the
            observation of that fresh episode is returned (previously the
            observation of the finished episode leaked out).
        """
        self.env.reset()
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset()
        obs, _, done, _ = self.env.step(2)
        if done:
            # Keep the observation in sync with the restarted episode.
            obs = self.env.reset()
        return obs

class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action for `skip` frames and max-pool the latest two frames.

    Rewards collected during the repeated steps are summed. The returned
    observation is the element-wise maximum over the (at most two) most
    recent raw observations.
    """

    def __init__(self, env=None, skip=4):
        super().__init__(env)
        self._skip = skip
        # maxlen=2: only the two newest raw frames take part in the max.
        self._obs_buffer = collections.deque(maxlen=2)

    def step(self, action):
        """Apply `action` up to `skip` times, stopping early when the episode ends."""
        reward_sum = 0.0
        episode_over = None
        for _repeat in range(self._skip):
            frame, frame_reward, episode_over, info = self.env.step(action)
            self._obs_buffer.append(frame)
            reward_sum += frame_reward
            if episode_over:
                break
        recent_frames = np.stack(self._obs_buffer)
        pooled_frame = np.max(recent_frames, axis=0)
        return pooled_frame, reward_sum, episode_over, info

    def reset(self):
        """Clear the frame buffer, reset the environment and buffer its first frame."""
        self._obs_buffer.clear()
        first_frame = self.env.reset()
        self._obs_buffer.append(first_frame)
        return first_frame


class ProcessFrame84(gym.ObservationWrapper):
    """Convert RGB frames into 84x84x1 uint8 grayscale observations."""

    def __init__(self, env=None):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8
        )

    def observation(self, obs):
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """Grayscale, downscale and crop a single frame.

        Accepts frames holding 210x160x3 or 250x160x3 values; any other size
        trips the "Unknown resolution." assertion. The frame is converted to
        luminance (0.299 R + 0.587 G + 0.114 B), resized to 84 wide by 110
        high, cropped to rows 18..101 and returned as an (84, 84, 1) uint8
        array.
        """
        for height in (210, 250):
            if frame.size == height * 160 * 3:
                img = np.reshape(frame, [height, 160, 3]).astype(np.float32)
                break
        else:
            assert False, "Unknown resolution."

        red, green, blue = img[:, :, 0], img[:, :, 1], img[:, :, 2]
        luminance = red * 0.299 + green * 0.587 + blue * 0.114

        # cv2 takes (width, height): the result has 110 rows and 84 columns.
        downscaled = cv2.resize(luminance, (84, 110), interpolation=cv2.INTER_AREA)
        cropped = downscaled[18:102, :]
        return np.reshape(cropped, [84, 84, 1]).astype(np.uint8)


class BufferWrapper(gym.ObservationWrapper):
    """Stack the most recent `n_steps` observations along the last axis.

    The observation space is the wrapped space with its bounds repeated
    `n_steps` times on the last axis. Note that `observation` returns the
    internal buffer itself, which is updated in place on every call.
    """

    def __init__(self, env, n_steps, dtype=np.float32):
        super().__init__(env)
        self.dtype = dtype
        base_space = env.observation_space
        stacked_low = base_space.low.repeat(n_steps, axis=-1)
        stacked_high = base_space.high.repeat(n_steps, axis=-1)
        self.observation_space = gym.spaces.Box(stacked_low, stacked_high, dtype=dtype)

    def reset(self):
        """Start from an all-zero buffer and push the first observation into it."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        first_obs = self.env.reset()
        return self.observation(first_obs)

    def observation(self, observation):
        """Drop the oldest slice, append the last channel of `observation`."""
        window = self.buffer
        # Shift every slice one position towards index 0 ...
        window[:, :, :-1] = window[:, :, 1:]
        # ... and store the newest frame in the freed last slot.
        window[:, :, -1] = observation[:, :, -1]
        return window


class ImageToKeras(gym.ObservationWrapper):
    """Redeclare the observation space as float32 in [0.0, 1.0].

    The new space keeps the first, second and last dimensions of the wrapped
    space's shape. Observations themselves are passed through unchanged.
    """

    def __init__(self, env):
        super().__init__(env)
        dims = self.observation_space.shape
        keras_shape = (dims[0], dims[1], dims[-1])
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=keras_shape, dtype=np.float32
        )

    def observation(self, observation):
        # Only the declared space changes; the data is untouched.
        return observation


class ScaledFloatFrame(gym.ObservationWrapper):
    """Return observations as float32 arrays divided by 255."""

    def observation(self, obs):
        as_float = np.array(obs).astype(np.float32)
        return as_float / 255.0

def make_env(env_name):
    """Create `env_name` and apply the preprocessing wrappers.

    Wrapping order (innermost first): MaxAndSkipEnv, FireResetEnv,
    ProcessFrame84, ImageToKeras, BufferWrapper with 4 stacked frames,
    ScaledFloatFrame.
    """
    raw_env = gym.make(env_name)
    frame_env = ProcessFrame84(FireResetEnv(MaxAndSkipEnv(raw_env)))
    stacked_env = BufferWrapper(ImageToKeras(frame_env), 4)
    return ScaledFloatFrame(stacked_env)

In [None]:
# Build the Pong environment with the full preprocessing wrapper chain from make_env.
env = make_env('PongNoFrameskip-v4')

In [None]:
class DQN:
    """Deep Q-learning agent with experience replay and a target network.

    Attributes:
        env: Environment the agent acts in.
        EPISODES: Number of episodes `run` trains for.
        memory: Bounded deque of `experience(s, a, r, done, s1)` tuples.
        gamma: Discount factor for the Bellman target.
        eps / eps_min / eps_decay: Epsilon-greedy exploration rate, its lower
            bound and the multiplicative decay applied on every `act` call.
        BATCH_SIZE: Number of transitions sampled per training step.
        update_model_steps: Interval (in environment steps) between
            copies of the online weights into the target network.
        model / target_model: Online and target Q-networks.
        optimizer: Adam optimizer used for the online network.
    """

    def __init__(self, env, EPISODES):
        self.env = env
        self.EPISODES = EPISODES
        self.memory_size = 10000
        self.experience = collections.namedtuple('experience', field_names=['s', 'a', 'r', 'done', 's1'])
        self.memory = collections.deque(maxlen=self.memory_size)
        self.gamma = 0.99
        self.eps = 1.0
        self.eps_min = 0.02
        self.eps_decay = 0.00001
        self.BATCH_SIZE = 32
        self.update_model_steps = 1000
        self.model = OurModel(env.observation_space.shape, env.action_space.n)
        self.target_model = OurModel(env.observation_space.shape, env.action_space.n)
        # Start with identical networks so targets are meaningful even if
        # training begins before the first periodic sync.
        self.target_model.set_weights(self.model.get_weights())
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

    def update_target_model(self, game_steps):
        """Copy the online weights into the target network every `update_model_steps` steps."""
        if game_steps % self.update_model_steps == 0:
            self.target_model.set_weights(self.model.get_weights())

    def act(self, s):
        """Pick an action for state `s` with an epsilon-greedy policy.

        Every call first decays `eps` (never below `eps_min`). With
        probability `eps` a random action is sampled from this agent's own
        environment; otherwise the action with the highest predicted
        Q-value is returned as an int.
        """
        self.eps = self.eps * (1 - self.eps_decay)
        self.eps = max(self.eps, self.eps_min)
        if self.eps > np.random.rand():
            # Use the agent's environment, not a notebook-level global.
            return self.env.action_space.sample()
        q_values = self.model(np.array([s]))
        best_action = tf.argmax(q_values, axis=1)
        return int(best_action.numpy()[0])

    @staticmethod
    def _bellman_targets(r, done, q_next, gamma):
        """Return `r + gamma * q_next`, with the bootstrap term zeroed where `done` is true."""
        rewards = np.asarray(r, dtype=np.float32)
        next_values = np.asarray(q_next, dtype=np.float32)
        # Multiplying by a 0/1 float mask avoids the pitfall of indexing with
        # a uint8 array, which NumPy treats as integer positions, not a mask.
        not_done = 1.0 - np.asarray(done, dtype=np.float32)
        return (rewards + gamma * next_values * not_done).astype(np.float32)

    def replay(self):
        """Run one gradient step of the online network on a random batch.

        Does nothing while the memory holds fewer than `BATCH_SIZE`
        transitions.
        """
        if len(self.memory) < self.BATCH_SIZE:
            return
        indexes = np.random.choice(len(self.memory), self.BATCH_SIZE, replace=False)
        s, a, r, done, s1 = zip(*[self.memory[idx] for idx in indexes])
        s = np.array(s)
        a = np.array(a)
        s1 = np.array(s1)

        # Targets come from the frozen network and are constants for the
        # gradient, so they are computed outside the tape.
        q_next = tf.reduce_max(self.target_model(s1), axis=1).numpy()
        q_target = self._bellman_targets(r, done, q_next, self.gamma)

        with tf.GradientTape() as tape:
            # Q-value of the action actually taken in each sampled state.
            q_taken = tf.squeeze(tf.gather(self.model(s), a[:, None], axis=1, batch_dims=1), axis=-1)
            current_loss = tf.keras.losses.MeanSquaredError()(q_target, q_taken)

        grads = tape.gradient(current_loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))

    def run(self):
        """Train for `EPISODES` episodes.

        Every step stores the transition in memory. Training steps start
        once the memory is full, and the target network is refreshed
        through `update_target_model`. A summary line is printed at the end
        of each episode.
        """
        step = 0
        e = 0
        s = self.env.reset()
        score = 0
        while e < self.EPISODES:
            step += 1
            a = self.act(s)
            s1, r, done, _ = self.env.step(a)
            self.memory.append(self.experience(s, a, r, done, s1))
            s = s1
            score += r
            if done:
                print('{}: Episode: {}, score:{}, epsilone:{}'.format(step, e, score, self.eps))
                s = self.env.reset()
                e += 1
                score = 0
            if len(self.memory) >= self.memory_size:
                self.replay()
            self.update_target_model(step)

    def show_video(self):
        """Embed the first recorded mp4 from `video/` into the notebook output (for Colab)."""
        import glob
        import base64
        from IPython.display import HTML
        from IPython import display as ipythondisplay

        mp4list = glob.glob('video/*.mp4')
        if len(mp4list) > 0:
            mp4 = mp4list[0]
            print(mp4)
            # Read-only access is enough; the context manager closes the file.
            with open(mp4, 'rb') as video_file:
                video = video_file.read()
            encoded = base64.b64encode(video)
            ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                </video>'''.format(encoded.decode('ascii'))))
        else: 
            print("Could not find video")

    def test(self):
        """Play one episode while recording it to `./video`, then show the video.

        `self.env` is replaced by the `Monitor` wrapper and closed at the
        end. Actions come from `act`, so epsilon-greedy exploration and
        epsilon decay still apply during the test episode.
        """
        self.env = Monitor(self.env, './video', force=True)
        s = self.env.reset()
        score = 0
        done = False
        while not done:
            a = self.act(s)
            s, r, done, _ = self.env.step(a)
            score += r
            if r != 0:
                print('New score = {}'.format(r))

        self.env.close()
        self.show_video()
        print('Total score = {}'.format(score))

In [None]:
# Create the agent and train it for 300 episodes; one summary line is printed per finished episode.
dqn = DQN(env, 300)
dqn.run()

762: Episode: 0, score:-21.0, epsilone:0.9924089207874794
1836: Episode: 1, score:-19.0, epsilone:0.9818074278926047
2598: Episode: 2, score:-21.0, epsilone:0.9743544499360283
3360: Episode: 3, score:-21.0, epsilone:0.9669580481254901
4122: Episode: 4, score:-21.0, epsilone:0.9596177929869822
4972: Episode: 5, score:-21.0, epsilone:0.9514955694880541
5762: Episode: 6, score:-21.0, epsilone:0.9440083305865726
6664: Episode: 7, score:-20.0, epsilone:0.9355316204961915
7533: Episode: 8, score:-21.0, epsilone:0.9274370321668995
8383: Episode: 9, score:-21.0, epsilone:0.9195871872479202
9236: Episode: 10, score:-21.0, epsilone:0.911776429727693
10026: Episode: 11, score:-21.0, epsilone:0.9046017374085439
10848: Episode: 12, score:-21.0, epsilone:0.8971963519320649
11638: Episode: 13, score:-21.0, epsilone:0.8901363890232863
12478: Episode: 14, score:-20.0, epsilone:0.8826905225471939
13677: Episode: 15, score:-20.0, epsilone:0.8721702059122648
14467: Episode: 16, score:-21.0, epsilone:0.865

In [None]:
# Play one recorded episode with the trained agent and display the resulting video.
dqn.test()

New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
New score = 1.0
video/openaigym.video.6.58.video000000.mp4


Total score = 21.0
