In [5]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules that were
# registered with %aimport (done in the import cell) before each cell execution.
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from collections import deque
import numpy as np
import pandas as pd
import seaborn as sns
import random
import pickle
import glob

import PIL
from PIL import Image
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import wandb

%aimport util.decaying
%aimport dqnmodel

In [7]:
# --- Replay memory ---
# Once the self-play buffer grows past MEM_SIZE, its oldest MEM_SIZE/10 entries are dropped.
MEM_SIZE = 8000
# The buffer is pre-filled by playing games until it holds at least this many transitions.
MIN_MEM_SIZE = 1000

# --- Discount schedule (arguments of util.decaying.DecayingLinear) ---
DISCOUNT_START = 0.8
DISCOUNT_END = 0.94
DISCOUNT_DURATION = 4000

# --- Exploration schedule (arguments of util.decaying.DecayingDiscount) ---
EPSILON_START = 0.5
EPSILON_END = 0.03
EPSILON_DURATION = 4000

# A full self-play game is simulated on iterations where `episode % SIMULATE_EVERY == 1`.
SIMULATE_EVERY = 4
# Probability that a training batch is drawn from the pro-replay loader instead of self-play.
USE_PRO_PLAY_CHANCE = 0.2

# NOTE(review): EPISODES is currently unused; the training loop iterates over a literal range(2000).
EPISODES = 50
BATCH_SIZE = 164
LEARNING_RATE = 5e-5

In [9]:
# Execute the environment notebook inside this kernel. TetrisEnv (instantiated further
# down) is not defined in this notebook, so it is presumably defined there - confirm.
%run tetris-environment.ipynb

          
          
          
          
          
          
          
          
          
          
          
          
          
          
█████     
████      
████ █████
████  ████
████ █████
██████████


In [10]:
# Pick the compute device: use the GPU when one is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without CUDA. Kept as a plain string,
# which `.to(device)` accepts just like a torch.device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initial value only: the training loop re-draws this flag at the start of every iteration.
use_pro_replays = True

In [13]:
# Build the DQN wrapper from the project-local `dqnmodel` module and move it to `device`.
# NOTE(review): the meaning of the constructor argument 50 is not visible here - confirm in dqnmodel.
# Later cells use `model.model` (online network), `model.target_model` and `model.step()`.
model = dqnmodel.DQNModel(50)
model.to(device)

In [19]:
# Schedules for the exploration rate and the discount factor. Other cells read them via
# `epsilon.get()` / `discount.current` and advance them with `.step()`.
# NOTE(review): `epsilon` is built from DecayingDiscount and `discount` from DecayingLinear;
# the class names look swapped relative to the variable names - confirm this is intended.
epsilon = util.decaying.DecayingDiscount(EPSILON_START, EPSILON_END, EPSILON_DURATION)
discount = util.decaying.DecayingLinear(DISCOUNT_START, DISCOUNT_END, DISCOUNT_DURATION)

In [23]:
def get_best_state(states, use_epsilon=True):
    """Pick the index of the candidate state to play next (epsilon-greedy).

    Args:
        states: batch of candidate next states, one per row, already in the layout and
            on the device that ``model.model`` expects.
        use_epsilon: when True, a uniformly random index is returned with probability
            ``epsilon.get()``; when False the choice is always greedy.

    Returns:
        int: index into ``states``. Both branches return a plain Python int, so the
        result can be used directly to index the arrays returned by the environment.
    """
    if use_epsilon and random.random() <= epsilon.get():
        # explore: pick any candidate uniformly at random
        return random.choice(range(len(states)))

    # exploit: use the q-network (not the target network) for choosing the next state.
    # No gradients are needed for action selection, so skip building the autograd graph.
    with torch.no_grad():
        q_values = model.model(states)
    return int(torch.argmax(q_values).item())

In [21]:
# Single shared environment instance, reused by the buffer-filling, training and rendering cells.
# TetrisEnv is not defined in this notebook; presumably it comes from the %run'd environment notebook.
env = TetrisEnv()

### Fill the replay buffer by playing games

In [24]:
# Self-play replay memory: a list of (current_state, next_state, score, done) tuples.
replay_buffer = []

def to_torch(state):
    """Flatten a batch of boards into a float tensor.

    Args:
        state: numpy array whose first axis indexes the boards; all remaining axes
            are flattened into one.

    Returns:
        torch.FloatTensor of shape ``(state.shape[0], -1)``.
    """
    # Use the argument itself; the previous body read the global `states` by mistake.
    return torch.from_numpy(state.reshape(state.shape[0], -1)).float()

# Pre-fill the replay buffer with self-play games until it holds MIN_MEM_SIZE transitions.
# The progress bar advances once per game; its total is only the MIN_MEM_SIZE/20 estimate.
with tqdm(total=MIN_MEM_SIZE/20) as pbar:
    while len(replay_buffer) < MIN_MEM_SIZE:
        env.reset()
        pbar.update(1)

        # play moves until game over
        game_over = False
        while not game_over:
            states, scores, clears, dones = env.get_next_states()

            flat_states = torch.from_numpy(states.reshape(-1, 200)).float().to(device)
            chosen_index = get_best_state(flat_states)

            # record the transition before the environment is advanced
            game_over = dones[chosen_index]
            replay_buffer.append((env.get_current_state(), states[chosen_index], scores[chosen_index], game_over))

            if not game_over:
                env.step(states[chosen_index], clears[chosen_index], scores[chosen_index])

# states, _, _, _, = env.get_next_states()
# states = torch.from_numpy(states.reshape(states.shape[0], -1)).float()

# model(states)

  0%|          | 0/50.0 [00:00<?, ?it/s]

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [17, 200]

### Fill the replay buffer from professional player moves

In [12]:
class ProReplayDataset(torch.utils.data.Dataset):
    """In-memory dataset of pre-recorded transitions loaded from files on disk.

    Every file matching ``*.json`` in ``path`` is parsed with ``pandas.read_csv`` and must
    provide the columns ``current``, ``next``, ``score`` and ``done``. ``current`` and
    ``next`` are strings of digits, one digit per board cell, and are reshaped to 20x10.

    Each item is a tuple ``(current_board, next_board, score, done)`` where both boards
    are integer tensors of shape (20, 10) that already live on ``device``.

    Args:
        path: directory containing the recorded files.
        train: stored on the instance as ``self.train``; not used by this class itself.
    """

    def __init__(self, path, train):
        import os  # local import: `os` is not part of the notebook's import cell

        self.path = path
        # os.path.join instead of a hard-coded '\\' so the pattern also matches on non-Windows systems
        self.file_list = glob.glob(os.path.join(path, '*.json'))
        self.train = train

        def string_to_board(string):
            # one integer per character of the board string
            return torch.tensor([int(c) for c in list(string)])

        # to speed things up, load the whole dataset to memory
        self.buffer = []
        # iterate over the list itself so tqdm knows the total number of files
        for file in tqdm(self.file_list):
            # NOTE(review): files are selected by a '.json' extension but parsed as CSV - confirm the format.
            df = pd.read_csv(file)

            for index, row in df.iterrows():
                self.buffer.append((
                    string_to_board(row.current).reshape((20, 10)).to(device),
                    string_to_board(row.next).reshape((20, 10)).to(device),
                    row.score,
                    row.done
                ))

    def __len__(self):
        """Number of transitions held in memory."""
        return len(self.buffer)

    def __getitem__(self, idx):
        """Return the transition tuple stored at position ``idx``."""
        return self.buffer[idx]


# Load all recorded pro games into memory and wrap them in a shuffling loader that
# yields batches of BATCH_SIZE transitions.
# NOTE(review): hard-coded absolute Windows path - this only works on the original machine.
replay_buffer_dataset = ProReplayDataset('I:\\AI\\processed-dqn', True)
replay_buffer_loader = torch.utils.data.DataLoader(dataset=replay_buffer_dataset, batch_size=BATCH_SIZE, shuffle=True)

0it [00:00, ?it/s]

### Training loop

In [14]:
# Hyperparameters recorded alongside the Weights & Biases run.
run_config = {
    # optimisation
    'learning-rate': LEARNING_RATE,
    'batch-size': BATCH_SIZE,

    # replay memory
    'replay-max-size': MEM_SIZE,
    'replay-min-size': MIN_MEM_SIZE,

    # exploration schedule
    'epsilon-start': EPSILON_START,
    'epsilon-end': EPSILON_END,
    'epsilon-duration': EPSILON_DURATION,

    # discount schedule
    'discount-start': DISCOUNT_START,
    'discount-end': DISCOUNT_END,
    'discount-duration': DISCOUNT_DURATION,

    # data mixing
    'pro-play-chance': USE_PRO_PLAY_CHANCE,
    'simulate-every': SIMULATE_EVERY,

    # 'resume': True
}

wandb.init(project='tetris-dqn', config=run_config)

[34m[1mwandb[0m: Currently logged in as: [33mfischly[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [23]:
# Profiling helpers, disabled:
# %load_ext line_profiler
# %prun train(replay_buffer)

# Re-create both schedule objects before training. These are the same constructor calls
# as in the earlier schedule cell, so re-running this cell replaces the existing objects.
epsilon = util.decaying.DecayingDiscount(EPSILON_START, EPSILON_END, EPSILON_DURATION)
discount = util.decaying.DecayingLinear(DISCOUNT_START, DISCOUNT_END, DISCOUNT_DURATION)


In [131]:
# if use_pro_replays:
# replay_buffer = []
# replay_buffer_iter = iter(replay_buffer_loader)
# discount.current = 0.965

In [132]:
# def train():
# Training loop. Every iteration performs one gradient step on a batch drawn either from
# the pro-replay loader or from the self-play replay buffer. On iterations where
# `episode % SIMULATE_EVERY == 1` one full game is played first to refresh the buffer.
criterion = nn.HuberLoss()
optimizer = torch.optim.AdamW(model.model.parameters(), lr=LEARNING_RATE)

training_loss = []
training_scores = []

# Create the pro-replay iterator here so this cell does not depend on an iterator
# left over from (or never created by) another cell.
replay_buffer_iter = iter(replay_buffer_loader)

for episode in tqdm(range(2000)): # tqdm(range(EPISODES)):
    # Only use pro replays when the dataset actually holds transitions; otherwise
    # fall back to the self-play buffer instead of failing on an empty loader.
    use_pro_replays = random.random() < USE_PRO_PLAY_CHANCE and len(replay_buffer_dataset) > 0

    # play another game
    if episode % SIMULATE_EVERY == 1:
        env.reset()
        while True:
            states, scores, clears, dones = env.get_next_states()

            chosen_index = get_best_state(torch.from_numpy(states.reshape(-1, 200)).float().to(device))

            replay_buffer.append((env.get_current_state(), states[chosen_index], scores[chosen_index], dones[chosen_index]))

            if dones[chosen_index]:
                training_scores.append({'epoch': episode, 'score': env.score})
                break
            else:
                env.step(states[chosen_index], clears[chosen_index], scores[chosen_index])

        # cap the memory: drop the oldest tenth once the buffer outgrows MEM_SIZE
        if len(replay_buffer) > MEM_SIZE:
            replay_buffer = replay_buffer[int(MEM_SIZE/10):]

        wandb.log({'game/score': env.score,
                   'game/singles': env.clears[0], 'game/doubles': env.clears[1], 'game/triples': env.clears[2], 'game/quads': env.clears[3],
                   'game/tspins': env.tspins, 'game/all_clears': env.all_clears, 'game/moves': env.moves })


    # get the batch, consisting of (current_state, next_state, score, done), and extract
    # states, rewards and done flags as tensors on `device`
    if use_pro_replays:
        try:
            batch = next(replay_buffer_iter)
        except StopIteration:
            # the loader is exhausted: start another (reshuffled) pass over the dataset
            replay_buffer_iter = iter(replay_buffer_loader)
            batch = next(replay_buffer_iter)

        current_states = batch[0].reshape(-1, 200).float().to(device)
        # index 1 holds the next state (index 0 is the current state)
        next_states = batch[1].reshape(-1, 200).float().to(device)
        rewards = batch[2].float().to(device)
        batch_dones = batch[3].bool().to(device)
    else:
        # take sample from replay memory
        batch = random.sample(replay_buffer, BATCH_SIZE)

        current_states = torch.from_numpy(np.array([s[0].reshape(200) for s in batch])).float().to(device)
        next_states = torch.from_numpy(np.array([s[1].reshape(200) for s in batch])).float().to(device)
        rewards = torch.tensor([float(s[2]) for s in batch], dtype=torch.float32, device=device)
        batch_dones = torch.tensor([bool(s[3]) for s in batch], dtype=torch.bool, device=device)

    # get the q-values of the current state
    y_hat = model.model(current_states)

    # calculate expected q-values of the next state using the target-network;
    # the target is a constant for the loss, so no gradients are tracked here
    with torch.no_grad():
        next_q_values = model.target_model(next_states).reshape(-1)

    # terminal transitions keep the plain reward, all others add the discounted next q-value
    y = torch.where(batch_dones, rewards, rewards + discount.current * next_q_values).reshape(-1, 1)

    # fit the model to the expected q value
    loss = criterion(y_hat, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    epsilon.step()
    discount.step()

    model.step()

    wandb.log({'training/loss': loss.item()})
    training_loss.append({'epoch': episode, 'loss': loss.item()})
        # print(loss)
        
# sns.lineplot(data=pd.DataFrame(training_loss), x='epoch', y='loss')
# sns.lineplot(data=pd.DataFrame(training_scores), x='epoch', y='score')

  0%|          | 0/2000 [00:00<?, ?it/s]

In [80]:
# sns.lineplot(data=pd.DataFrame(training_scores), x='epoch', y='score')
# Show the current values of both schedules.
print(discount.current)
print(epsilon.current)

0.95
0.030000000000020254


In [129]:

# env.reset()
# print(env.current_piece)

# while True:
#     states, scores, clears, dones = env.get_next_states()

#     chosen_index = get_best_state(torch.from_numpy(states.reshape(-1, 200)).float())

#     env._print_state(states[chosen_index])
#     print()
    
#     # replay_buffer.append((env.get_current_state(), states[chosen_index], scores[chosen_index], dones[chosen_index]))

#     if dones[chosen_index]:
#         print(f'Score: {env.score}')
#         print(f'Clears: {env.clears}, t-spins: {env.tspins}, alll_clears: {env.all_clears}')
#         break
#     else:
#         env.step(states[chosen_index], clears[chosen_index], scores[chosen_index])

In [144]:
%matplotlib inline

# Maps a board cell value to the colour tuple used by render_gif:
# 0 is drawn black and 1 white. The commented-out entries are an alternative palette.
COLORS = {
    0: (0,0,0),
    1: (255,255,255)
    # 0: (128, 0, 128),
    # 1: (255, 127, 0),
    # 2: (0, 0, 255),
    # 3: (255, 255, 0),
    # 4: (0, 255, 255),
    # 5: (0, 255, 0),
    # 6: (255, 0, 0)
}

def render_gif(states, file_name):
    """Render a sequence of board states as an animated GIF below ``images/``.

    Args:
        states: non-empty sequence of 2-D boards (rows x columns) whose cell values
            are keys of ``COLORS``.
        file_name: name of the GIF file, written to ``images/{file_name}``.

    Raises:
        ValueError: if ``states`` is empty.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    def gen_image(state):
        """Turn one board into an upscaled RGB image (25 pixels per cell)."""
        board = np.asarray(state)
        height, width = board.shape
        pixels = [COLORS[cell] for row in board for cell in row]
        # COLORS holds RGB tuples and PIL expects RGB, so no channel reordering is needed
        img = np.array(pixels).reshape(height, width, 3).astype(np.uint8)
        img = Image.fromarray(img)

        # nearest-neighbour scaling keeps the cell edges sharp
        return img.resize((width * 25, height * 25), Image.Resampling.NEAREST)

    frames = [gen_image(state) for state in states]
    if not frames:
        raise ValueError('render_gif needs at least one state to render')

    out_path = f'images/{file_name}'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # the first frame is the image being saved; only the remaining frames are appended
    frames[0].save(out_path, format='GIF', append_images=frames[1:], save_all=True, duration=300, loop=0)


# One entry per rendered game: the list of board states chosen during that game.
runs = []
def render_run():
    """Play one game greedily (no exploration) and record the chosen boards.

    Resets the shared ``env``, always picks the candidate with the highest q-value,
    prints the final score and clear statistics, and appends the list of chosen
    board states to the global ``runs``.
    """
    env.reset()
    states_to_render = []

    while True:
        states, scores, clears, dones = env.get_next_states()

        # move the candidates to the model's device, like the other game loops do
        candidates = torch.from_numpy(states.reshape(-1, 200)).float().to(device)
        chosen_index = get_best_state(candidates, False)

        states_to_render.append(states[chosen_index])

        if dones[chosen_index]:
            print(f'Score: {env.score}')
            print(f'Clears: {env.clears}, t-spins: {env.tspins}, alll_clears: {env.all_clears}')
            break
        else:
            env.step(states[chosen_index], clears[chosen_index], scores[chosen_index])

    runs.append(states_to_render)
    
# Play and record ten greedy games; each call appends one entry to `runs`.
for i in range(10):
    render_run()

Score: 750
Clears: [7, 0, 0, 0], t-spins: 0, alll_clears: 0
Score: 3350.0
Clears: [19, 2, 0, 0], t-spins: 2, alll_clears: 0
Score: 700
Clears: [5, 1, 0, 0], t-spins: 0, alll_clears: 0
Score: 1500
Clears: [11, 1, 0, 0], t-spins: 0, alll_clears: 0
Score: 1575.0
Clears: [10, 0, 0, 0], t-spins: 1, alll_clears: 0
Score: 1750
Clears: [12, 2, 0, 0], t-spins: 0, alll_clears: 0
Score: 700
Clears: [7, 0, 0, 0], t-spins: 0, alll_clears: 0
Score: 950
Clears: [9, 0, 0, 0], t-spins: 0, alll_clears: 0
Score: 1750
Clears: [10, 2, 1, 0], t-spins: 0, alll_clears: 0
Score: 1050
Clears: [8, 1, 0, 0], t-spins: 0, alll_clears: 0


In [145]:
# Export the second recorded game (runs[1]) to images/tetris4.gif.
render_gif(runs[1], 'tetris4.gif')

In [133]:
# Persist the model weights; the models/ directory has to exist already.
torch.save(model.state_dict(), 'models/run-13.pt')

In [87]:
# Print the smallest value of every parameter tensor of the model.
for p in model.parameters():
    print(torch.min(p))

tensor(-2.4596, grad_fn=<MinBackward1>)
tensor(-0.9907, grad_fn=<MinBackward1>)
tensor(-3.4248, grad_fn=<MinBackward1>)
tensor(-0.9417, grad_fn=<MinBackward1>)
tensor(-4.2111, grad_fn=<MinBackward1>)
tensor(-0.3575, grad_fn=<MinBackward1>)
tensor(-1.9662, grad_fn=<MinBackward1>)
tensor(-0.3889, grad_fn=<MinBackward1>)
