### 2048 GAME

In [None]:
import random
import numpy as np
from collections import deque
from keras.models import Sequential 
from keras.layers import Dense, Dropout, BatchNormalization
from keras.layers import Conv2D, Flatten
from keras.optimizers import Adam
from keras.regularizers import l2
from two_oh_four_eight import Game
import os

In [None]:
from tensorflow import keras

# Silence Keras' per-call progress bars; predict()/fit() are called very often
# during training and would otherwise flood the notebook output.
keras.utils.disable_interactive_logging()

# 2048 environment; both limits are set to 100000.
env = Game(max_moves=100000, max_score=100000)

# Dimensions reported by the environment, forwarded to the agent.
state_size = env.state_size
action_size = env.action_size

# Maximum number of replayed transitions per training step.
batch_size = 1024
# rounds of play
n_episodes = 10000
# Directory that receives saved models and weights.
output_dir = "model_output"
# exist_ok avoids the check-then-create race and is a no-op when the
# directory is already there.
os.makedirs(output_dir, exist_ok=True)

# Notebook cell output: show the state and action sizes reported by the
# 2048 environment.
# NOTE(review): the network input is hard-coded to (4, 4, 13) in
# DQNAgent._build_model -- confirm it agrees with env.state_size.
state_size, action_size

In [None]:
class DQNAgent:
    """Deep Q-learning agent with an epsilon-greedy policy and replay memory.

    The agent stores ``(state, action, reward, next_state, done)`` transitions
    in a bounded deque and fits a small convolutional Q-network on random
    samples of them.

    Stored states are expected to carry a leading batch axis of size 1 (they
    are passed straight to ``model.predict`` and ``state[0]`` is used as the
    training sample).
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its Q-network.

        Args:
            state_size: State dimension reported by the environment. Stored
                for reference; the network input shape is fixed in
                ``_build_model``.
            action_size: Number of discrete actions, i.e. the number of
                Q-values the network outputs.
        """
        self.state_size = state_size
        self.action_size = action_size
        # Replay memory; the oldest transitions are dropped once it is full.
        self.memory = deque(maxlen=100000)
        # Discount factor applied to the best predicted future Q-value.
        self.gamma = 0.95
        # Probability of taking a random action in act().
        self.epsilon = 0.05
        # Multiplicative decay applied to epsilon after every train() call.
        self.epsilon_decay = 0.9999
        # Epsilon stops decaying once it is no longer above this value.
        self.epsilon_min = 0.00
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network.

        Two 2x2 convolutions over a ``(4, 4, 13)`` input are flattened and fed
        through three regularised dense blocks (Dense -> BatchNormalization ->
        Dropout), followed by a linear layer with one output per action.

        Returns:
            The compiled Keras ``Sequential`` model (MSE loss, Adam optimizer).
        """
        model = Sequential()

        model.add(Conv2D(filters=64, kernel_size=(2, 2), activation='relu', input_shape=(4, 4, 13)))
        model.add(Conv2D(filters=128, kernel_size=(2, 2), activation='relu'))
        model.add(Flatten())

        # No input_dim here: this layer consumes the flattened conv features,
        # so an input size derived from state_size would be ignored at best.
        for _ in range(3):
            model.add(Dense(64, activation="relu", kernel_regularizer=l2(0.01)))
            model.add(BatchNormalization())
            model.add(Dropout(0.5))

        # Output layer: one linear Q-value per action.
        model.add(Dense(self.action_size, activation="linear"))

        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train(self, batch_size):
        """Fit the Q-network on a random sample of remembered transitions.

        For every sampled transition the target for the taken action is
        ``reward`` when the episode ended, otherwise
        ``reward + gamma * max(Q(next_state))``; the targets of the other
        actions are the network's current predictions. Epsilon is decayed
        after the fit.

        Args:
            batch_size: Maximum number of transitions to sample; fewer are
                used when the memory holds fewer.
        """
        if not self.memory:
            # Nothing to learn from yet; fitting on empty arrays would fail.
            return

        sample_count = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, sample_count)

        states = np.array([transition[0][0] for transition in minibatch])
        actions = np.asarray([transition[1] for transition in minibatch], dtype=np.intp)
        td_targets = np.array([transition[2] for transition in minibatch], dtype=np.float64)

        # Bootstrap only from non-terminal transitions, and predict them in a
        # single batched call instead of one predict() per transition.
        live_rows = [row for row, transition in enumerate(minibatch) if not transition[4]]
        if live_rows:
            next_states = np.array([minibatch[row][3][0] for row in live_rows])
            next_q_values = self.model.predict(next_states)
            td_targets[live_rows] += self.gamma * np.amax(next_q_values, axis=1)

        # Start from the current predictions so that only the taken action's
        # Q-value contributes to the loss.
        q_targets = self.model.predict(states)
        q_targets[np.arange(sample_count), actions] = td_targets

        # Single fit call for the entire batch
        self.model.fit(states, q_targets, epochs=1, verbose=False)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        Args:
            state: Current state, shaped as the model expects (including the
                leading batch axis).

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the highest predicted Q-value.
        """
        if np.random.rand() < self.epsilon:
            # if random number smaller than epsilon, return an 'exploratory' action
            return random.randrange(self.action_size)
        # otherwise, use the predicted move
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def save(self, name):
        """Save the full model and, separately, its weights.

        Both files are written under the module-level ``output_dir``.

        Args:
            name: File name for the full model; the weights file is named
                ``{name}_weights.h5``.
        """
        self.model.save(f"{output_dir}/{name}")
        self.model.save_weights(f"{output_dir}/{name}_weights.h5")



In [None]:
from IPython.display import clear_output
   
# Train the agent: play full episodes, remember every transition, and run one
# replay-training step after each episode.
agent = DQNAgent(state_size, action_size)
best_score = 0
try:
    for index_episode in range(n_episodes):
        state = env.reset()
        done = False
        invalid_moves = 0
        while not done:
            # Redraw the board in place instead of appending to the output.
            clear_output(wait=True)
            env.render()
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            print(f"Action: {action}, Reward: {reward}")
            agent.remember(state, action, reward, next_state, done)
            # A move counts as invalid when it leaves the state unchanged.
            # NOTE(review): assumes env.step returns an identical state for a
            # rejected move -- confirm against the Game implementation.
            if np.array_equal(state, next_state):
                invalid_moves += 1
            state = next_state
        print(f"Episode #{index_episode} ended with score: {env.score} and {invalid_moves} invalid moves\n")
        agent.train(batch_size)
        # Checkpoint every new best score.
        if env.score > best_score:
            best_score = env.score
            agent.save(f"2048_{best_score}_{invalid_moves}_{env.highest_tile}.h5")
finally:
    # Always keep the latest model, even when training is interrupted.
    agent.save("2048_last.h5")

