#### **Final Project**


#### **Environment Code**

In [8]:
import gym
from gym import spaces
import numpy as np

class BattleshipEnv(gym.Env):
    def __init__(self):
        super(BattleshipEnv, self).__init__()
        self.grid_size = 10
        self.ships = [5, 4, 3, 2, 2]  # Ship sizes
        self.action_space = spaces.Discrete(self.grid_size * self.grid_size)  # 100 possible actions
        self.observation_space = spaces.Box(low=0, high=3, shape=(self.grid_size, self.grid_size), dtype=np.int32)
        self.reset()

    def reset(self):
        # Initialize the grid: 0 = empty, 1 = ship, 2 = hit, 3 = miss
        self.grid = np.zeros((self.grid_size, self.grid_size), dtype=np.int32)
        self.ship_positions = self.place_ships()
        self.hits = set()
        self.misses = set()
        return self.grid.copy()

    def place_ships(self):
        positions = []
        for ship in self.ships:
            placed = False
            while not placed:
                orientation = np.random.choice(['horizontal', 'vertical'])
                if orientation == 'horizontal':
                    x = np.random.randint(0, self.grid_size)
                    y = np.random.randint(0, self.grid_size - ship + 1)
                    if all(self.grid[x, y + i] == 0 for i in range(ship)):
                        for i in range(ship):
                            self.grid[x, y + i] = 1
                        positions.append((x, y, orientation, ship))
                        placed = True
                else:
                    x = np.random.randint(0, self.grid_size - ship + 1)
                    y = np.random.randint(0, self.grid_size)
                    if all(self.grid[x + i, y] == 0 for i in range(ship)):
                        for i in range(ship):
                            self.grid[x + i, y] = 1
                        positions.append((x, y, orientation, ship))
                        placed = True
        return positions

    def step(self, action):
        x = action // self.grid_size
        y = action % self.grid_size
        if (x, y) in self.hits or (x, y) in self.misses:
            return self.grid.copy(), -1, False, {}  # Invalid move

        if self.grid[x, y] == 1:  # Hit
            self.grid[x, y] = 2
            self.hits.add((x, y))
            reward = 1
        else:  # Miss
            self.grid[x, y] = 3
            self.misses.add((x, y))
            reward = -1

        done = len(self.hits) == sum(self.ships)  # All ships sunk
        return self.grid.copy(), reward, done, {}

    def render(self, mode='human'):
        for row in self.grid:
            print(" ".join(str(cell) for cell in row))

#### **Q-Learning Code**

#### **Training**

In [9]:
import numpy as np

class QLearningAgent:
    def __init__(self, env, alpha=0.1, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        self.env = env
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.q_table = np.zeros((env.grid_size * env.grid_size, env.grid_size * env.grid_size))

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:  # Explore
            return np.random.choice(self.env.action_space.n)
        else:  # Exploit
            return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state):
        best_next_action = np.argmax(self.q_table[next_state])
        self.q_table[state, action] += self.alpha * (
            reward + self.gamma * self.q_table[next_state, best_next_action] - self.q_table[state, action]
        )

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# Thompson Sampling for exploration
class ThompsonSamplingAgent:
    def __init__(self, env):
        self.env = env
        self.successes = np.ones((env.grid_size, env.grid_size))
        self.failures = np.ones((env.grid_size, env.grid_size))

    def choose_action(self):
        samples = np.random.beta(self.successes, self.failures)
        x,y = np.unravel_index(np.argmax(samples), (self.env.grid_size, self.env.grid_size))
        return x * self.env.grid_size + y

    def update(self, action, hit):
        if hit:
            self.successes[action] += 1
        else:
            self.failures[action] += 1


env = BattleshipEnv()
exploration_agent = QLearningAgent(env)
exploitation_agent = ThompsonSamplingAgent(env)

# Training loop
num_episodes = 10
for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        # Exploration phase
        action = exploration_agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        exploration_agent.update_q_table(state, action, reward, next_state)
        exploration_agent.decay_epsilon()

        # # Exploitation phase
        # if reward == 1:  # If hit, use Thompson Sampling
        #     action = exploitation_agent.choose_action()
        #     next_state, reward, done, _ = env.step(action)
        #     exploitation_agent.update(action, reward == 1)

        # if next_state > 100:
        #     print(f"next : {next_state} reward: {reward} done: {done}")
        state = next_state

    if episode % 100 == 0:
        print(f"Episode {episode}, Exploration Epsilon: {exploration_agent.epsilon}")

# Save the trained models
np.save("exploration_q_table.npy", exploration_agent.q_table)
np.save("exploitation_successes.npy", exploitation_agent.successes)
np.save("exploitation_failures.npy", exploitation_agent.failures)

Episode 0, Exploration Epsilon: 0.47147873742168567



KeyboardInterrupt

