In [16]:
import numpy as np

class connectX:
    """Connect-X board game environment (Connect Four by default).

    The board is a ``rows x columns`` integer array in which 0 marks an empty
    cell and 1 / 2 mark the discs of the two players. Discs settle at the
    highest free row index of a column, so row ``rows - 1`` is the bottom of
    the board and row 0 is the top.
    """

    # (row step, column step) for each line orientation that is scanned:
    # horizontal, vertical, diagonal "\" and diagonal "/".
    _DIRECTIONS = ((0, 1), (1, 0), (1, 1), (-1, 1))

    def __init__(self, rows=6, columns=7, connect=4):
        """Create an empty board.

        Args:
            rows: Number of board rows.
            columns: Number of board columns.
            connect: Number of aligned discs required to win.
        """
        self.rows = rows
        self.columns = columns
        self.connect = connect
        self.board = np.zeros((rows, columns), dtype=int)
        self.current_player = 1

    def reset(self):
        """Clear the board, make player 1 the mover and return the board.

        The returned array is the live board object: later calls to
        ``dropdisc`` mutate it in place.
        """
        self.board = np.zeros((self.rows, self.columns), dtype=int)
        self.current_player = 1
        return self.board

    def dropdisc(self, column):
        """Drop the current player's disc into ``column``.

        Returns:
            True if the disc was placed; False if the column is full or lies
            outside the board.
        """
        # Reject out-of-range columns explicitly: a negative index would
        # otherwise wrap around and silently play in a different column.
        if not 0 <= column < self.columns:
            return False
        # Scan from the bottom row upwards for the first free cell.
        for row in range(self.rows - 1, -1, -1):
            if self.board[row, column] == 0:
                self.board[row, column] = self.current_player
                return True
        return False

    def is_winning_move(self, player):
        """Return True if ``player`` has ``connect`` discs in a line anywhere."""
        span = self.connect - 1
        for d_row, d_col in self._DIRECTIONS:
            for row in range(self.rows):
                for col in range(self.columns):
                    # Skip start cells whose line would leave the board.
                    end_row = row + span * d_row
                    end_col = col + span * d_col
                    if not (0 <= end_row < self.rows and 0 <= end_col < self.columns):
                        continue
                    if all(
                        self.board[row + i * d_row, col + i * d_col] == player
                        for i in range(self.connect)
                    ):
                        return True
        return False

    def is_draw(self):
        """Return True when no empty cell is left on the board."""
        return not np.any(self.board == 0)

    def switch_player(self):
        """Hand the move to the other player (1 <-> 2)."""
        self.current_player = 3 - self.current_player

    def get_valid_moves(self):
        """Return the columns whose top cell (row 0) is still empty."""
        return [col for col in range(self.columns) if self.board[0, col] == 0]

    def render(self):
        """Print the board with the bottom row last.

        Discs already settle at the highest row index, which numpy prints
        last, so the array is printed as-is; flipping it would show the discs
        floating at the top.
        """
        print(self.board)





In [17]:
import random
from collections import defaultdict

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are kept in a dictionary keyed by a hashable state; each entry is
    a float array holding one value per action (board column). Unseen states
    start with all-zero values.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1, n_actions=7):
        """Create an agent with an empty Q-table.

        Args:
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Probability of picking a random valid move.
            n_actions: Number of actions (board columns). Defaults to 7, the
                width of the default board.
        """
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_actions = n_actions
        # State -> per-action value array, created lazily on first access.
        self.q_table = defaultdict(lambda: np.zeros(self.n_actions))

    def choose_action(self, state, valid_moves):
        """Pick an action for ``state`` restricted to ``valid_moves``.

        With probability ``epsilon`` a random valid move is returned;
        otherwise the valid move with the highest Q-value is returned (ties
        resolve to the lowest column index).

        Returns:
            The chosen action as a plain ``int``.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Exploration
            return int(random.choice(valid_moves))

        # Exploitation: mask invalid actions with -inf so argmax skips them.
        allowed = set(valid_moves)
        masked = np.array([
            value if action in allowed else -np.inf
            for action, value in enumerate(self.q_table[state])
        ])
        return int(np.argmax(masked))

    def update_q_value(self, state, action, reward, next_state, done):
        """Apply one Q-learning update to ``Q(state, action)``.

        The target is ``reward`` for a terminal transition, otherwise
        ``reward + gamma * max_a Q(next_state, a)``.
        """
        q_current = self.q_table[state][action]
        if done:
            q_target = reward
        else:
            q_target = reward + self.gamma * np.max(self.q_table[next_state])

        self.q_table[state][action] += self.alpha * (q_target - q_current)

    def get_state(self, board):
        """Convert a 2-D board into a hashable tuple of row tuples."""
        return tuple(map(tuple, board))



In [18]:
def train_agent(episodes=10000):
    """Train a Q-learning agent by self-play and return it.

    A single agent chooses the moves for both players. After every move the
    Q-value of the (position before the move, column) pair is updated with a
    reward of 1 for a winning move and 0 otherwise.

    Args:
        episodes: Number of games to play.

    Returns:
        The trained ``QLearningAgent``.
    """
    env = connectX()
    agent = QLearningAgent()

    for episode in range(episodes):
        env.reset()
        done = False

        while not done:
            # Snapshot the position before the move; the board itself is
            # mutated in place by dropdisc.
            state_repr = agent.get_state(env.board)
            valid_moves = env.get_valid_moves()
            action = agent.choose_action(state_repr, valid_moves)
            env.dropdisc(action)

            if env.is_winning_move(env.current_player):
                reward = 1
                done = True
            elif env.is_draw():
                reward = 0
                done = True
            else:
                reward = 0

            next_state_repr = agent.get_state(env.board)

            # NOTE(review): update_q_value bootstraps with
            # +gamma * max Q(next_state), but next_state is the position the
            # opponent moves from; a zero-sum self-play target would normally
            # negate that term - confirm the intended formulation.
            agent.update_q_value(state_repr, action, reward, next_state_repr, done)

            if not done:
                # Switch exactly once per move. Switching twice restores the
                # same player, so every disc would belong to player 1.
                env.switch_player()

        if (episode + 1) % 1000 == 0:
            print(f"Episode {episode + 1}/{episodes}")

    return agent

# Run training with the default episode count (10000); progress is printed
# every 1000 episodes.
train_agent()


Episode 1000/10000
Episode 2000/10000
Episode 3000/10000
Episode 4000/10000
Episode 5000/10000
Episode 6000/10000
Episode 7000/10000
Episode 8000/10000
Episode 9000/10000
Episode 10000/10000


In [20]:
def evaluate_agent(episodes=1000, agent=None, random_opponent=False):
    """Play evaluation games with exploration disabled and report the tally.

    Args:
        episodes: Number of games to play.
        agent: The ``QLearningAgent`` to evaluate. When omitted, a freshly
            constructed (untrained) agent is used, so pass a trained agent to
            measure what training achieved.
        random_opponent: When False, the agent plays both sides; with
            exploration off that self-play is deterministic, so every episode
            is the same game. When True, player 2 picks uniformly random
            valid moves and only player 1 is driven by the agent.

    Returns:
        ``(wins, draws, losses)``: games won by player 1, drawn games, and
        games won by player 2.
    """
    env = connectX()
    if agent is None:
        agent = QLearningAgent()

    # Disable exploration for the evaluation only; the caller's setting is
    # restored afterwards so the agent can keep training.
    saved_epsilon = agent.epsilon
    agent.epsilon = 0
    wins = 0
    draws = 0

    try:
        for _ in range(episodes):
            env.reset()
            done = False

            while not done:
                valid_moves = env.get_valid_moves()
                if random_opponent and env.current_player == 2:
                    action = random.choice(valid_moves)
                else:
                    state_repr = agent.get_state(env.board)
                    action = agent.choose_action(state_repr, valid_moves)
                env.dropdisc(action)

                if env.is_winning_move(env.current_player):
                    if env.current_player == 1:
                        wins += 1
                    done = True
                elif env.is_draw():
                    draws += 1
                    done = True
                else:
                    env.switch_player()
    finally:
        agent.epsilon = saved_epsilon

    losses = episodes - wins - draws
    print(f"Wins: {wins}, Draws: {draws}, Losses: {losses}")
    return wins, draws, losses

# Evaluate over the default 1000 episodes and print the win/draw/loss tally.
# Note: called without arguments, this evaluates a freshly constructed
# QLearningAgent, not the agent trained in the previous cell.
evaluate_agent()


Wins: 1000, Draws: 0, Losses: 0
