<a href="https://colab.research.google.com/github/githyj-jang/Omok_RL/blob/main/Omok_with_TD_Q_Learning_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random

# 오목 보드 크기와 학습 파라미터 설정
BOARD_SIZE = 15
ALPHA = 0.1
GAMMA = 0.9
EPSILON = 0.1

# Q 테이블 초기화 함수

In [None]:
def initialize_Q(board_size=None):
    """Build a fresh Q-table with one zero-valued entry per board cell.

    The table is keyed by cell coordinate only, i.e. by ``(row, col)`` tuples.

    Args:
        board_size: Side length of the square board. When omitted, the
            module-level ``BOARD_SIZE`` is used, which reproduces the
            original zero-argument behaviour.

    Returns:
        dict mapping every ``(row, col)`` tuple on the board to ``0.0``,
        inserted in row-major order.
    """
    size = BOARD_SIZE if board_size is None else board_size
    return {(i, j): 0.0 for i in range(size) for j in range(size)}

# 행동 선택 함수 (ε-탐욕 정책)

In [None]:
def choose_action(state, available_actions, Q_table):
    """Pick a move with an epsilon-greedy policy.

    With probability ``EPSILON`` a uniformly random available move is
    returned (exploration); otherwise the available move with the highest
    Q-value is returned, the first one winning ties (exploitation).

    Args:
        state: Current board snapshot. Accepted for interface symmetry; the
            lookup below does not read it.
        available_actions: List of ``(row, col)`` moves that may be played.
        Q_table: dict mapping ``(row, col)`` to its learned value.

    Returns:
        One element of ``available_actions``.
    """
    explore = random.uniform(0, 1) < EPSILON
    if explore:
        return random.choice(available_actions)

    scores = [Q_table[move] for move in available_actions]
    best_index = np.argmax(scores)
    return available_actions[best_index]

# Q 값 업데이트 함수

In [None]:
def update_Q(Q_table, state, action, reward, next_state, next_available_actions, done):
    """Apply one temporal-difference (Q-learning) update to ``Q_table[action]``.

    The target is ``reward + GAMMA * max_a Q(a)`` over the moves available
    after this one, or just ``reward`` when the episode has ended.

    Args:
        Q_table: dict mapping ``(row, col)`` to its value; mutated in place.
        state: Board before the move (not read by this function).
        action: The ``(row, col)`` move that was played.
        reward: Immediate reward received for ``action``.
        next_state: Board after the move (not read by this function).
        next_available_actions: Moves that remain playable after ``action``.
        done: True when the game ended with this move.
    """
    if done:
        # No future value beyond a terminal position.
        future_value = 0
    else:
        future_value = max([Q_table[candidate] for candidate in next_available_actions])

    td_target = reward + GAMMA * future_value
    Q_table[action] += ALPHA * (td_target - Q_table[action])

# 가능한 행동 반환 함수

In [None]:
def get_available_actions(board):
    """Return the coordinates of every empty cell on the board.

    The scan walks the board's own rows and columns rather than the
    module-level ``BOARD_SIZE``, so it works for a board of any size and
    yields the same result as before for a ``BOARD_SIZE`` x ``BOARD_SIZE``
    board.

    Args:
        board: 2-D grid (nested sequence or NumPy array) in which ``0``
            marks an empty cell.

    Returns:
        list of ``(row, col)`` tuples in row-major order.
    """
    return [
        (i, j)
        for i, row in enumerate(board)
        for j, cell in enumerate(row)
        if cell == 0
    ]

# 게임 종료 여부 체크 함수 (승리 또는 무승부)

In [None]:
def is_terminal_state(board):
    """Tell whether the game is over: either side has won or the board is full.

    Args:
        board: 2-D grid holding ``1`` / ``-1`` for stones and ``0`` for
            empty cells.

    Returns:
        True if ``check_win`` succeeds for player ``1`` or ``-1``, or if no
        cell equals ``0``; False otherwise.
    """
    for stone in (1, -1):
        if check_win(board, stone):
            return True
    # No winner: the game is over only when no empty cell is left (draw).
    return all(cell != 0 for row in board for cell in row)

# 승리 조건 체크 함수

In [None]:
def check_win(board, player):
    """Return True if ``player`` has five stones in a row anywhere on the board.

    Board dimensions are read from ``board`` itself, so the check works for
    any rectangular grid. A run longer than five also counts, because it
    contains a run of exactly five.

    Args:
        board: 2-D grid (nested sequence or NumPy array) indexed as
            ``board[x][y]``.
        player: Stone value to look for (``1`` or ``-1`` in this file).

    Returns:
        True when some cell starts a run of five ``player`` stones in one of
        the four scanned directions, False otherwise.
    """
    rows = len(board)
    cols = len(board[0]) if rows else 0

    # Each (dx, dy) is the step applied to (first index, second index).
    directions = (
        (1, 0),   # along the first index
        (0, 1),   # along the second index
        (1, 1),   # diagonal: both indices increase
        (1, -1),  # anti-diagonal: first index increases, second decreases
    )

    def five_from(x, y, dx, dy):
        # If the fifth cell is on the board, every cell in between is too,
        # so one bounds check also rules out negative-index wrap-around.
        end_x = x + 4 * dx
        end_y = y + 4 * dy
        if not (0 <= end_x < rows and 0 <= end_y < cols):
            return False
        return all(board[x + k * dx][y + k * dy] == player for k in range(5))

    for i in range(rows):
        for j in range(cols):
            if board[i][j] != player:
                # A run can only start on one of the player's own stones.
                continue
            if any(five_from(i, j, dx, dy) for dx, dy in directions):
                return True
    return False

# 연속된 돌의 수 계산 함수

In [None]:
def count_consecutive_stones(board, player, x, y, dx, dy, max_count=5):
    """Count ``player`` stones in a row starting at ``(x, y)`` and stepping by ``(dx, dy)``.

    Counting stops at the first cell that is off the board or does not hold
    ``player``, or once ``max_count`` stones have been counted. Only the
    forward direction is scanned; the cell ``(x, y)`` itself is included.

    Args:
        board: 2-D grid (nested sequence or NumPy array) indexed as
            ``board[x][y]``; its own dimensions define the bounds.
        player: Stone value to count.
        x: Starting first index.
        y: Starting second index.
        dx: Step applied to ``x`` after each counted stone.
        dy: Step applied to ``y`` after each counted stone.
        max_count: Upper limit on the returned count. Defaults to 5, the
            limit that was previously hard-coded.

    Returns:
        Number of consecutive ``player`` stones found, between 0 and
        ``max_count``.
    """
    rows = len(board)
    cols = len(board[0]) if rows else 0

    count = 0
    while (
        count < max_count
        and 0 <= x < rows
        and 0 <= y < cols
        and board[x][y] == player
    ):
        count += 1
        x += dx
        y += dy
    return count

# 보상 계산 함수

In [None]:
def get_reward(board, player, action):
    """Score the move ``player`` has just played at ``action``.

    A win for ``player`` yields ``1`` and a win for the opponent yields
    ``-1``. Otherwise a shaping bonus is summed over the four line
    directions through ``action``:

    * for the player's own line through the new stone, and
    * for the opponent stones adjacent to the new stone on that line, i.e.
      the run the move has just interrupted.

    Runs are measured on both sides of ``action``. The earlier version
    counted only in the forward direction, and counted opponent stones
    starting at the cell the player had just occupied, so the blocking
    bonus was always zero.

    Args:
        board: 2-D grid indexed as ``board[x][y]``. The caller in this file
            writes the stone at ``action`` before calling.
        player: Stone value of the side that moved (``1`` or ``-1``).
        action: ``(x, y)`` coordinate of the move.

    Returns:
        ``1`` or ``-1`` for a decided game, otherwise the accumulated
        shaping bonus (``0`` when no run of two or more is involved).
    """
    x, y = action

    if check_win(board, player):
        return 1  # reward for winning
    if check_win(board, -player):
        return -1  # penalty for losing

    def shaping_bonus(length):
        # Same tiers as before: 4 -> 0.5, 3 -> 0.2, 2 -> 0.1.
        if length >= 4:
            return 0.5
        if length == 3:
            return 0.2
        if length == 2:
            return 0.1
        return 0

    reward = 0
    for dx, dy in ((1, 0), (0, 1), (1, 1), (1, -1)):
        # Own line through the new stone: forward from the stone itself
        # plus backward from its neighbour on the other side.
        own_length = (
            count_consecutive_stones(board, player, x, y, dx, dy)
            + count_consecutive_stones(board, player, x - dx, y - dy, -dx, -dy)
        )
        reward += shaping_bonus(own_length)

        # Opponent stones touching the new stone on either side; the scan
        # starts at the neighbours because (x, y) now belongs to `player`.
        blocked_length = (
            count_consecutive_stones(board, -player, x + dx, y + dy, dx, dy)
            + count_consecutive_stones(board, -player, x - dx, y - dy, -dx, -dy)
        )
        reward += shaping_bonus(blocked_length)

    return reward

# 오목 게임 학습

In [None]:
# Self-play training: both players read from and update the same Q-table.
Q_table = initialize_Q()

for episode in range(10000):  # number of training episodes
    # Fresh empty board for every episode (0 = empty cell).
    board = np.zeros((BOARD_SIZE, BOARD_SIZE))
    done = False
    player = 1  # player 1 moves first; the other player is -1

    while not done:
        # Snapshot the board before the move, then pick and play a move.
        state = board.copy()
        available_actions = get_available_actions(board)
        action = choose_action(state, available_actions, Q_table)
        board[action] = player

        # Evaluate the position that results from the move.
        reward = get_reward(board, player, action)
        done = is_terminal_state(board)
        next_state = board.copy()
        next_available_actions = get_available_actions(board)

        update_Q(Q_table, state, action, reward, next_state, next_available_actions, done)

        # NOTE(review): `state` is reassigned from `board.copy()` at the top
        # of the next iteration, so this assignment does not affect the loop.
        state = next_state
        player *= -1  # switch to the other player

테스트

---



*   실험 1: player 1은 학습된 모델을 따르고, player 2는 무작위로 행동
*   실험 2: player 1과 player 2 모두 학습된 모델을 따름



In [None]:
def choose_action_learn(state, available_actions, Q_table):
    """Pick the available move with the highest learned value (no exploration).

    Args:
        state: Current board snapshot. Accepted for interface symmetry with
            ``choose_action``; it is not read here.
        available_actions: List of ``(row, col)`` moves that may be played.
        Q_table: dict mapping ``(row, col)`` to its learned value.

    Returns:
        The element of ``available_actions`` with the largest Q-value; the
        earliest one wins a tie.
    """
    scores = np.array([Q_table[move] for move in available_actions])
    best_index = scores.argmax()
    return available_actions[best_index]

def play_game_1(Q_table):
    """Play one evaluation game: player 1 follows the Q-table, player 2 moves at random.

    Player 1 picks its move greedily with ``choose_action_learn``. The
    earlier version called the epsilon-greedy ``choose_action``, so the
    policy being evaluated still made random exploratory moves.

    Args:
        Q_table: dict mapping ``(row, col)`` to its learned value.

    Returns:
        Tuple ``(p1_win, p2_win, board_full)`` of booleans computed on the
        final board. ``board_full`` is True whenever no empty cell is left,
        which can coincide with a win on the last move.
    """
    board = np.zeros((BOARD_SIZE, BOARD_SIZE))
    done = False
    player = 1  # player 1 moves first

    while not done:
        state = board.copy()
        available_actions = get_available_actions(board)

        if player == 1:
            # Greedy choice: evaluation should measure the learned policy,
            # not the exploration noise used during training.
            action = choose_action_learn(state, available_actions, Q_table)
        else:
            # Player 2 (-1) plays a uniformly random legal move.
            action = random.choice(available_actions)

        board[action] = player
        done = is_terminal_state(board)
        player *= -1  # switch to the other player

    board_full = all(cell != 0 for row in board for cell in row)
    return check_win(board, 1), check_win(board, -1), board_full

# 게임 플레이 및 결과 계산
def evaluate_policy_1(Q_table, num_games):
    """Run ``play_game_1`` repeatedly and tally the outcomes.

    Each game is counted once, with priority player-1 win, then player-2
    win, then draw; a game in which none of the three flags is set is not
    counted.

    Args:
        Q_table: dict mapping ``(row, col)`` to its learned value.
        num_games: Number of games to play.

    Returns:
        Tuple ``(player_1_wins, player_2_wins, draws)``.
    """
    tally = [0, 0, 0]  # [player-1 wins, player-2 wins, draws]

    for _ in range(num_games):
        first_won, second_won, tied = play_game_1(Q_table)
        for slot, flag in enumerate((first_won, second_won, tied)):
            if flag:
                tally[slot] += 1
                break  # only the highest-priority outcome is recorded

    return tally[0], tally[1], tally[2]

# Tally results over 100 games of user 1 (follows the policy) against user 2 (moves at random).
win_count_p1, win_count_p2, draw_count = evaluate_policy_1(Q_table, num_games=100)
print("모델를 따르는 User 1의 승리 횟수:", win_count_p1)  # wins by user 1 (follows the model)
print("랜덤하게 행동하는 User 2의 승리 횟수:", win_count_p2)  # wins by user 2 (random moves)
print("무승부 횟수:", draw_count)  # number of draws


모델를 따르는 User 1의 승리 횟수: 100
랜덤하게 행동하는 User 2의 승리 횟수: 0
무승부 횟수: 0


In [None]:
def play_game_2(Q_table):
    """Play one game in which both players move greedily from the same Q-table.

    Args:
        Q_table: dict mapping ``(row, col)`` to its learned value.

    Returns:
        Tuple ``(p1_win, p2_win, board_full)`` of booleans computed on the
        final board.
    """
    board = np.zeros((BOARD_SIZE, BOARD_SIZE))
    current = 1  # player 1 moves first

    while True:
        snapshot = board.copy()
        open_cells = get_available_actions(board)
        move = choose_action_learn(snapshot, open_cells, Q_table)

        board[move] = current
        if is_terminal_state(board):
            break
        current *= -1  # hand the turn to the other player

    first_won = check_win(board, 1)
    second_won = check_win(board, -1)
    board_full = all(all(cell != 0 for cell in row) for row in board)
    return first_won, second_won, board_full

# 게임 플레이 및 결과 계산
def evaluate_policy_2(Q_table, num_games):
    """Run ``play_game_2`` repeatedly and tally the outcomes.

    Each game is counted once, with priority player-1 win, then player-2
    win, then draw; a game with none of the three flags set is not counted.

    Args:
        Q_table: dict mapping ``(row, col)`` to its learned value.
        num_games: Number of games to play.

    Returns:
        Tuple ``(player_1_wins, player_2_wins, draws)``.
    """
    first_wins = second_wins = ties = 0

    games_left = num_games
    while games_left > 0:
        games_left -= 1
        first_won, second_won, tied = play_game_2(Q_table)

        if first_won:
            first_wins += 1
            continue
        if second_won:
            second_wins += 1
            continue
        if tied:
            ties += 1

    return first_wins, second_wins, ties

# Tally results over 100 games in which both users follow the learned model.
win_count_p1, win_count_p2, draw_count = evaluate_policy_2(Q_table, num_games=100)
print("모델를 따르는 User 1의 승리 횟수:", win_count_p1)  # wins by user 1 (follows the model)
print("모델를 따르는 User 2의 승리 횟수:", win_count_p2)  # wins by user 2 (follows the model)
print("무승부 횟수:", draw_count)  # number of draws

모델를 따르는 User 1의 승리 횟수: 100
모델를 따르는 User 2의 승리 횟수: 0
무승부 횟수: 0
