In [1]:
import numpy as np
import random
import pickle

def gameBoard(rows=6, columns=7):
    return np.zeros((rows, columns))

def isLegal(board, column):
    return board[len(board)-1][column] == 0

def availableRow(board, column):
    return next((row for row in range(len(board)) if board[row][column] == 0), None)

def legalMove(board):
    return [column for column in range(board.shape[1]) if isLegal(board, column)]


    
def findAvailablePosition(board, marker):
    rows, cols = board.shape
    for row, row_vals in enumerate(board):
        for col, col_val in enumerate(row_vals[:-3]):
            if all(elem == marker for elem in row_vals[col:col+4]):
                return row, col
        for col, col_vals in zip(range(cols), (board[r][col] for r in range(row, min(row+4, rows)))):
            if all(elem == marker for elem in col_vals):
                return row, col
        for col, col_vals in enumerate(row_vals[:-3]):
            if row < rows-3 and col < cols-3:
                diag_vals = [board[row+i][col+i] for i in range(4)]
                if all(elem == marker for elem in diag_vals):
                    return row, col
        for col, col_vals in enumerate(row_vals[:-3]):
            if row >= 3 and col < cols-3:
                diag_vals = [board[row-i][col+i] for i in range(4)]
                if all(elem == marker for elem in diag_vals):
                    return row, col
    else:
        return -1, -1

def checkWin(board, marker):
    rows, cols = board.shape
    for row in range(rows):
        for col in range(cols - 3):
            if all(board[row][col + i] == marker for i in range(4)):
                return True

    for row in range(rows - 3):
        for col in range(cols):
            if all(board[row + i][col] == marker for i in range(4)):
                return True

    for row in range(rows - 3):
        for col in range(cols - 3):
            if all(board[row + i][col + i] == marker for i in range(4)):
                return True

    for row in range(3, rows):
        for col in range(cols - 3):
            if all(board[row - i][col + i] == marker for i in range(4)):
                return True

    return False


def isFinLegal(board, defaultmark, minmaxmark):
    return any(checkWin(board, marker) for marker in (defaultmark, minmaxmark)) or not legalMove(board)

def defaultPlayerMove(gamess, board, defaultmark, minmaxmark):
    if isFinLegal(board, defaultmark, minmaxmark):
        defaultrow, defaultcol = findAvailablePosition(board, defaultmark)
        if defaultrow != -1:
            return defaultrow, defaultcol
        else:
            mmrow, mmcol = findAvailablePosition(board, minmaxmark)
            if mmrow != -1:
                return mmrow, mmcol
            else:
                availablemoves = legalMove(board)
                randrow = availableRow(board, random.choice(availablemoves))
                randcol = random.choice(availablemoves)
                return randrow, randcol
    else:
        availablemoves = legalMove(board)
        randrow = availableRow(board, random.choice(availablemoves)) 
        randcol = random.choice(availablemoves)

        return randrow, randcol

def findPos(positions):
    return int(''.join([str(int(position)) for position in positions.flatten()]))

def qlearnVal(ql_states, crr_grid, crr_pos):
    position = findPos(crr_grid)
    if position not in ql_states:
        ql_states[(position, crr_pos)] = 0
    return ql_states[(position, crr_pos)]

def qlearnBestPos(ql_states, crr_grid, availablemoves, epsilon):
    return random.choice(availablemoves) if random.random() < epsilon else max([(qlearnVal(ql_states, crr_grid, position), position) for position in availablemoves], key=lambda x: x[0])[1]

def modifyqLearn(ql_states, crr_grid, crr_pos, reward, nextgrid, availablemoves, alpha=0.1, gamma=0.99):
    qvalb = max([qlearnVal(ql_states, nextgrid, nextpos) for nextpos in availablemoves], default=0)
    qvalopti = qlearnVal(ql_states, crr_grid, crr_pos) + alpha * ((reward + gamma * qvalb) - qlearnVal(ql_states, crr_grid, crr_pos))
    position = findPos(crr_grid)
    ql_states[(position, crr_pos)] = qvalopti

def modifyExploration(epsilon):
    return max(epsilon * 0.999, 0.1)

def dumpModel(ql_states, file_path="connect4_QL_500k.pickle"):
    with open(file_path, "wb") as file:
        pickle.dump(ql_states, file)

def trainQLearn():
    ql_states = {}
    ql_win = def_win = draw = 0
    qlmarkr = 1
    defaultmark = 2
    total_episodes = 500000
    
    for episode in range(total_episodes):
        board = gameBoard()
        epsilon = 1.0

        while True:
            ql_availablemoves = legalMove(board)

            if len(ql_availablemoves) == 0:
                break

            qlCol = qlearnBestPos(ql_states, board, ql_availablemoves, epsilon)
            qlRow = availableRow(board, qlCol)
            board[qlRow][qlCol] = qlmarkr

            if checkWin(board, qlmarkr):
                ql_win += 1
                modifyqLearn(ql_states, board, qlCol, 1, board, [])
                break
            elif checkWin(board, defaultmark):
                def_win += 1
                modifyqLearn(ql_states, board, qlCol, -1, board, [])
                break
            elif len(legalMove(board)) == 0:
                draw += 1
                modifyqLearn(ql_states, board, qlCol, 0, board, [])
                break
            else:
                modifyqLearn(ql_states, board, qlCol, 0, board, legalMove(board))

            defRow, defCol = defaultPlayerMove(board, board, defaultmark, qlmarkr)
            board[defRow][defCol] = defaultmark

            if checkWin(board, qlmarkr):
                ql_win += 1
                modifyqLearn(ql_states, board, defCol, 1, board, [])
                break
            elif checkWin(board, defaultmark):
                def_win += 1
                modifyqLearn(ql_states, board, defCol, -1, board, [])
                break
            elif len(legalMove(board)) == 0:
                draw += 1
                modifyqLearn(ql_states, board, defCol, 0, board, [])
                break
            else:
                modifyqLearn(ql_states, board, defCol, 0, board, legalMove(board))

            epsilon = modifyExploration(epsilon)

        if episode % 1000 == 0:
            print(f"Episode {episode}: QLearning wins - {ql_win}, Default Player wins - {def_win}, Draws - {draw}")

    return ql_states, ql_win, def_win, draw, total_episodes

ql_states, ql_win, def_win, draw, total_episodes = trainQLearn()

print(f"QLearning wins: {ql_win}")
print(f"Default Player wins: {def_win}")
print(f"Draws: {draw}")
print(f"Total Episodes: {total_episodes}")

dumpModel(ql_states)



Episode 0: QLearning wins - 1, SI Agent wins - 0, Draws - 0
Episode 1000: QLearning wins - 672, SI Agent wins - 317, Draws - 12
Episode 2000: QLearning wins - 1345, SI Agent wins - 631, Draws - 25
Episode 3000: QLearning wins - 2017, SI Agent wins - 945, Draws - 39
Episode 4000: QLearning wins - 2663, SI Agent wins - 1279, Draws - 59
Episode 5000: QLearning wins - 3326, SI Agent wins - 1598, Draws - 77
Episode 6000: QLearning wins - 3986, SI Agent wins - 1925, Draws - 90
Episode 7000: QLearning wins - 4613, SI Agent wins - 2282, Draws - 106
Episode 8000: QLearning wins - 5270, SI Agent wins - 2610, Draws - 121
Episode 9000: QLearning wins - 5908, SI Agent wins - 2959, Draws - 134
Episode 10000: QLearning wins - 6551, SI Agent wins - 3298, Draws - 152
Episode 11000: QLearning wins - 7183, SI Agent wins - 3650, Draws - 168
Episode 12000: QLearning wins - 7843, SI Agent wins - 3981, Draws - 177
Episode 13000: QLearning wins - 8498, SI Agent wins - 4310, Draws - 193
Episode 14000: QLearnin