# Connect4 Data Generation Script
This notebook generates datasets for Connect4 using Monte Carlo Tree Search (MCTS).

## Import Required Libraries
The following libraries are used for data generation, MCTS, and file handling.

In [1]:
import random
import pandas as pd
from copy import deepcopy
from mcts_ucb import MctsAlgo, Node
from connect4API import Connect4
import math
import time
import os

ImportError: cannot import name 'MctsAlgo' from 'mcts_ucb' (c:\Users\arcan\OneDrive\Ambiente de Trabalho\My apps\python\connect4Project\other_version\src(in construction)\mcts_ucb.py)

## Helper Functions
These functions create and manipulate the Connect4 board, generate random game states, label each state with the move chosen by MCTS, and save the resulting datasets to Excel.

In [None]:
def create_empty_board() -> list[list[str]]:
    """Return a fresh ROWS x COLS grid in which every cell holds '-'."""
    # Each row is built by its own multiplication, so no two rows share a list.
    return [['-'] * COLS for _ in range(ROWS)]

def make_random_move(board: list[list[str]], col: int, player: str) -> bool:
    """Place ``player``'s piece in column ``col`` of ``board`` (mutated in place).

    The column is scanned from row index ``ROWS - 1`` down to 0 and the piece
    goes into the first cell found that still holds '-'.

    Returns:
        True if a piece was placed, False if the column has no '-' cell left.
    """
    row = ROWS - 1
    while row >= 0:
        if board[row][col] == '-':
            board[row][col] = player
            return True
        row -= 1
    return False

def generate_valid_board(num_moves: int) -> tuple[list[list[str]], str]:
    """Play ``num_moves`` random legal drops on an empty board.

    Players alternate starting with 'O'. A column is drawn uniformly at random
    for each move and redrawn whenever the drawn column is already full.

    Args:
        num_moves: Number of pieces to place; must not exceed ROWS * COLS.

    Returns:
        ``(board, player_to_move)`` where ``player_to_move`` is the player who
        would make the next move after the ``num_moves`` pieces were placed.

    Raises:
        ValueError: If ``num_moves`` is larger than the number of cells.
    """
    capacity = ROWS * COLS
    if num_moves > capacity:
        # Without this guard the loop below never ends: once the board is full
        # no column accepts a piece, so move_count can never reach num_moves.
        raise ValueError(
            f"num_moves={num_moves} exceeds board capacity ({capacity})"
        )

    board = create_empty_board()
    current_player = 'O'
    move_count = 0
    while move_count < num_moves:
        col = random.randint(0, COLS - 1)
        # make_random_move returns False for a full column; just draw again.
        if make_random_move(board, col, current_player):
            move_count += 1
            current_player = 'X' if current_player == 'O' else 'O'
    return board, current_player

def flatten_board(board: list[list[str]]) -> list[str]:
    """Concatenate the rows of ``board``, in order, into one new flat list."""
    cells = []
    for row in board:
        cells.extend(row)
    return cells

def display_progress_bar(progress: int, total: int, bar_length: int = 100):
    """Redraw a one-line text progress bar on stdout.

    The line starts with a carriage return so repeated calls overwrite each
    other, and no newline is emitted.

    Args:
        progress: Units of work finished so far.
        total: Units of work overall (used as a divisor).
        bar_length: Width of the bar in characters.
    """
    filled = int(bar_length * progress / total)
    remaining = bar_length - filled
    percent = round(progress / total * 100, 1)
    bar = ''.join(['#' * filled, '-' * remaining])
    print(f"\r/{bar}/ {percent}%", end='', flush=False)

def generate_dataset(createFor: list[str]) -> dict[str, list[list[str]]]:
    """Build MCTS-labelled Connect4 positions for the requested game stages.

    For every stage of ``GAMESTATES`` that is named in ``createFor``, random
    boards are generated until ``ITERATIONS_PER_GAMESTATE`` positions that are
    not already game-over have been collected. Each kept position is searched
    with MCTS for ``iterations0`` iterations and stored as the flattened board
    followed by the chosen move converted to ``str``.

    Args:
        createFor: Names of the stages to generate. Stages that are not listed
            keep an empty list in the result.

    Returns:
        A dict with one entry per stage in ``GAMESTATES``; each value is a
        list of rows (board cells followed by the move label).
    """
    datasets = {stage: [] for stage in GAMESTATES}
    mcts = MctsAlgo(C=C0, reset=resetTree0, drawValue=drawValue0)
    # Same values as the previous literal (6, 7).
    # NOTE(review): argument order assumed to be (rows, cols) — confirm against Connect4.
    connect4 = Connect4(ROWS, COLS)
    for gamestate, (min_moves, max_moves) in GAMESTATES.items():
        if gamestate not in createFor:
            continue
        count = 0
        while count < ITERATIONS_PER_GAMESTATE:
            display_progress_bar(count, ITERATIONS_PER_GAMESTATE)
            num_moves = random.randint(min_moves, max_moves)
            # generate_valid_board returns exactly (board, player_to_move);
            # unpacking four names here raised ValueError on the first pass.
            board, turn = generate_valid_board(num_moves)
            # Snapshot the position before the game/MCTS objects receive the
            # board, so the stored row cannot be affected if they mutate it.
            flat_board = flatten_board(board)
            connect4.getIntoDesiredState(turn, gameState=board)

            if connect4.checkGameOver():  # skip positions that are already finished
                continue

            # Run MCTS on this position and record the move it selects.
            mcts.run_mcts(iterations0, connect4)
            bestMove = str(mcts.choose_best_move())

            flat_board.append(bestMove)
            datasets[gamestate].append(flat_board)
            count += 1
        if ITERATIONS_PER_GAMESTATE > 0:
            # The in-loop call only ever shows count < total; draw the final
            # 100% state and end the line so later output starts cleanly.
            display_progress_bar(ITERATIONS_PER_GAMESTATE, ITERATIONS_PER_GAMESTATE)
            print()
    return datasets

def save_to_excel(datasets: dict[str, list[list[str]]]):
    """Write every non-empty dataset to ``connect4_<gamestate>.xlsx``.

    Files are written to the current working directory. Each row is expected
    to hold the board cells followed by one label; the columns are named
    ``cell_0 .. cell_{n-1}`` plus ``result``, with ``n`` taken from the first
    row of each dataset.

    Args:
        datasets: Mapping of game-stage name to its list of rows.
    """
    for gamestate, rows in datasets.items():
        if not rows:
            # generate_dataset returns an entry for every stage, including the
            # ones that were not requested. Writing those empty lists would
            # overwrite files produced by earlier runs for the other stages.
            continue
        # Derive the number of board cells from the data (last item is the label).
        n_cells = len(rows[0]) - 1
        col_names = [f"cell_{i}" for i in range(n_cells)] + ["result"]
        df = pd.DataFrame(rows, columns=col_names)
        filename = f"connect4_{gamestate}.xlsx"
        df.to_excel(filename, index=False)
        print(f"Dataset salvo como {filename}")

## Configuration Parameters
Define constants and parameters for MCTS and game state distribution.

In [3]:
# --- MCTS settings (consumed by generate_dataset) ---
C0 = math.sqrt(2)        # passed to MctsAlgo as C
iterations0 = 15_000     # MCTS iterations run for each position
resetTree0 = True        # passed to MctsAlgo as reset
drawValue0 = 0           # passed to MctsAlgo as drawValue

# --- Board dimensions ---
ROWS, COLS = 6, 7

# --- Suggested distribution: (min_moves, max_moves) already played per stage ---
GAMESTATES = dict(
    early=(4, 14),    # games with fewer than 4 pieces are avoided
    mid=(15, 28),
    late=(29, 41),    # capped at 41 so the board is never a full-board draw (42)
)

# Each group member generates 3333 positions per game stage.
ITERATIONS_PER_GAMESTATE = 3333

## Generate Dataset for Early Game States
This section generates datasets for early game states and saves them to Excel.

In [4]:
# Generate the "early" stage, report the wall-clock time, then save to Excel.
start = time.time()
datasets = generate_dataset(createFor=["early"])
end = time.time()
# Convert to minutes before rounding; rounding the seconds and then dividing
# by 60 printed an unrounded value.
print(f"Run time: {round((end - start) / 60, 4)}m")
save_to_excel(datasets)

/----------------------------------------------------------------------------------------------------/ 0.0%

TypeError: list indices must be integers or slices, not NoneType

## Generate Dataset for Mid Game States
This section generates datasets for mid game states and saves them to Excel.

In [None]:
# Generate the "mid" stage, report the wall-clock time, then save to Excel.
start = time.time()
datasets = generate_dataset(createFor=["mid"])
end = time.time()
# Convert to minutes before rounding; rounding the seconds and then dividing
# by 60 printed an unrounded value.
print(f"Run time: {round((end - start) / 60, 4)}m")
save_to_excel(datasets)

## Generate Dataset for Late Game States
This section generates datasets for late game states and saves them to Excel.

In [None]:
# Generate the "late" stage, report the wall-clock time, then save to Excel.
start = time.time()
datasets = generate_dataset(createFor=["late"])
end = time.time()
# Convert to minutes before rounding; rounding the seconds and then dividing
# by 60 printed an unrounded value.
print(f"Run time: {round((end - start) / 60, 4)}m")
save_to_excel(datasets)

## Check for Duplicate Rows in the Dataset
This cell loads each generated dataset (early, mid, and late) from its Excel file, checks for duplicate rows (ignoring the 'result' column), and displays the duplicate count and a sample of the duplicate rows for each file.

In [None]:
# Locations of the three generated Excel files.
# NOTE(review): these are machine-specific absolute paths, while save_to_excel
# writes into the current working directory — presumably the files were moved
# here afterwards; confirm before running on another machine.
file_path_early = r"C:\Users\arcan\OneDrive\Ambiente de Trabalho\My apps\python\connect4Project\other_version\datasets\connect4_early.xlsx"
file_path_mid = r"C:\Users\arcan\OneDrive\Ambiente de Trabalho\My apps\python\connect4Project\other_version\datasets\connect4_mid.xlsx"
file_path_late = r"C:\Users\arcan\OneDrive\Ambiente de Trabalho\My apps\python\connect4Project\other_version\datasets\connect4_late.xlsx"

for file_path in [file_path_early, file_path_mid, file_path_late]:
    df = pd.read_excel(file_path)

    # Compare every column except the last one ('result'): rows with the same
    # board are duplicates even if their labels differ. keep=False marks all
    # members of each duplicate group, not only the repeats.
    duplicate_rows = df[df.duplicated(subset=df.columns[:-1], keep=False)]

    duplicate_count = len(duplicate_rows)
    duplicate_rows_sample = duplicate_rows.head()

    # A bare expression inside a loop is not echoed by the notebook, so the
    # findings have to be printed explicitly.
    print(f"{file_path}: {duplicate_count} duplicate rows")
    if duplicate_count:
        print(duplicate_rows_sample)