# Generate Decision Tree Dataset

This notebook generates a dataset containing the predictions of each Decision Tree model alongside the true labels.

### Import Statements

In [1]:
import pandas as pd
import os 
import sys

# The *string* "__file__" is used on purpose: its dirname is the empty string,
# so the joined path is simply '..' and `p` resolves to the parent of the
# current working directory.
p = os.path.abspath(
    os.path.join(os.path.dirname("__file__"), '..')
)
# Put that directory on the import path; the `DecisionTree` imports that follow
# presumably rely on it (verify against the repository layout).
sys.path.append(p)

from DecisionTree.ID3Tree import ID3Tree
from DecisionTree.Ruleset import Ruleset
from DecisionTree.Bootstrap_Aggregating import Bagging
from sklearn.model_selection import train_test_split


### Create pd.DataFrame as dataset

In [2]:
# Empty frame with the columns 'ID3', 'Ruleset', 'Bagging' and 'Actual'.
# NOTE(review): none of the visible cells read `results` again; the final
# dataset is assembled separately inside `generate_results` -- confirm whether
# this placeholder is still needed.
results = pd.DataFrame(
    columns=['ID3', 'Ruleset', 'Bagging', 'Actual'],
)
results.columns

Index(['ID3', 'Ruleset', 'Bagging', 'Actual'], dtype='object')

### Load MCTS data

In [3]:
# Read the semicolon-separated CSV from the project's `datasets` folder and
# cast the `played` column to int in the same expression.
data_file = os.path.join(p, 'datasets', 'monte_carlo_AI_VS_AI.csv')
data = pd.read_csv(data_file, sep=';').astype({'played': int})
data

Unnamed: 0,cel1,cel2,cel3,cel4,cel5,cel6,cel7,cel8,cel9,cel10,...,cel36,cel37,cel38,cel39,cel40,cel41,cel42,pieces,turn,played
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,-1,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,-1,0,0,2,1,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,-1,0,0,3,-1,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,-1,0,0,4,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14734,0,0,-1,1,0,0,0,0,-1,1,...,1,1,-1,1,-1,-1,-1,30,1,1
14735,0,1,-1,1,0,0,0,0,-1,1,...,1,1,-1,1,-1,-1,-1,31,-1,6
14736,0,1,-1,1,0,0,-1,0,-1,1,...,1,1,-1,1,-1,-1,-1,32,1,4
14737,0,1,-1,1,1,0,-1,0,-1,1,...,1,1,-1,1,-1,-1,-1,33,-1,0


In [4]:
# Names of the 42 board-cell columns: cel1 .. cel42.
cel_columns = [f'cel{i}' for i in range(1, 43)]

# Bring every row to a single point of view: rows whose `turn` is -1 get all
# their cell values negated (1 <-> -1, 0 unchanged); afterwards `turn` carries
# no information and is removed.
# The guard makes this cell safe to re-run: once `turn` is gone the cells have
# already been negated, so they must not be flipped (or dropped) a second time.
if 'turn' in data.columns:
    data.loc[data['turn'] == -1, cel_columns] *= -1
    data = data.drop(columns=['turn'])
data

Unnamed: 0,cel1,cel2,cel3,cel4,cel5,cel6,cel7,cel8,cel9,cel10,...,cel35,cel36,cel37,cel38,cel39,cel40,cel41,cel42,pieces,played
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,-1,0,0,0,1,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,-1,0,0,2,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,-1,1,0,0,3,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,-1,0,0,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14734,0,0,-1,1,0,0,0,0,-1,1,...,1,1,1,-1,1,-1,-1,-1,30,1
14735,0,-1,1,-1,0,0,0,0,1,-1,...,-1,-1,-1,1,-1,1,1,1,31,6
14736,0,1,-1,1,0,0,-1,0,-1,1,...,1,1,1,-1,1,-1,-1,-1,32,4
14737,0,-1,1,-1,-1,0,1,0,1,-1,...,-1,-1,-1,1,-1,1,1,1,33,0


In [5]:
# Generate column names
c = [f"player1_cel{i}" for i in range(1, 43)]

# Extract and rename columns from the original dataset
player_1 = data[cel_columns].copy()
player_1.columns = c  # Rename columns to desired format

# Replace -1 with 0
player_1.replace(-1, 0, inplace=True)

c = [f"player2_cel{i}" for i in range(1, 43)]
player_2 = data[cel_columns].copy()
player_2.columns = c  # Rename columns to desired format
# Replace -1 with 0
player_2.replace(1, 0, inplace=True)
player_2.replace(-1, 1, inplace=True)

# Concatenate the two DataFrames
data = pd.concat([player_1, player_2, data['pieces'], data['played']], axis=1)
data

Unnamed: 0,player1_cel1,player1_cel2,player1_cel3,player1_cel4,player1_cel5,player1_cel6,player1_cel7,player1_cel8,player1_cel9,player1_cel10,...,player2_cel35,player2_cel36,player2_cel37,player2_cel38,player2_cel39,player2_cel40,player2_cel41,player2_cel42,pieces,played
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,2,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,3,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14734,0,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,1,1,1,30,1
14735,0,0,1,0,0,0,0,0,1,0,...,1,1,1,0,1,0,0,0,31,6
14736,0,1,0,1,0,0,0,0,0,1,...,0,0,0,1,0,1,1,1,32,4
14737,0,0,1,0,0,0,1,0,1,0,...,1,1,1,0,1,0,0,0,33,0


In [6]:
# Every column except the last is a feature; the last column is the label.
feature_names = data.columns[:-1].tolist()

# Plain nested Python lists for the features, a flat list for the labels.
X = data.iloc[:, :-1].to_numpy().tolist()
y = data.iloc[:, -1].to_numpy().tolist()

print("Feature names: ", feature_names)
print(y)

Feature names:  ['player1_cel1', 'player1_cel2', 'player1_cel3', 'player1_cel4', 'player1_cel5', 'player1_cel6', 'player1_cel7', 'player1_cel8', 'player1_cel9', 'player1_cel10', 'player1_cel11', 'player1_cel12', 'player1_cel13', 'player1_cel14', 'player1_cel15', 'player1_cel16', 'player1_cel17', 'player1_cel18', 'player1_cel19', 'player1_cel20', 'player1_cel21', 'player1_cel22', 'player1_cel23', 'player1_cel24', 'player1_cel25', 'player1_cel26', 'player1_cel27', 'player1_cel28', 'player1_cel29', 'player1_cel30', 'player1_cel31', 'player1_cel32', 'player1_cel33', 'player1_cel34', 'player1_cel35', 'player1_cel36', 'player1_cel37', 'player1_cel38', 'player1_cel39', 'player1_cel40', 'player1_cel41', 'player1_cel42', 'player2_cel1', 'player2_cel2', 'player2_cel3', 'player2_cel4', 'player2_cel5', 'player2_cel6', 'player2_cel7', 'player2_cel8', 'player2_cel9', 'player2_cel10', 'player2_cel11', 'player2_cel12', 'player2_cel13', 'player2_cel14', 'player2_cel15', 'player2_cel16', 'player2_cel17'

### Create init variables

In [7]:
def attribute_vartypes(attribute):
    """
    Classify a single attribute (feature) name as continuous or discrete.

    :param attribute: Name of the attribute to classify
    :return: 'continuous' when the attribute is 'pieces', 'discrete' otherwise
    """
    if attribute == 'pieces':
        return 'continuous'
    return 'discrete'

In [8]:
# Combine features and labels
data = [x + [label] for x, label in zip(X, y)]

# Split data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
type_map = {attribute : attribute_vartypes(attribute) for attribute in feature_names}
print("Type map: ", type_map)


Type map:  {'player1_cel1': 'discrete', 'player1_cel2': 'discrete', 'player1_cel3': 'discrete', 'player1_cel4': 'discrete', 'player1_cel5': 'discrete', 'player1_cel6': 'discrete', 'player1_cel7': 'discrete', 'player1_cel8': 'discrete', 'player1_cel9': 'discrete', 'player1_cel10': 'discrete', 'player1_cel11': 'discrete', 'player1_cel12': 'discrete', 'player1_cel13': 'discrete', 'player1_cel14': 'discrete', 'player1_cel15': 'discrete', 'player1_cel16': 'discrete', 'player1_cel17': 'discrete', 'player1_cel18': 'discrete', 'player1_cel19': 'discrete', 'player1_cel20': 'discrete', 'player1_cel21': 'discrete', 'player1_cel22': 'discrete', 'player1_cel23': 'discrete', 'player1_cel24': 'discrete', 'player1_cel25': 'discrete', 'player1_cel26': 'discrete', 'player1_cel27': 'discrete', 'player1_cel28': 'discrete', 'player1_cel29': 'discrete', 'player1_cel30': 'discrete', 'player1_cel31': 'discrete', 'player1_cel32': 'discrete', 'player1_cel33': 'discrete', 'player1_cel34': 'discrete', 'player1_ce

### Explanation of the Decision Tree Results Generation Code

This code trains three different decision tree-based models (ID3, Ruleset, and Bagging) on Connect Four data and evaluates their predictions on a test set. The results are saved to a CSV file. Here’s a breakdown of the process:

1. **Model Training:**
   - **ID3 Tree:** Trains an ID3 decision tree and extracts its rules.
   - **Ruleset:** Trains a ruleset-based classifier.
   - **Bagging:** Trains a bagging ensemble of decision trees.

2. **Prediction Function (`process_row`):**
   - For each row in the test set, it predicts the move using:
     - The ID3 rules (returns the first non-None prediction, or -1 if none found).
     - The ruleset model.
     - The bagging model.
   - Returns a list containing all three predictions and the actual move.

3. **Parallel Prediction:**
   - Uses Python’s `ThreadPoolExecutor` to speed up predictions by processing multiple rows in parallel.

4. **Results Saving:**
   - Collects all predictions and actual values into a DataFrame.
   - Saves the DataFrame as `decision_tree_data_AI_VS_AI.csv` (semicolon-separated) in the `datasets` folder.

5. **Safety Prompt:**
   - Before training starts, the cell warns that it will overwrite the results file and may take a long time to run, and it only proceeds if you answer `yes`.


In [9]:
from concurrent.futures import ThreadPoolExecutor

def process_row(row, rules, ruleset, bagging):
    """
    Collect the three model predictions for one test row.

    :param row: Sample to predict; its last element is reported as the actual label
    :param rules: Iterable of objects exposing ``predict(row)``; they are tried in order
    :param ruleset: Object whose ``predict(row)`` returns a 2-item result (prediction first)
    :param bagging: Object whose ``predict(row)`` returns a 2-item result (prediction first)
    :return: ``[id3_pred, ruleset_pred, bagging_pred, row[-1]]``
    """
    # ID3: rules are evaluated lazily, stopping at the first one that answers
    # with something other than None; -1 (ERROR_CLASS) when no rule answers.
    answers = (rule.predict(row) for rule in rules)
    id3_pred = next((answer for answer in answers if answer is not None), -1)

    # Only the first item of each 2-item result is kept.
    ruleset_pred, _ = ruleset.predict(row)
    bagging_pred, _ = bagging.predict(row)

    actual = row[-1]
    return [id3_pred, ruleset_pred, bagging_pred, actual]

def generate_results():
    """
    Train the ID3, Ruleset and Bagging models, predict every test row and
    write the predictions next to the actual labels as a CSV file.

    Reads the notebook globals ``p``, ``feature_names``, ``train_data``,
    ``test_data`` and ``type_map``. Each trained model is passed to its own
    ``save_model`` with a path under ``<p>/models``; the results table is
    written to ``<p>/datasets/decision_tree_data_AI_VS_AI.csv`` using ``;``
    as separator, replacing any existing file at that path.

    :return: None
    """
    models_dir = os.path.join(p, 'models')

    print("Training ID3 Tree")
    tree = ID3Tree(attributes=feature_names, data=train_data, default=0, type_map=type_map)
    tree.train()
    rules = tree.build_rules()
    tree.save_model(os.path.join(models_dir, 'id3_analize.pkl'))

    print("Training Ruleset")
    ruleset = Ruleset(feature_names, train_data, 0, type_map)
    ruleset.train()
    ruleset.save_model(os.path.join(models_dir, 'ruleset_analize.pkl'))

    print("Training Bagging")
    bagging = Bagging(feature_names, train_data, 0, type_map)
    bagging.train()
    bagging.save_model(os.path.join(models_dir, 'bagging_analize.pkl'))

    def predict_one(row):
        # Bind the trained models so the executor only has to pass the row.
        return process_row(row, rules, ruleset, bagging)

    print("Starting predictions with multithreading...")
    with ThreadPoolExecutor() as pool:
        # `map` yields results in the same order as `test_data`.
        results_list = list(pool.map(predict_one, test_data))

    column_names = ["ID3_Pred", "Ruleset_Pred", "Bagging_Pred", "Actual"]
    results_df = pd.DataFrame(results_list, columns=column_names)
    save_path = os.path.join(p, 'datasets', 'decision_tree_data_AI_VS_AI.csv')
    results_df.to_csv(save_path, index=False, sep=';')

    print("Results saved to", save_path)


# Safety prompt: training is slow and the run replaces the previously
# generated results file, so explicit confirmation is required first.
print ("DANGER: you are about to overwrite the file with new data")
print ("This process will take a long time to run and will overwrite the file, potentially losing the already generated data")
string = input("Are you sure you want to continue? (yes/no): ")
# Ignore surrounding whitespace and letter case so that answers such as
# "Yes" or "yes " are not silently treated as a refusal.
if string.strip().lower() == "yes":
    # PLEASE BE CAREFUL WITH THIS FUNCTION, IT WILL TAKE A LONG TIME TO RUN
    # AND WILL OVERWRITE THE FILE, POTENTIALLY LOSING THE ALREADY GENERATED DATA
    generate_results()
else:
    # Make it explicit that nothing happened instead of ending silently.
    print("Aborted: no models were trained and no file was overwritten.")

DANGER: you are about to overwrite the file with new data
This process will take a long time to run and will overwrite the file, potentially losing the already generated data
Training ID3 Tree
Model saved to c:\Users\diogo\OneDrive\Documents\GitHub\MCTS_and_DecisionTree_for_ConnectFourGame\models\id3_analize.pkl
Training Ruleset
Model saved to c:\Users\diogo\OneDrive\Documents\GitHub\MCTS_and_DecisionTree_for_ConnectFourGame\models\ruleset_analize.pkl
Training Bagging
Training classifier #1
Training classifier #2
Training classifier #3
Training classifier #4
Training classifier #5
Training classifier #6
Training classifier #7
Training classifier #8
Training classifier #9
Training classifier #10
Model saved to c:\Users\diogo\OneDrive\Documents\GitHub\MCTS_and_DecisionTree_for_ConnectFourGame\models\bagging_analize.pkl
Starting predictions with multithreading...
Results saved to c:\Users\diogo\OneDrive\Documents\GitHub\MCTS_and_DecisionTree_for_ConnectFourGame\datasets\decision_tree_data