# Generate Decision Tree Dataset

This file is responsible for generating a dataset of each Decision Tree model predictions and each true labels

### Import Statements

In [9]:
import pandas as pd
import os 
import sys

p = os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))
sys.path.append(p)

from DecisionTree.ID3Tree import ID3Tree
from DecisionTree.Ruleset import Ruleset
from DecisionTree.Bootstrap_Aggregating import Bagging
from sklearn.model_selection import train_test_split


### Create pd.DataFrame as dataset

In [10]:
results = pd.DataFrame(columns=['ID3', 'Ruleset', 'Bagging', 'Actual'])
results.columns

Index(['ID3', 'Ruleset', 'Bagging', 'Actual'], dtype='object')

### Load MCTS data

In [11]:
data_file = os.path.join(p, 'datasets', 'monte_carlo_data.csv')
data = pd.read_csv(data_file, delimiter=';')
data['played'] = data['played'].astype(int)
data

Unnamed: 0,cel1,cel2,cel3,cel4,cel5,cel6,cel7,cel8,cel9,cel10,...,cel36,cel37,cel38,cel39,cel40,cel41,cel42,pieces,turn,played
0,0,0,0,0,0,0,0,0,0,0,...,0,-1,-1,1,1,-1,1,12,1,5
1,0,0,0,0,0,0,0,0,0,0,...,0,-1,1,1,-1,-1,-1,13,1,5
2,0,0,0,0,0,0,0,0,0,0,...,0,-1,-1,1,-1,1,1,7,1,5
3,0,0,0,0,0,-1,0,0,0,0,...,-1,1,0,1,-1,1,-1,17,1,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-1,1,5,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30012,0,0,0,0,0,0,0,0,0,0,...,-1,1,-1,-1,0,1,1,10,1,3
30013,0,0,0,0,0,0,0,0,0,0,...,0,-1,-1,1,0,1,0,9,1,3
30014,0,0,0,0,0,0,0,0,0,0,...,1,1,-1,-1,1,-1,1,17,1,1
30015,0,0,0,0,0,0,1,0,0,0,...,1,-1,-1,1,-1,-1,1,21,1,2


In [12]:
cel_columns = [f'cel{i}' for i in range(1, 43)]

# Update the DataFrame
data.loc[data['turn'] == -1, cel_columns] *= -1  # Flip -1 to 1 and 1 to -1
data['turn'] = 1  # Change turn to 1
data.drop(columns=['turn'], inplace=True)
data

Unnamed: 0,cel1,cel2,cel3,cel4,cel5,cel6,cel7,cel8,cel9,cel10,...,cel35,cel36,cel37,cel38,cel39,cel40,cel41,cel42,pieces,played
0,0,0,0,0,0,0,0,0,0,0,...,-1,0,-1,-1,1,1,-1,1,12,5
1,0,0,0,0,0,0,0,0,0,0,...,1,0,-1,1,1,-1,-1,-1,13,5
2,0,0,0,0,0,0,0,0,0,0,...,0,0,-1,-1,1,-1,1,1,7,5
3,0,0,0,0,0,-1,0,0,0,0,...,-1,-1,1,0,1,-1,1,-1,17,3
4,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,-1,1,5,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30012,0,0,0,0,0,0,0,0,0,0,...,0,-1,1,-1,-1,0,1,1,10,3
30013,0,0,0,0,0,0,0,0,0,0,...,0,0,-1,-1,1,0,1,0,9,3
30014,0,0,0,0,0,0,0,0,0,0,...,0,1,1,-1,-1,1,-1,1,17,1
30015,0,0,0,0,0,0,1,0,0,0,...,-1,1,-1,-1,1,-1,-1,1,21,2


In [13]:
X, y = data[data.columns[:-1]], data[data.columns[-1]]
X = X.to_numpy().tolist()
y = y.to_numpy().tolist()
feature_names = data.columns[:-1].tolist()
print("Feature names: ", feature_names)
print(y)

Feature names:  ['cel1', 'cel2', 'cel3', 'cel4', 'cel5', 'cel6', 'cel7', 'cel8', 'cel9', 'cel10', 'cel11', 'cel12', 'cel13', 'cel14', 'cel15', 'cel16', 'cel17', 'cel18', 'cel19', 'cel20', 'cel21', 'cel22', 'cel23', 'cel24', 'cel25', 'cel26', 'cel27', 'cel28', 'cel29', 'cel30', 'cel31', 'cel32', 'cel33', 'cel34', 'cel35', 'cel36', 'cel37', 'cel38', 'cel39', 'cel40', 'cel41', 'cel42', 'pieces']
[5, 5, 5, 3, 6, 4, 6, 3, 5, 2, 3, 2, 5, 4, 2, 4, 0, 3, 5, 4, 4, 4, 3, 3, 4, 4, 3, 5, 2, 4, 2, 5, 3, 4, 6, 1, 4, 4, 1, 2, 2, 1, 2, 1, 1, 5, 5, 3, 2, 6, 3, 1, 3, 2, 4, 3, 2, 4, 3, 6, 4, 2, 2, 6, 0, 1, 3, 2, 3, 3, 5, 4, 4, 1, 4, 1, 2, 0, 1, 5, 3, 1, 0, 1, 5, 3, 1, 2, 4, 0, 4, 1, 1, 4, 1, 5, 3, 6, 2, 1, 2, 4, 0, 4, 3, 3, 6, 3, 2, 3, 1, 2, 1, 3, 3, 4, 0, 6, 5, 3, 2, 3, 0, 0, 2, 4, 5, 5, 4, 3, 5, 3, 4, 2, 4, 4, 6, 1, 1, 5, 5, 4, 1, 3, 5, 6, 3, 4, 2, 3, 4, 4, 2, 4, 6, 4, 3, 2, 3, 3, 1, 1, 2, 3, 3, 2, 4, 3, 4, 4, 3, 5, 3, 4, 1, 3, 4, 0, 5, 1, 4, 2, 3, 2, 4, 2, 5, 3, 0, 0, 4, 5, 3, 4, 4, 5, 5, 2, 3, 3, 4, 

### Create init variables

In [14]:
def attribute_vartypes(attribute):
    """
    Function to determine the attribute types of the dataset.
    :param X: The dataset
    :return: A list of attribute types
    """

    return 'continuous' if attribute == 'pieces' else 'discrete'

In [15]:
# Combine features and labels
data = [x + [label] for x, label in zip(X, y)]

# Split data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
type_map = {attribute : attribute_vartypes(attribute) for attribute in feature_names}
print("Type map: ", type_map)


Type map:  {'cel1': 'discrete', 'cel2': 'discrete', 'cel3': 'discrete', 'cel4': 'discrete', 'cel5': 'discrete', 'cel6': 'discrete', 'cel7': 'discrete', 'cel8': 'discrete', 'cel9': 'discrete', 'cel10': 'discrete', 'cel11': 'discrete', 'cel12': 'discrete', 'cel13': 'discrete', 'cel14': 'discrete', 'cel15': 'discrete', 'cel16': 'discrete', 'cel17': 'discrete', 'cel18': 'discrete', 'cel19': 'discrete', 'cel20': 'discrete', 'cel21': 'discrete', 'cel22': 'discrete', 'cel23': 'discrete', 'cel24': 'discrete', 'cel25': 'discrete', 'cel26': 'discrete', 'cel27': 'discrete', 'cel28': 'discrete', 'cel29': 'discrete', 'cel30': 'discrete', 'cel31': 'discrete', 'cel32': 'discrete', 'cel33': 'discrete', 'cel34': 'discrete', 'cel35': 'discrete', 'cel36': 'discrete', 'cel37': 'discrete', 'cel38': 'discrete', 'cel39': 'discrete', 'cel40': 'discrete', 'cel41': 'discrete', 'cel42': 'discrete', 'pieces': 'continuous'}


### Explanation of the Decision Tree Results Generation Code

This code trains three different decision tree-based models (ID3, Ruleset, and Bagging) on Connect Four data and evaluates their predictions on a test set. The results are saved to a CSV file. Here’s a breakdown of the process:

1. **Model Training:**
   - **ID3 Tree:** Trains an ID3 decision tree and extracts its rules.
   - **Ruleset:** Trains a ruleset-based classifier.
   - **Bagging:** Trains a bagging ensemble of decision trees.

2. **Prediction Function (`process_row`):**
   - For each row in the test set, it predicts the move using:
     - The ID3 rules (returns the first non-None prediction, or -1 if none found).
     - The ruleset model.
     - The bagging model.
   - Returns a list containing all three predictions and the actual move.

3. **Parallel Prediction:**
   - Uses Python’s `ThreadPoolExecutor` to speed up predictions by processing multiple rows in parallel.

4. **Results Saving:**
   - Collects all predictions and actual values into a DataFrame.
   - Saves the DataFrame as `decision_tree_data.csv` in the `datasets` folder.

5. **Safety Prompt:**
   - Before running, the code will warn you that running this will overwrite the results file and may take a long time.


In [16]:
from concurrent.futures import ThreadPoolExecutor

def process_row(row, rules, ruleset, bagging):
    # ID3 Tree Prediction
    id3_pred = None
    for rule in rules:
        id3_pred = rule.predict(row)
        if id3_pred is not None:
            break
    if id3_pred is None:
        id3_pred = -1  # ERROR_CLASS

    # Ruleset Prediction
    ruleset_pred, _ = ruleset.predict(row)

    # Bagging Prediction
    bagging_pred, _ = bagging.predict(row)

    return [id3_pred, ruleset_pred, bagging_pred, row[-1]]

def generate_results():
    print("Training ID3 Tree")
    tree = ID3Tree(attributes=feature_names, data=train_data, default=0, type_map=type_map)
    tree.train()
    rules = tree.build_rules()
    tree.save_model(os.path.join(p, 'models', 'id3_analize.pkl'))

    print("Training Ruleset")
    ruleset = Ruleset(feature_names, train_data, 0, type_map)
    ruleset.train()
    ruleset.save_model(os.path.join(p, 'models', 'ruleset_analize.pkl'))

    print("Training Bagging")
    bagging = Bagging(feature_names, train_data, 0, type_map)
    bagging.train()
    bagging.save_model(os.path.join(p, 'models', 'bagging_analize.pkl'))

    print("Starting predictions with multithreading...")
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_row, row, rules, ruleset, bagging) for row in test_data]
        results_list = [f.result() for f in futures]

    results_df = pd.DataFrame(results_list, columns=["ID3_Pred", "Ruleset_Pred", "Bagging_Pred", "Actual"])
    save_path = os.path.join(p, 'datasets', 'decision_tree_data_AI_VS_AI.csv')
    results_df.to_csv(save_path, index=False, sep=';')

    print("Results saved to", save_path)


print ("DANGER: you are about to overwrite the file with new data")
print ("This process will take a long time to run and will overwrite the file, potentially losing the already generated data")
string = input("Are you sure you want to continue? (yes/no): ")
if (string == "yes"): 
    generate_results()       # PLEASE BE CAREFUL WITH THIS FUNCTION, IT WILL TAKE A LONG TIME TO RUN AND WILL OVERWRITE THE FILE
#                            # POTENTIALLY LOSING THE ALREADY GENERATED DATA

DANGER: you are about to overwrite the file with new data
This process will take a long time to run and will overwrite the file, potentially losing the already generated data
Training ID3 Tree
Model saved to c:\Users\diogo\OneDrive\Documents\GitHub\MCTS_and_DecisionTree_for_ConnectFourGame\models\id3_analize.pkl
Training Ruleset
Model saved to c:\Users\diogo\OneDrive\Documents\GitHub\MCTS_and_DecisionTree_for_ConnectFourGame\models\ruleset_analize.pkl
Training Bagging
Training classifier #1
Training classifier #2
Training classifier #3
Training classifier #4
Training classifier #5
Training classifier #6
Training classifier #7
Training classifier #8
Training classifier #9
Training classifier #10
Model saved to c:\Users\diogo\OneDrive\Documents\GitHub\MCTS_and_DecisionTree_for_ConnectFourGame\models\bagging_analize.pkl
Starting predictions with multithreading...
Results saved to c:\Users\diogo\OneDrive\Documents\GitHub\MCTS_and_DecisionTree_for_ConnectFourGame\datasets\decision_tree_data