In [10]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from itertools import product
import sys
import os

# Make the parent of the current working directory importable so that the
# project-local `src` package can be resolved from this notebook.
# NOTE(review): this relies on the kernel's working directory being a direct
# child of the directory that contains `src` -- confirm how the notebook is launched.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from src.generate_data import DataGenerator
from src.prepare_data import DataPreprocessor

In [14]:
# ---------------------------------------------------------------------------
# Simulation grid: the simulation loop evaluates every combination of the
# values listed here.
# ---------------------------------------------------------------------------

# --- Data-generating process (DGP) ---
n_classes_dep_var = [2, 3, 4]        # classes of the dependent variable
n_ind_vars = [3, 5, 7]               # number of independent variables
n_categorical_vars = np.arange(7)    # number of categorical variables: 0, 1, ..., 6
n_classes_ind_vars = [2, 3]          # classes of each categorical independent variable
n_samples = [100, 500, 1000, 10000]  # sample sizes
# NOTE(review): the grid contains combinations where the number of categorical
# variables exceeds the number of independent variables; presumably these are
# the cases for which the generator yields no data -- confirm.

# --- Decision tree (DT) ---
# 'entropy' and 'gini' correspond to 'Entropy reduction' and 'Gini reduction'.
splitting_criteria = ['entropy', 'gini']
# Both thresholds are given as fractions of the sample size.
min_samples_leaf_percents = [0.05, 0.1]   # 5% and 10%
min_samples_split_percents = [0.1, 0.2]   # 10% and 20%

# --- Shared helper ---
# One preprocessor instance is reused for every generated data set.
preprocessor = DataPreprocessor()

In [22]:
def _fraction_to_count(fraction, n_total, minimum):
    """Convert a fraction of ``n_total`` into an absolute sample count.

    Parameters
    ----------
    fraction : float
        Share of the sample, e.g. ``0.05`` for 5%.
    n_total : int
        Sample size the fraction refers to.
    minimum : int
        Smallest count that may be returned.

    Returns
    -------
    int
        ``floor(fraction * n_total)``, but never less than ``minimum``.
    """
    # Rounding to 9 decimals before truncating absorbs binary floating-point
    # error (0.29 * 100 evaluates to 28.999999999999996, which int() alone
    # would turn into 28) while genuinely fractional products are still floored.
    count = int(round(fraction * n_total, 9))
    return max(minimum, count)


# One row per (data set, tree configuration); converted to a DataFrame later.
results_list = []

# Seed NumPy's global generator so the whole run is repeatable. scikit-learn
# estimators left at random_state=None draw from this global state.
# NOTE(review): presumably DataGenerator / DataPreprocessor also use
# np.random -- confirm, otherwise their output is not covered by this seed.
np.random.seed(3103)

# The tree hyper-parameter grid is identical for every data set: build it once.
tree_param_grid = list(product(
    splitting_criteria, min_samples_leaf_percents, min_samples_split_percents
))

dgp_param_grid = product(
    n_classes_dep_var, n_ind_vars, n_categorical_vars, n_classes_ind_vars, n_samples
)

# idx counts every DGP combination (starting at 1), including the ones for
# which no data is generated, so simulation IDs may have gaps.
for idx, (n_classes_dep, n_ind, n_categorical, n_classes_ind, n_samp) in enumerate(dgp_param_grid, start=1):
    data_generator = DataGenerator(
        n_classes_dep_var=n_classes_dep,
        n_ind_vars=n_ind,
        n_categorical_vars=n_categorical,
        n_classes_ind_vars=n_classes_ind,
        n_samples=n_samp
    )
    data = data_generator.generate_data()
    if data is None:
        # No data was generated for this combination: nothing to fit.
        continue

    X_train, X_test, y_train, y_test = preprocessor.preprocess(data)

    for criterion, min_samples_leaf_percent, min_samples_split_percent in tree_param_grid:
        # Thresholds are fractions of the full sample size n_samp.
        # NOTE(review): the tree is fitted on X_train only, so relative to the
        # training set these thresholds are larger than the nominal
        # percentages -- confirm this is the intended design.
        # The lower bounds are scikit-learn's minimum valid integer values.
        min_samples_leaf = _fraction_to_count(min_samples_leaf_percent, n_samp, 1)
        min_samples_split = _fraction_to_count(min_samples_split_percent, n_samp, 2)

        clf = tree.DecisionTreeClassifier(
            criterion=criterion,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split
        )
        clf = clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        misclassification = 1 - accuracy_score(y_test, y_pred)

        # Append results for this configuration
        results_list.append([
            f'S{idx}',                 # Simulation ID
            n_classes_dep,             # Dependent variable classes
            n_ind,                     # Number of independent variables
            n_categorical,             # Number of categorical variables
            n_classes_ind,             # Categorical independent variable classes
            n_samp,                    # Sample size
            criterion,                 # Splitting criterion
            min_samples_leaf_percent,  # Minimum samples per leaf (percent)
            min_samples_split_percent, # Minimum samples for split (percent)
            misclassification          # Misclassification rate
        ])


In [23]:
# Column labels, in the same order as the values stored in each result row.
result_columns = [
    'Simulation_ID',
    'Classes_Dep_Var',
    'Num_Ind_Vars',
    'Num_Categorical_Vars',
    'Classes_Ind_Vars',
    'Sample_Size',
    'Splitting_Criterion',
    'Min_Samples_Leaf_Percent',
    'Min_Samples_Split_Percent',
    'Misclassification',
]

# Build the analysis table in one go from the collected rows.
results_df = pd.DataFrame(results_list, columns=result_columns)

# Preview the first 20 rows.
results_df.head(20)

Unnamed: 0,Simulation_ID,Classes_Dep_Var,Num_Ind_Vars,Num_Categorical_Vars,Classes_Ind_Vars,Sample_Size,Splitting_Criterion,Min_Samples_Leaf_Percent,Min_Samples_Split_Percent,Misclassification
0,S1,2,3,0,2,100,entropy,0.05,0.1,0.3
1,S1,2,3,0,2,100,entropy,0.05,0.2,0.3
2,S1,2,3,0,2,100,entropy,0.1,0.1,0.333333
3,S1,2,3,0,2,100,entropy,0.1,0.2,0.333333
4,S1,2,3,0,2,100,gini,0.05,0.1,0.366667
5,S1,2,3,0,2,100,gini,0.05,0.2,0.366667
6,S1,2,3,0,2,100,gini,0.1,0.1,0.266667
7,S1,2,3,0,2,100,gini,0.1,0.2,0.266667
8,S2,2,3,0,2,500,entropy,0.05,0.1,0.213333
9,S2,2,3,0,2,500,entropy,0.05,0.2,0.286667


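TODO: add the F-test splitting criterion. scikit-learn's `DecisionTreeClassifier` does not provide one among its `criterion` options, so this requires a custom implementation.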