In [1]:
import pandas as pd
# Load the model results and substitute the placeholder value 99 for missing
# entries in the Hyperparameter_2 and Hyperparameter_3 columns.
df = pd.read_csv("../dat/fixed_model_results.csv").fillna(
    {'Hyperparameter_2': 99, 'Hyperparameter_3': 99}
)

In [2]:
# Restrict to rows with Num_Categorical_Vars == 0, i.e. the runs that use
# continuous independent variables only (for now).
no_categorical = df["Num_Categorical_Vars"].eq(0)
df_cont = df[no_categorical]

In [3]:
# Show the filtered frame (rows with Num_Categorical_Vars == 0).
display(df_cont)

Unnamed: 0,Iteration,Classes_Dep_Var,Num_Ind_Vars,Num_Categorical_Vars,Classes_Ind_Vars,Sample_Size,Model,Hyperparameter_1,Hyperparameter_2,Hyperparameter_3,Misclassification,Accuracy,Precision,Recall,F1_Score
0,1,2,3,0,2,100,LogisticRegression,logit,99.00,99,0.133333,0.866667,0.866667,0.866667,0.866667
1,1,2,3,0,2,100,LogisticRegression,probit,99.00,99,0.133333,0.866667,0.866667,0.866667,0.866667
2,1,2,3,0,2,100,DecisionTree,entropy,0.05,0.1,0.266667,0.733333,0.737557,0.733333,0.732143
3,1,2,3,0,2,100,DecisionTree,entropy,0.05,0.2,0.266667,0.733333,0.737557,0.733333,0.732143
4,1,2,3,0,2,100,DecisionTree,entropy,0.10,0.1,0.266667,0.733333,0.737557,0.733333,0.732143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10483,1,4,7,0,2,10000,ANN,2,3.00,Adam,0.484333,0.515667,0.496836,0.515667,0.492897
10484,1,4,7,0,2,10000,ANN,2,3.00,RMSprop,0.585667,0.414333,0.327838,0.414333,0.344036
10485,1,4,7,0,2,10000,ANN,2,15.00,SGD,0.475000,0.525000,0.511672,0.525000,0.509610
10486,1,4,7,0,2,10000,ANN,2,15.00,Adam,0.502000,0.498000,0.489042,0.498000,0.464417


In [4]:
# Full specification key: the four scenario/model columns followed by the
# three hyperparameter columns.
groupby_list = [
    'Classes_Dep_Var', 'Num_Ind_Vars', 'Sample_Size', 'Model',
    'Hyperparameter_1', 'Hyperparameter_2', 'Hyperparameter_3',
]

# Step 1: mean misclassification of every individual specification.
df_specific_means = df_cont.groupby(groupby_list)['Misclassification'].mean()

# Step 2: within each classes x n_vars x sample size x model combination, find
# the index label of the specification with the lowest mean. No model is
# filtered out here, so every model present in df_cont is included.
idx = df_specific_means.groupby(groupby_list[:4]).idxmin()

# Step 3: keep only those best-performing specifications and show them.
results_cont = df_specific_means.loc[idx]
results_cont

Classes_Dep_Var  Num_Ind_Vars  Sample_Size  Model               Hyperparameter_1  Hyperparameter_2  Hyperparameter_3
2                3             100          ANN                 1                 3.00              Adam                0.100000
                                            DecisionTree        f_test            0.05              0.1                 0.233333
                                            LogisticRegression  logit             99.00             99                  0.133333
                                            RandomForest        f_test            0.05              0.1                 0.166667
                               500          ANN                 1                 15.00             RMSprop             0.180000
                                                                                                                          ...   
4                7             1000         RandomForest        gini              0.10              0.2      

In [5]:
# Average the selected misclassification means per number of dependent-variable
# classes and model, grouping on the corresponding index levels.
results_cont.groupby(level=['Classes_Dep_Var', 'Model']).mean()

Classes_Dep_Var  Model             
2                ANN                   0.201972
                 DecisionTree          0.268361
                 LogisticRegression    0.211472
                 RandomForest          0.241444
3                ANN                   0.343778
                 DecisionTree          0.425667
                 LogisticRegression    0.354028
                 RandomForest          0.437528
4                ANN                   0.471389
                 DecisionTree          0.541889
                 LogisticRegression    0.501528
                 RandomForest          0.575667
Name: Misclassification, dtype: float64

These results still differ from the ones in exp_FP_08.

In [26]:
group_cols = ['Classes_Dep_Var', 'Num_Ind_Vars', 'Sample_Size', 'Model']
hyper_cols = ['Hyperparameter_1', 'Hyperparameter_2', 'Hyperparameter_3']

grouped = df_cont.groupby(group_cols)
result_cont = []

for key, subset in grouped:
    # Hyperparameter values of the single row with the lowest
    # misclassification error inside this group.
    winner = subset.loc[subset['Misclassification'].idxmin(), hyper_cols]

    # Select every row of the group that matches all three of those values.
    same_spec = pd.Series(True, index=subset.index)
    for col in hyper_cols:
        same_spec = same_spec & (subset[col] == winner[col])

    # One record per group: the group key plus the mean error of the
    # matching rows.
    record = dict(zip(group_cols, key))
    record['Mean_Misclassification'] = subset.loc[same_spec, 'Misclassification'].mean()
    result_cont.append(record)

results_cont = pd.DataFrame(result_cont)

# Reshape to one row per scenario and one column per model, then shorten the
# long model names.
short_names = {
    'DecisionTree': 'DT',
    'LogisticRegression': 'LR',
    'RandomForest': 'RF'
}
table3 = results_cont.pivot_table(
    index=group_cols[:3],
    columns='Model',
    values='Mean_Misclassification'
).rename(columns=short_names)

In [27]:
# Write table3 to a CSV file under ../dat.
table3.to_csv('../dat/table3.csv')

### Table 4. Experimental results: misclassification errors for ANN, DT and LR with three independent variables, including categorical variables (V = 3).

In [16]:
# Rows with a non-zero number of categorical variables and exactly three
# independent variables (V = 3).
with_categorical = df["Num_Categorical_Vars"].ne(0)
three_ind_vars = df['Num_Ind_Vars'].eq(3)
df_3 = df[with_categorical & three_ind_vars]
df_3

Unnamed: 0,Iteration,Classes_Dep_Var,Num_Ind_Vars,Num_Categorical_Vars,Classes_Ind_Vars,Sample_Size,Model,Hyperparameter_1,Hyperparameter_2,Hyperparameter_3,Misclassification,Accuracy,Precision,Recall,F1_Score
152,1,2,3,1,2,100,LogisticRegression,logit,99.00,99,0.300000,0.700000,0.700893,0.700000,0.699666
153,1,2,3,1,2,100,LogisticRegression,probit,99.00,99,0.300000,0.700000,0.700893,0.700000,0.699666
154,1,2,3,1,2,100,DecisionTree,entropy,0.05,0.1,0.166667,0.833333,0.847222,0.833333,0.831650
155,1,2,3,1,2,100,DecisionTree,entropy,0.05,0.2,0.166667,0.833333,0.847222,0.833333,0.831650
156,1,2,3,1,2,100,DecisionTree,entropy,0.10,0.1,0.166667,0.833333,0.847222,0.833333,0.831650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8963,1,4,3,2,3,10000,ANN,2,3.00,Adam,0.347333,0.652667,0.676491,0.652667,0.657334
8964,1,4,3,2,3,10000,ANN,2,3.00,RMSprop,0.441667,0.558333,0.672888,0.558333,0.540151
8965,1,4,3,2,3,10000,ANN,2,15.00,SGD,0.343667,0.656333,0.686251,0.656333,0.664785
8966,1,4,3,2,3,10000,ANN,2,15.00,Adam,0.380000,0.620000,0.604730,0.620000,0.602921


In [17]:
group_cols = ['Classes_Dep_Var', 'Num_Categorical_Vars', 'Classes_Ind_Vars', 'Sample_Size', 'Model']
hyper_cols = ['Hyperparameter_1', 'Hyperparameter_2', 'Hyperparameter_3']

grouped = df_3.groupby(group_cols)
result_3 = []

for key, subset in grouped:
    # Hyperparameter values of the single row with the lowest
    # misclassification error inside this group.
    winner = subset.loc[subset['Misclassification'].idxmin(), hyper_cols]

    # Select every row of the group that matches all three of those values.
    same_spec = pd.Series(True, index=subset.index)
    for col in hyper_cols:
        same_spec = same_spec & (subset[col] == winner[col])

    # One record per group: the group key plus the mean error of the
    # matching rows.
    record = dict(zip(group_cols, key))
    record['Mean_Misclassification'] = subset.loc[same_spec, 'Misclassification'].mean()
    result_3.append(record)

results_3 = pd.DataFrame(result_3)

# Reshape to one row per scenario and one column per model, then shorten the
# long model names.
short_names = {
    'DecisionTree': 'DT',
    'LogisticRegression': 'LR',
    'RandomForest': 'RF'
}
table4 = results_3.pivot_table(
    index=group_cols[:4],
    columns='Model',
    values='Mean_Misclassification'
).rename(columns=short_names)

table4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Model,ANN,DecisionTree,LogisticRegression,RandomForest
Classes_Dep_Var,Num_Categorical_Vars,Classes_Ind_Vars,Sample_Size,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,1,2,100,0.2,0.166667,0.3,0.266667
2,1,2,500,0.24,0.293333,0.24,0.246667
2,1,2,1000,0.146667,0.18,0.16,0.166667
2,1,2,10000,0.181333,0.198667,0.183667,0.209333
2,1,3,100,0.133333,0.133333,0.166667,0.233333
2,1,3,500,0.126667,0.126667,0.146667,0.193333
2,1,3,1000,0.166667,0.196667,0.16,0.186667
2,1,3,10000,0.150333,0.167667,0.153,0.202333
2,2,2,100,0.133333,0.3,0.233333,0.066667
2,2,2,500,0.16,0.186667,0.166667,0.186667


In [18]:
# Write table4 to a CSV file under ../dat.
table4.to_csv('../dat/table4.csv')

### Table 5. Experimental results: misclassification errors for ANN, DT and LR with five independent variables, including categorical variables (V = 5).

In [19]:
# Rows with a non-zero number of categorical variables and exactly five
# independent variables (V = 5).
with_categorical = df["Num_Categorical_Vars"].ne(0)
five_ind_vars = df['Num_Ind_Vars'].eq(5)
df_5 = df[with_categorical & five_ind_vars]
df_5

Unnamed: 0,Iteration,Classes_Dep_Var,Num_Ind_Vars,Num_Categorical_Vars,Classes_Ind_Vars,Sample_Size,Model,Hyperparameter_1,Hyperparameter_2,Hyperparameter_3,Misclassification,Accuracy,Precision,Recall,F1_Score
912,1,2,5,1,2,100,LogisticRegression,logit,99.00,99,0.200000,0.800000,0.805430,0.800000,0.799107
913,1,2,5,1,2,100,LogisticRegression,probit,99.00,99,0.200000,0.800000,0.805430,0.800000,0.799107
914,1,2,5,1,2,100,DecisionTree,entropy,0.05,0.1,0.333333,0.666667,0.666667,0.666667,0.666667
915,1,2,5,1,2,100,DecisionTree,entropy,0.05,0.2,0.333333,0.666667,0.666667,0.666667,0.666667
916,1,2,5,1,2,100,DecisionTree,entropy,0.10,0.1,0.333333,0.666667,0.666667,0.666667,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10331,1,4,5,4,3,10000,ANN,2,3.00,Adam,0.344000,0.656000,0.689156,0.656000,0.660830
10332,1,4,5,4,3,10000,ANN,2,3.00,RMSprop,0.467333,0.532667,0.438000,0.532667,0.471731
10333,1,4,5,4,3,10000,ANN,2,15.00,SGD,0.340000,0.660000,0.646807,0.660000,0.644484
10334,1,4,5,4,3,10000,ANN,2,15.00,Adam,0.340000,0.660000,0.686880,0.660000,0.666508


In [20]:
group_cols = ['Classes_Dep_Var', 'Num_Categorical_Vars', 'Classes_Ind_Vars', 'Sample_Size', 'Model']
hyper_cols = ['Hyperparameter_1', 'Hyperparameter_2', 'Hyperparameter_3']

grouped = df_5.groupby(group_cols)
result_5 = []

for key, subset in grouped:
    # Hyperparameter values of the single row with the lowest
    # misclassification error inside this group.
    winner = subset.loc[subset['Misclassification'].idxmin(), hyper_cols]

    # Select every row of the group that matches all three of those values.
    same_spec = pd.Series(True, index=subset.index)
    for col in hyper_cols:
        same_spec = same_spec & (subset[col] == winner[col])

    # One record per group: the group key plus the mean error of the
    # matching rows.
    record = dict(zip(group_cols, key))
    record['Mean_Misclassification'] = subset.loc[same_spec, 'Misclassification'].mean()
    result_5.append(record)

results_5 = pd.DataFrame(result_5)

# Reshape to one row per scenario and one column per model, then shorten the
# long model names.
short_names = {
    'DecisionTree': 'DT',
    'LogisticRegression': 'LR',
    'RandomForest': 'RF'
}
table5 = results_5.pivot_table(
    index=group_cols[:4],
    columns='Model',
    values='Mean_Misclassification'
).rename(columns=short_names)

table5

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Model,ANN,DecisionTree,LogisticRegression,RandomForest
Classes_Dep_Var,Num_Categorical_Vars,Classes_Ind_Vars,Sample_Size,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,1,2,100,0.200000,0.266667,0.200000,0.166667
2,1,2,500,0.266667,0.293333,0.253333,0.246667
2,1,2,1000,0.186667,0.266667,0.183333,0.236667
2,1,2,10000,0.198333,0.239333,0.201000,0.253000
2,1,3,100,0.233333,0.266667,0.300000,0.300000
...,...,...,...,...,...,...,...
4,4,2,10000,0.364667,0.390333,0.393667,0.473000
4,4,3,100,0.433333,0.433333,0.300000,0.333333
4,4,3,500,0.300000,0.440000,0.353333,0.466667
4,4,3,1000,0.343333,0.430000,0.400000,0.470000


In [21]:
# Write table5 to a CSV file under ../dat.
table5.to_csv('../dat/table5.csv')

### Table 6. Experimental results: misclassification errors for ANN, DT and LR with seven independent variables, including categorical variables (V = 7).

In [22]:
# Rows with a non-zero number of categorical variables and exactly seven
# independent variables (V = 7).
with_categorical = df["Num_Categorical_Vars"].ne(0)
seven_ind_vars = df['Num_Ind_Vars'].eq(7)
df_7 = df[with_categorical & seven_ind_vars]
df_7

Unnamed: 0,Iteration,Classes_Dep_Var,Num_Ind_Vars,Num_Categorical_Vars,Classes_Ind_Vars,Sample_Size,Model,Hyperparameter_1,Hyperparameter_2,Hyperparameter_3,Misclassification,Accuracy,Precision,Recall,F1_Score
2280,1,2,7,1,2,100,LogisticRegression,logit,99.00,99,0.233333,0.766667,0.767857,0.766667,0.766407
2281,1,2,7,1,2,100,LogisticRegression,probit,99.00,99,0.233333,0.766667,0.767857,0.766667,0.766407
2282,1,2,7,1,2,100,DecisionTree,entropy,0.05,0.1,0.333333,0.666667,0.666667,0.666667,0.666667
2283,1,2,7,1,2,100,DecisionTree,entropy,0.05,0.2,0.333333,0.666667,0.666667,0.666667,0.666667
2284,1,2,7,1,2,100,DecisionTree,entropy,0.10,0.1,0.333333,0.666667,0.666667,0.666667,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12307,1,4,7,6,3,10000,ANN,2,3.00,Adam,0.360667,0.639333,0.623412,0.639333,0.621167
12308,1,4,7,6,3,10000,ANN,2,3.00,RMSprop,0.345667,0.654333,0.697476,0.654333,0.662281
12309,1,4,7,6,3,10000,ANN,2,15.00,SGD,0.330000,0.670000,0.685962,0.670000,0.672361
12310,1,4,7,6,3,10000,ANN,2,15.00,Adam,0.528333,0.471667,0.399749,0.471667,0.393844


In [23]:
# Group the V = 7 rows by scenario and model. The groupby must be bound to
# `grouped`, the name the loop below iterates over. It was previously assigned
# to a misspelled name, so the loop consumed whatever `grouped` an earlier
# cell had left in the kernel instead of the df_7 groups.
grouped = df_7.groupby(['Classes_Dep_Var', 'Num_Categorical_Vars', 'Classes_Ind_Vars', 'Sample_Size', 'Model'])
result_7 = []

for group, frame in grouped:
    # Take the hyperparameter values of the single row with the lowest
    # misclassification error in this group ...
    best_hyperparams = frame.loc[frame['Misclassification'].idxmin(), ['Hyperparameter_1', 'Hyperparameter_2', 'Hyperparameter_3']]
    # ... select every row of the group that has exactly those three values ...
    best_frame = frame[(frame['Hyperparameter_1'] == best_hyperparams['Hyperparameter_1']) &
                       (frame['Hyperparameter_2'] == best_hyperparams['Hyperparameter_2']) &
                       (frame['Hyperparameter_3'] == best_hyperparams['Hyperparameter_3'])]
    # ... and average the misclassification error over those rows.
    mean_misclassification = best_frame['Misclassification'].mean()

    # `group` is the tuple of group-key values, in the order given to groupby.
    result_7.append({
        'Classes_Dep_Var': group[0],
        'Num_Categorical_Vars': group[1],
        'Classes_Ind_Vars': group[2],
        'Sample_Size': group[3],
        'Model': group[4],
        'Mean_Misclassification': mean_misclassification
    })


# Build the frame once from the collected records.
results_7 = pd.DataFrame(result_7)

# Pivot table for the desired format: one row per scenario, one column per model
table6 = results_7.pivot_table(
    index=['Classes_Dep_Var', 'Num_Categorical_Vars', 'Classes_Ind_Vars', 'Sample_Size'],
    columns='Model',
    values='Mean_Misclassification'
)

# Shorten the long model names used as column labels.
table6 = table6.rename(columns={
    'DecisionTree': 'DT',
    'LogisticRegression': 'LR',
    'RandomForest': 'RF'
})

table6

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Model,ANN,DecisionTree,LogisticRegression,RandomForest
Classes_Dep_Var,Num_Categorical_Vars,Classes_Ind_Vars,Sample_Size,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,1,2,100,0.200000,0.333333,0.233333,0.300000
2,1,2,500,0.240000,0.320000,0.260000,0.293333
2,1,2,1000,0.210000,0.270000,0.213333,0.266667
2,1,2,10000,0.204667,0.254000,0.204333,0.252333
2,1,3,100,0.266667,0.400000,0.300000,0.233333
...,...,...,...,...,...,...,...
4,6,2,10000,0.371667,0.426667,0.402000,0.504000
4,6,3,100,0.466667,0.566667,0.366667,0.566667
4,6,3,500,0.353333,0.393333,0.393333,0.553333
4,6,3,1000,0.333333,0.393333,0.383333,0.483333


In [24]:
# Write table6 to a CSV file under ../dat.
table6.to_csv('../dat/table6.csv')