In [16]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

### Continuous Independent Variables

In [17]:
# Read the Table 3 results and append categorical copies of the three
# design factors under the short names used in the model formula.
table3 = pd.read_csv('../dat/table3.csv').assign(
    S=lambda frame: frame['Sample_Size'].astype('category'),
    CL_Y=lambda frame: frame['Classes_Dep_Var'].astype('category'),
    V=lambda frame: frame['Num_Ind_Vars'].astype('category'),
)

# Wide -> long: the ANN / DT / LR columns are stacked into a single 'Score'
# column, with the originating column name kept in 'Model', which is then
# stored as a categorical factor as well.
factor_columns = ['CL_Y', 'V', 'S']
score_columns = ['ANN', 'DT', 'LR']
table3_long = (
    table3
    .melt(id_vars=factor_columns,
          value_vars=score_columns,
          var_name='Model',
          value_name='Score')
    .assign(Model=lambda frame: frame['Model'].astype('category'))
)

In [18]:
# Model specification: every main effect plus each pairwise interaction among
# CL_Y, V, S and Model; no three-way or higher-order terms are included.
two_way_formula = 'Score ~ C(CL_Y) + C(V) + C(S) + C(Model) + \
             C(CL_Y):C(V) + C(CL_Y):C(S) + C(CL_Y):C(Model) + \
             C(V):C(S) + C(V):C(Model) + C(S):C(Model)'

# Ordinary least squares fit on the long-format Table 3 data
model = ols(formula=two_way_formula, data=table3_long).fit()

# ANOVA table using Type II sums of squares
anova_results = sm.stats.anova_lm(model, typ=2)

# Report the table rounded to four decimal places
print(anova_results.round(decimals=4))

                  sum_sq    df         F  PR(>F)
C(CL_Y)           1.3895   2.0  707.2997  0.0000
C(V)              0.0246   2.0   12.5360  0.0000
C(S)              0.0150   3.0    5.0789  0.0031
C(Model)          0.1052   2.0   53.5352  0.0000
C(CL_Y):C(V)      0.0023   4.0    0.5945  0.6678
C(CL_Y):C(S)      0.0385   6.0    6.5339  0.0000
C(CL_Y):C(Model)  0.0036   4.0    0.9088  0.4639
C(V):C(S)         0.0706   6.0   11.9852  0.0000
C(V):C(Model)     0.0009   4.0    0.2229  0.9247
C(S):C(Model)     0.0102   6.0    1.7384  0.1254
Residual          0.0668  68.0       NaN     NaN


### Categorical Independent Variables (V=3)

In [20]:
# Read the Table 4 results and append categorical copies of the four
# design factors under the short names used in the model formula.
table4 = pd.read_csv('../dat/table4.csv').assign(
    S=lambda frame: frame['Sample_Size'].astype('category'),
    CL_Y=lambda frame: frame['Classes_Dep_Var'].astype('category'),
    CA=lambda frame: frame['Num_Categorical_Vars'].astype('category'),
    CL_X=lambda frame: frame['Classes_Ind_Vars'].astype('category'),
)

# Wide -> long: the ANN / DT / LR columns are stacked into a single 'Score'
# column, with the originating column name kept in 'Model', which is then
# stored as a categorical factor as well.
factor_columns = ['CL_Y', 'CL_X', 'S', 'CA']
score_columns = ['ANN', 'DT', 'LR']
table4_long = (
    table4
    .melt(id_vars=factor_columns,
          value_vars=score_columns,
          var_name='Model',
          value_name='Score')
    .assign(Model=lambda frame: frame['Model'].astype('category'))
)

In [21]:
# Model specification: every main effect plus each pairwise interaction among
# CL_Y, CL_X, S, CA and Model; no three-way or higher-order terms are included.
two_way_formula = 'Score ~ C(CL_Y) + C(CL_X) + C(S) + C(CA) + C(Model) + \
             C(CL_Y):C(CL_X) + C(CL_Y):C(S) + C(CL_Y):C(CA) + C(CL_Y):C(Model) + \
             C(CL_X):C(S) + C(CL_X):C(CA) + C(CL_X):C(Model) + \
             C(S):C(CA) + C(S):C(Model) + C(CA):C(Model)'

# Ordinary least squares fit on the long-format Table 4 data
model = ols(formula=two_way_formula, data=table4_long).fit()

# ANOVA table using Type II sums of squares
anova_results = sm.stats.anova_lm(model, typ=2)

# Report the table rounded to four decimal places
print(anova_results.round(decimals=4))

                  sum_sq     df         F  PR(>F)
C(CL_Y)           1.2611    2.0  490.1805  0.0000
C(CL_X)           0.0646    1.0   50.1851  0.0000
C(S)              0.0089    3.0    2.3119  0.0805
C(CA)             0.0574    1.0   44.6413  0.0000
C(Model)          0.0413    2.0   16.0685  0.0000
C(CL_Y):C(CL_X)   0.0001    2.0    0.0242  0.9761
C(CL_Y):C(S)      0.0444    6.0    5.7508  0.0000
C(CL_Y):C(CA)     0.0144    2.0    5.5843  0.0050
C(CL_Y):C(Model)  0.0089    4.0    1.7365  0.1476
C(CL_X):C(S)      0.0082    3.0    2.1336  0.1005
C(CL_X):C(CA)     0.0001    1.0    0.1073  0.7439
C(CL_X):C(Model)  0.0025    2.0    0.9589  0.3867
C(S):C(CA)        0.0114    3.0    2.9506  0.0362
C(S):C(Model)     0.0080    6.0    1.0414  0.4031
C(CA):C(Model)    0.0013    2.0    0.4915  0.6132
Residual          0.1325  103.0       NaN     NaN


### Categorical Independent Variables (V=5)

In [23]:
# Read the Table 5 results and append categorical copies of the four
# design factors under the short names used in the model formula.
table5 = pd.read_csv('../dat/table5.csv').assign(
    S=lambda frame: frame['Sample_Size'].astype('category'),
    CL_Y=lambda frame: frame['Classes_Dep_Var'].astype('category'),
    CA=lambda frame: frame['Num_Categorical_Vars'].astype('category'),
    CL_X=lambda frame: frame['Classes_Ind_Vars'].astype('category'),
)

# Wide -> long: the ANN / DT / LR columns are stacked into a single 'Score'
# column, with the originating column name kept in 'Model', which is then
# stored as a categorical factor as well.
factor_columns = ['CL_Y', 'CL_X', 'S', 'CA']
score_columns = ['ANN', 'DT', 'LR']
table5_long = (
    table5
    .melt(id_vars=factor_columns,
          value_vars=score_columns,
          var_name='Model',
          value_name='Score')
    .assign(Model=lambda frame: frame['Model'].astype('category'))
)

In [24]:
# Model specification: every main effect plus each pairwise interaction among
# CL_Y, CL_X, S, CA and Model; no three-way or higher-order terms are included.
two_way_formula = 'Score ~ C(CL_Y) + C(CL_X) + C(S) + C(CA) + C(Model) + \
             C(CL_Y):C(CL_X) + C(CL_Y):C(S) + C(CL_Y):C(CA) + C(CL_Y):C(Model) + \
             C(CL_X):C(S) + C(CL_X):C(CA) + C(CL_X):C(Model) + \
             C(S):C(CA) + C(S):C(Model) + C(CA):C(Model)'

# Ordinary least squares fit on the long-format Table 5 data
model = ols(formula=two_way_formula, data=table5_long).fit()

# ANOVA table using Type II sums of squares
anova_results = sm.stats.anova_lm(model, typ=2)

# Report the table rounded to four decimal places
print(anova_results.round(decimals=4))

                  sum_sq     df         F  PR(>F)
C(CL_Y)           3.1311    2.0  894.5935  0.0000
C(CL_X)           0.0120    1.0    6.8544  0.0094
C(S)              0.0434    3.0    8.2701  0.0000
C(CA)             0.3472    3.0   66.1390  0.0000
C(Model)          0.3828    2.0  109.3633  0.0000
C(CL_Y):C(CL_X)   0.0301    2.0    8.5897  0.0003
C(CL_Y):C(S)      0.0696    6.0    6.6282  0.0000
C(CL_Y):C(CA)     0.0164    6.0    1.5624  0.1591
C(CL_Y):C(Model)  0.0029    4.0    0.4171  0.7962
C(CL_X):C(S)      0.0200    3.0    3.8010  0.0109
C(CL_X):C(CA)     0.0008    3.0    0.1598  0.9233
C(CL_X):C(Model)  0.0064    2.0    1.8253  0.1635
C(S):C(CA)        0.0560    9.0    3.5578  0.0004
C(S):C(Model)     0.0234    6.0    2.2295  0.0413
C(CA):C(Model)    0.0086    6.0    0.8222  0.5537
Residual          0.4007  229.0       NaN     NaN


### Categorical Independent Variables (V=7)

In [25]:
# Read the Table 6 results and append categorical copies of the four
# design factors under the short names used in the model formula.
table6 = pd.read_csv('../dat/table6.csv').assign(
    S=lambda frame: frame['Sample_Size'].astype('category'),
    CL_Y=lambda frame: frame['Classes_Dep_Var'].astype('category'),
    CA=lambda frame: frame['Num_Categorical_Vars'].astype('category'),
    CL_X=lambda frame: frame['Classes_Ind_Vars'].astype('category'),
)

# Wide -> long: the ANN / DT / LR columns are stacked into a single 'Score'
# column, with the originating column name kept in 'Model', which is then
# stored as a categorical factor as well.
factor_columns = ['CL_Y', 'CL_X', 'S', 'CA']
score_columns = ['ANN', 'DT', 'LR']
table6_long = (
    table6
    .melt(id_vars=factor_columns,
          value_vars=score_columns,
          var_name='Model',
          value_name='Score')
    .assign(Model=lambda frame: frame['Model'].astype('category'))
)

In [26]:
# Model specification: every main effect plus each pairwise interaction among
# CL_Y, CL_X, S, CA and Model; no three-way or higher-order terms are included.
two_way_formula = 'Score ~ C(CL_Y) + C(CL_X) + C(S) + C(CA) + C(Model) + \
             C(CL_Y):C(CL_X) + C(CL_Y):C(S) + C(CL_Y):C(CA) + C(CL_Y):C(Model) + \
             C(CL_X):C(S) + C(CL_X):C(CA) + C(CL_X):C(Model) + \
             C(S):C(CA) + C(S):C(Model) + C(CA):C(Model)'

# Ordinary least squares fit on the long-format Table 6 data
model = ols(formula=two_way_formula, data=table6_long).fit()

# ANOVA table using Type II sums of squares
anova_results = sm.stats.anova_lm(model, typ=2)

# Report the table rounded to four decimal places
print(anova_results.round(decimals=4))

                  sum_sq     df          F  PR(>F)
C(CL_Y)           5.3489    2.0  2225.7898  0.0000
C(CL_X)           0.0203    1.0    16.8792  0.0000
C(S)              0.2789    3.0    77.3577  0.0000
C(CA)             0.4480    5.0    74.5748  0.0000
C(Model)          0.6426    2.0   267.4185  0.0000
C(CL_Y):C(CL_X)   0.0071    2.0     2.9527  0.0535
C(CL_Y):C(S)      0.0269    6.0     3.7331  0.0013
C(CL_Y):C(CA)     0.0318   10.0     2.6432  0.0040
C(CL_Y):C(Model)  0.0186    4.0     3.8625  0.0044
C(CL_X):C(S)      0.0437    3.0    12.1111  0.0000
C(CL_X):C(CA)     0.0649    5.0    10.8057  0.0000
C(CL_X):C(Model)  0.0034    2.0     1.4190  0.2433
C(S):C(CA)        0.0382   15.0     2.1181  0.0088
C(S):C(Model)     0.0135    6.0     1.8676  0.0855
C(CA):C(Model)    0.0059   10.0     0.4869  0.8984
Residual          0.4266  355.0        NaN     NaN
