In [16]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

### Continuous Independent Variables

In [17]:
# Read the Table 3 results and attach categorical copies of the three design
# columns under the short names (S, CL_Y, V) that the model formula refers to.
table3 = pd.read_csv('../dat/table3.csv').assign(
    S=lambda frame: frame['Sample_Size'].astype('category'),
    CL_Y=lambda frame: frame['Classes_Dep_Var'].astype('category'),
    V=lambda frame: frame['Num_Ind_Vars'].astype('category'),
)

# Go from one column per model (ANN / DT / LR) to one row per
# (factor combination, model) pair, with the model label stored in 'Model'
# as a categorical and the corresponding value in 'Score'.
table3_long = pd.melt(
    table3,
    id_vars=['CL_Y', 'V', 'S'],
    value_vars=['ANN', 'DT', 'LR'],
    var_name='Model',
    value_name='Score',
).astype({'Model': 'category'})

In [18]:
# Ordinary least squares fit of Score on the four factors: every main effect
# plus every pairwise interaction (no three-way or higher-order terms).
model = ols(
    formula='Score ~ C(CL_Y) + C(V) + C(S) + C(Model) + \
             C(CL_Y):C(V) + C(CL_Y):C(S) + C(CL_Y):C(Model) + \
             C(V):C(S) + C(V):C(Model) + C(S):C(Model)',
    data=table3_long,
).fit()

# Type II ANOVA table for the fitted model
anova_results = sm.stats.anova_lm(model, typ=2)

# Show the table rounded to four decimal places
print(anova_results.round(decimals=4))

                  sum_sq    df         F  PR(>F)
C(CL_Y)           1.3895   2.0  707.2997  0.0000
C(V)              0.0246   2.0   12.5360  0.0000
C(S)              0.0150   3.0    5.0789  0.0031
C(Model)          0.1052   2.0   53.5352  0.0000
C(CL_Y):C(V)      0.0023   4.0    0.5945  0.6678
C(CL_Y):C(S)      0.0385   6.0    6.5339  0.0000
C(CL_Y):C(Model)  0.0036   4.0    0.9088  0.4639
C(V):C(S)         0.0706   6.0   11.9852  0.0000
C(V):C(Model)     0.0009   4.0    0.2229  0.9247
C(S):C(Model)     0.0102   6.0    1.7384  0.1254
Residual          0.0668  68.0       NaN     NaN


### Categorical Independent Variables (V=3)

In [20]:
# Read the Table 4 results and attach categorical copies of the four design
# columns under the short names (S, CL_Y, CA, CL_X) that the model formula
# refers to.
table4 = pd.read_csv('../dat/table4.csv').assign(
    S=lambda frame: frame['Sample_Size'].astype('category'),
    CL_Y=lambda frame: frame['Classes_Dep_Var'].astype('category'),
    CA=lambda frame: frame['Num_Categorical_Vars'].astype('category'),
    CL_X=lambda frame: frame['Classes_Ind_Vars'].astype('category'),
)

# Go from one column per model (ANN / DT / LR) to one row per
# (factor combination, model) pair, with the model label stored in 'Model'
# as a categorical and the corresponding value in 'Score'.
table4_long = pd.melt(
    table4,
    id_vars=['CL_Y', 'CL_X', 'S', 'CA'],
    value_vars=['ANN', 'DT', 'LR'],
    var_name='Model',
    value_name='Score',
).astype({'Model': 'category'})

In [21]:
# Ordinary least squares fit of Score on the five factors: every main effect
# plus every pairwise interaction (no three-way or higher-order terms).
model = ols(
    formula='Score ~ C(CL_Y) + C(CL_X) + C(S) + C(CA) + C(Model) + \
             C(CL_Y):C(CL_X) + C(CL_Y):C(S) + C(CL_Y):C(CA) + C(CL_Y):C(Model) + \
             C(CL_X):C(S) + C(CL_X):C(CA) + C(CL_X):C(Model) + \
             C(S):C(CA) + C(S):C(Model) + C(CA):C(Model)',
    data=table4_long,
).fit()

# Type II ANOVA table for the fitted model
anova_results = sm.stats.anova_lm(model, typ=2)

# Show the table rounded to four decimal places
print(anova_results.round(decimals=4))

                  sum_sq     df         F  PR(>F)
C(CL_Y)           1.2611    2.0  490.1805  0.0000
C(CL_X)           0.0646    1.0   50.1851  0.0000
C(S)              0.0089    3.0    2.3119  0.0805
C(CA)             0.0574    1.0   44.6413  0.0000
C(Model)          0.0413    2.0   16.0685  0.0000
C(CL_Y):C(CL_X)   0.0001    2.0    0.0242  0.9761
C(CL_Y):C(S)      0.0444    6.0    5.7508  0.0000
C(CL_Y):C(CA)     0.0144    2.0    5.5843  0.0050
C(CL_Y):C(Model)  0.0089    4.0    1.7365  0.1476
C(CL_X):C(S)      0.0082    3.0    2.1336  0.1005
C(CL_X):C(CA)     0.0001    1.0    0.1073  0.7439
C(CL_X):C(Model)  0.0025    2.0    0.9589  0.3867
C(S):C(CA)        0.0114    3.0    2.9506  0.0362
C(S):C(Model)     0.0080    6.0    1.0414  0.4031
C(CA):C(Model)    0.0013    2.0    0.4915  0.6132
Residual          0.1325  103.0       NaN     NaN
