In [17]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [18]:
# Fix NumPy's global RNG state so every random draw made after this cell is
# reproducible on a fresh Restart & Run All.
np.random.seed(seed=42)

In [19]:
# Draw one department label and one experience label for each of the 50
# synthetic employees (uniform sampling with replacement).
# The department draw must come first: both calls consume the same global
# RNG stream, so swapping them would change the generated data.
departments = np.random.choice(a=['Marketing', 'Logistics', 'Sales'], size=50)
experience_levels = np.random.choice(a=['Junior', 'Mid', 'Senior'], size=50)

In [20]:
# Simulate salaries as Gaussian noise around 50,000 (sd 5,000) plus additive
# department and experience effects.
# The noise length is taken from `departments` instead of repeating a literal
# sample size, so this cell cannot drift out of step with the label arrays.
base_salary = np.random.normal(loc=50000, scale=5000, size=departments.shape)

# Department effect: Marketing +2000, Sales -3000; any other label
# (here Logistics) gets 0 and acts as the reference level.
department_effect = (
    np.where(departments == 'Marketing', 2000, 0)
    + np.where(departments == 'Sales', -3000, 0)
)

# Experience effect: Senior +5000, Junior -5000; any other label
# (here Mid) gets 0 and acts as the reference level.
experience_effect = (
    np.where(experience_levels == 'Senior', 5000, 0)
    + np.where(experience_levels == 'Junior', -5000, 0)
)

# At most one term in each effect is non-zero per row, so grouping the
# integer adjustments this way yields the same floating-point totals as
# adding the four adjustments to the noise one at a time.
salary = base_salary + department_effect + experience_effect

In [21]:
# Assemble the simulated observations into a single frame, one row per
# employee: two label columns and the numeric response.
df = pd.DataFrame(
    data={'Department': departments, 'Experience': experience_levels, 'Salary': salary},
)

In [22]:
# Peek at the first five rows to sanity-check the columns and value ranges.
df.head(n=5)

Unnamed: 0,Department,Experience,Salary
0,Sales,Mid,49910.613974
1,Marketing,Junior,51438.742298
2,Sales,Mid,51471.661651
3,Sales,Junior,45774.988986
4,Marketing,Mid,50964.170549


In [23]:
# Group the rows by department; the groupby object is used for the
# per-department summary that follows.
df_department = df.groupby(by="Department")

In [24]:
# Mean salary per department (descriptive check before fitting the model).
df_department["Salary"].mean()

Department
Logistics    48198.185959
Marketing    54369.758458
Sales        46454.917331
Name: Salary, dtype: float64

In [25]:
# Two-way model: both categorical main effects plus their interaction,
# fitted by ordinary least squares on the simulated frame.
two_way_formula = 'Salary ~ C(Department) + C(Experience) + C(Department):C(Experience)'
model = ols(formula=two_way_formula, data=df).fit()

In [26]:
# Build the ANOVA table for the fitted two-way model; typ=2 selects the
# Type 2 variant of the sums-of-squares decomposition.
anova_table = sm.stats.anova_lm(model, typ=2)

In [27]:
# Display the table (sum_sq, df, F, PR(>F) per term). In the recorded output
# both main effects have p < 0.01 while the interaction term has p of about 0.41.
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Department),354460800.0,2.0,7.097703,0.002255
C(Experience),587032800.0,2.0,11.754713,9.2e-05
C(Department):C(Experience),101678100.0,4.0,1.017999,0.409391
Residual,1023774000.0,41.0,,
