In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [15]:
# Seed NumPy's global RNG so the np.random draws in this cell (department sizes,
# gender flags and acceptance flags) are reproducible from run to run.
np.random.seed(42069)


def get_department_df(depname, wrate, accrate, n):
    """Simulate the applicants of a single department.

    Parameters
    ----------
    depname : label stored in the ``department`` column of every row.
    wrate : probability that an applicant is a woman.
    accrate : probability that an applicant is accepted.
    n : number of applicants (rows) to generate.

    Returns
    -------
    pandas.DataFrame with ``n`` rows and the columns ``department``,
    ``is_woman`` and ``is_accepted``. Both flags are 0/1 draws from NumPy's
    global RNG; acceptance is drawn without reference to gender.
    """
    # Keep this draw order (gender first, then acceptance): it determines
    # which values each column receives under a fixed seed.
    woman_flags = np.random.binomial(1, wrate, size=n)
    accepted_flags = np.random.binomial(1, accrate, size=n)

    columns = {
        "department": depname,
        "is_woman": woman_flags,
        "is_accepted": accepted_flags,
    }
    return pd.DataFrame(columns)


# Stack the six simulated departments into one applicant table with a fresh
# 0..N-1 index. Each department maps to (share of women, acceptance rate).
# Its size is drawn from np.random.randint(120, 720) right before its
# applicants are generated, so the sequence of RNG calls stays fixed.
base_df = pd.concat(
    [
        get_department_df(dep_name, women_share, accept_rate, np.random.randint(120, 720))
        for dep_name, (women_share, accept_rate) in {
            "A": (0.2, 0.12),
            "B": (0.3, 0.03),
            "C": (0.45, 0.06),
            "D": (0.15, 0.2),
            "E": (0.1, 0.19),
            "F": (0.12, 0.21),
        }.items()
    ],
    ignore_index=True,
)

## Yay, no discrimination: with department dummies, `is_woman` has no significant effect on acceptance

In [20]:
# Linear probability model of acceptance on gender with one dummy per
# department. There is no separate intercept: the full set of department
# dummies takes its place.
sm.OLS(
    base_df["is_accepted"],
    pd.concat(
        [
            base_df.drop(columns=["is_accepted", "department"]),
            # dtype=int: pandas >= 2.0 returns boolean dummies by default, and
            # mixing them with the integer `is_woman` column produces an
            # object-dtype design matrix that statsmodels refuses to fit.
            pd.get_dummies(base_df["department"], prefix="dept", dtype=int),
        ],
        axis=1,
    ),
).fit().summary()

0,1,2,3
Dep. Variable:,is_accepted,R-squared:,0.039
Model:,OLS,Adj. R-squared:,0.036
Method:,Least Squares,F-statistic:,13.61
Date:,"Mon, 08 Feb 2021",Prob (F-statistic):,3.33e-15
Time:,19:14:33,Log-Likelihood:,-709.57
No. Observations:,2011,AIC:,1433.0
Df Residuals:,2004,BIC:,1472.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
is_woman,0.0016,0.019,0.083,0.934,-0.036,0.040
dept_A,0.1223,0.016,7.528,0.000,0.090,0.154
dept_B,0.0535,0.029,1.843,0.065,-0.003,0.110
dept_C,0.0498,0.019,2.639,0.008,0.013,0.087
dept_D,0.2214,0.018,12.346,0.000,0.186,0.257
dept_E,0.1999,0.018,11.178,0.000,0.165,0.235
dept_F,0.2174,0.025,8.715,0.000,0.168,0.266

0,1,2,3
Omnibus:,653.561,Durbin-Watson:,1.978
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1500.795
Skew:,1.894,Prob(JB):,0.0
Kurtosis:,4.886,Cond. No.,2.29


## :( discrimination: without department controls, `is_woman` has a significant negative effect on acceptance

In [21]:
# Pooled linear probability model: acceptance regressed on every remaining
# column of base_df (everything except the outcome and the department label)
# plus an explicit intercept column named "constant".
pooled_model = sm.OLS(
    endog=base_df["is_accepted"],
    exog=base_df.drop(columns=["is_accepted", "department"]).assign(constant=1),
)
pooled_model.fit().summary()

0,1,2,3
Dep. Variable:,is_accepted,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,6.985
Date:,"Mon, 08 Feb 2021",Prob (F-statistic):,0.00828
Time:,19:38:58,Log-Likelihood:,-746.26
No. Observations:,2011,AIC:,1497.0
Df Residuals:,2009,BIC:,1508.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
is_woman,-0.0492,0.019,-2.643,0.008,-0.086,-0.013
constant,0.1555,0.009,17.447,0.000,0.138,0.173

0,1,2,3
Omnibus:,705.604,Durbin-Watson:,1.913
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1724.37
Skew:,2.014,Prob(JB):,0.0
Kurtosis:,5.085,Cond. No.,2.53


### Task:

Add columns with generated data to `base_df`, together with causal explanations, in 4 different ways: 2 that convincingly argue that discrimination exists and 2 that argue that it doesn't, each in a different way.