In [1]:
%cd ..

/Users/danieloliveiradebrito/Projetos/causal-inference


## 1.1. Foundation of Linear Regression

Example 1.1.1: approx a Smooth Function with a Polynomail Dictionary

In [2]:
import numpy as np
import altair as alt
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
def create_X(p: int, w: np.array) -> np.ndarray:

    X = w

    while p - 1 > 1:
        X = np.c_[X, (w ** (p - 1))] 
        p -= 1

    return sm.add_constant(X)

def fit_ols(X: np.array, g: np.array, w: np.array) -> pd.DataFrame:
    model = sm.OLS(g, X)
    res = model.fit()
    predicted = res.get_prediction().summary_frame()["mean"].values

    return pd.DataFrame(
        dict(
            w = w,
            y_true = g,
            y_pred = predicted
        )
    )

def plot_predictions(df: pd.DataFrame, p: int) -> alt.Chart:
    color_scale =  alt.Scale(
                    domain = ["y_true", "y_pred"],
                    range = ["blue", "orange"])


    return alt.Chart(
        df.melt(id_vars = "w"),
        title = f"p = {p}"
    ).mark_line().encode(x = alt.X("w", title = "w"),
                        y = alt.Y("value", title = "Values"),
                        color = alt.Color("variable", scale = color_scale, legend=None),
                        strokeDash = alt.condition(
                            alt.datum.variable == "y_pred",
                            alt.value([5, 5]),
                            alt.value([0, 0]),
                            legend=None),
                        ).configure_axis(grid = False)

In [4]:
w = np.arange(0, 1, .01)

g = np.exp(4 * w)

for p in [2, 3, 4]:
    X = create_X(p, w)

    chart = (plot_predictions(fit_ols(X, g, w), p))

    chart.display()

## 1.2. Statistical Properties

In [5]:
n = 100
p = int( n / 2)

y = np.random.normal(0, 1, size=[n, 1])
X = np.random.normal(0, scale=1, size=[n, p])

model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.551
Model:,OLS,Adj. R-squared (uncentered):,0.102
Method:,Least Squares,F-statistic:,1.227
Date:,"Wed, 06 Mar 2024",Prob (F-statistic):,0.236
Time:,14:31:34,Log-Likelihood:,-95.695
No. Observations:,100,AIC:,291.4
Df Residuals:,50,BIC:,421.6
Df Model:,50,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.1594,0.136,1.174,0.246,-0.113,0.432
x2,0.0451,0.152,0.297,0.768,-0.260,0.350
x3,0.0403,0.134,0.300,0.765,-0.230,0.310
x4,0.0834,0.122,0.682,0.498,-0.162,0.329
x5,0.0560,0.114,0.491,0.626,-0.173,0.285
x6,-0.1228,0.113,-1.088,0.282,-0.349,0.104
x7,0.3955,0.166,2.386,0.021,0.063,0.728
x8,0.2093,0.146,1.432,0.158,-0.084,0.503
x9,0.0855,0.133,0.642,0.524,-0.182,0.353

0,1,2,3
Omnibus:,0.263,Durbin-Watson:,1.707
Prob(Omnibus):,0.877,Jarque-Bera (JB):,0.438
Skew:,-0.066,Prob(JB):,0.803
Kurtosis:,2.704,Cond. No.,6.25


## 1.4. Application

### Prediction of Wages

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv"
df = pd.read_csv(file)

In [8]:
df.describe()

Unnamed: 0,wage,lwage,sex,shs,hsg,scl,clg,ad,mw,so,we,ne,exp1,exp2,exp3,exp4,occ,occ2,ind,ind2
count,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0,5150.0
mean,23.41041,2.970787,0.444466,0.023301,0.243883,0.278058,0.31767,0.137087,0.259612,0.296505,0.216117,0.227767,13.760583,3.018925,8.235867,25.118038,5310.737476,11.670874,6629.154951,13.316893
std,21.003016,0.570385,0.496955,0.150872,0.429465,0.448086,0.465616,0.343973,0.438464,0.456761,0.411635,0.419432,10.609465,4.000904,14.488962,53.530225,11874.35608,6.966684,5333.443992,5.701019
min,3.021978,1.105912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0,370.0,2.0
25%,13.461538,2.599837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.25,0.125,0.0625,1740.0,5.0,4880.0,9.0
50%,19.230769,2.956512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0,1.0,1.0,4040.0,13.0,7370.0,14.0
75%,27.777778,3.324236,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,21.0,4.41,9.261,19.4481,5610.0,17.0,8190.0,18.0
max,528.845673,6.270697,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,47.0,22.09,103.823,487.9681,100000.0,22.0,100000.0,22.0


In [9]:
df.shape

(5150, 20)

In [10]:
Y = np.log(df["wage"])
Z = df.drop(columns=["wage", "lwage"])
Z.shape

(5150, 18)

In [11]:
train, test = train_test_split(df, test_size = 0.20, random_state=123)

#### In sample

In [12]:
#" + ".join(Z.columns)

model_base = "lwage ~ sex + exp1 + shs + hsg + scl + clg + mw + so + we + C(occ2) + C(ind2)"
base = smf.ols(model_base, data=train)
results_base = base.fit()

rsquared_base = results_base.rsquared
rsquared_adj_base = results_base.rsquared_adj
mse_base = np.mean(results_base.resid ** 2)
mse_adj_base = results_base.mse_resid

print(f"R2 = {rsquared_base:.4f}")
print(f"R2 ajustado = {rsquared_adj_base:.4f}")
print(f"MSE = {mse_base:.4f}")
print(f"MSE ajustado = {mse_adj_base:.4f}")


R2 = 0.3176
R2 ajustado = 0.3092
MSE = 0.2202
MSE ajustado = 0.2229


In [13]:
#" + ".join(Z.columns)

flexible_model = "lwage ~ sex + shs + hsg + scl + C(occ2) + C(ind2) + mw + so + we  + (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)"
flex = smf.ols(flexible_model, data=train)
results_flex = flex.fit()

rsquared_flex = results_flex.rsquared
rsquared_adj_flex = results_flex.rsquared_adj
mse_flex = np.mean(results_flex.resid ** 2)
mse_adj_flex = results_flex.mse_resid

print(f"R2 = {rsquared_flex:.4f}")
print(f"R2 ajustado = {rsquared_adj_flex:.4f}")
print(f"MSE = {mse_flex:.4f}")
print(f"MSE ajustado = {mse_adj_flex:.4f}")


R2 = 0.3643
R2 ajustado = 0.3241
MSE = 0.2051
MSE ajustado = 0.2181


In [14]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [15]:
X = flex.data.exog[:, 1:]
y = flex.data.endog

lasso = Pipeline([
    ("scale", StandardScaler()),
    ("lasso", LassoCV())
])
lasso.fit(X, y)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [16]:
n, p = X.shape
p += 1
mse_lasso = np.mean((y - lasso.predict(X)) ** 2)
mse_adj_lasso = mse_lasso * n / (n - p)
rsquared_lasso = 1 - mse_lasso / np.var(y)
rsquared_adj_lasso = 1 - mse_adj_lasso / np.var(y)

print(f"R2 = {rsquared_lasso:.4f}")
print(f"R2 ajustado = {rsquared_adj_lasso:.4f}")
print(f"MSE = {mse_lasso:.4f}")
print(f"MSE ajustado = {mse_adj_lasso:.4f}")

R2 = 0.3309
R2 ajustado = 0.2885
MSE = 0.2159
MSE ajustado = 0.2296


In [17]:
res_df = pd.DataFrame(
    dict(
        model = ["Basic reg", "Flexible reg", "Flexible Lasso"],
        p = [
            results_base.params.shape[0],
            results_flex.params.shape[0],
            results_flex.params.shape[0]
        ],
        r2 = [
            rsquared_base,
            rsquared_flex,
            rsquared_lasso
        ],
        mse = [mse_base, mse_flex, mse_lasso],
        adj_r2 = [rsquared_adj_base, rsquared_adj_flex, rsquared_adj_lasso],
        adj_mse = [mse_adj_base, mse_adj_flex, mse_adj_lasso]
    )
)

res_df

Unnamed: 0,model,p,r2,mse,adj_r2,adj_mse
0,Basic reg,51,0.317622,0.220187,0.309237,0.222947
1,Flexible reg,246,0.364346,0.20511,0.324146,0.218135
2,Flexible Lasso,246,0.330946,0.215888,0.288461,0.229597


#### Out of sample

In [18]:
tmp = smf.ols(model_base, data=df)
X = tmp.data.exog
y = tmp.data.endog
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)

In [19]:
reg_basic = sm.OLS(y_train, X_train).fit()
yhat_reg_base = reg_basic.predict(X_test)

MSE_test1 = sum((y_test - yhat_reg_base) ** 2) / y_test.shape[0]
R2_test1 = 1 - MSE_test1 / np.var(y_test)

print(f"Test MSE for basic model = {MSE_test1:.2f}")
print(f"Test R2 for basic model = {R2_test1:.2f}")

Test MSE for basic model = 0.22
Test R2 for basic model = 0.30


In [20]:
tmp = smf.ols(flexible_model, data=df)
X = tmp.data.exog
y = tmp.data.endog
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)

reg_flex = sm.OLS(y_train, X_train).fit()
yhat_reg_flex = reg_flex.predict(X_test)

MSE_test2 = np.mean((y_test - yhat_reg_flex) ** 2)
R2_test2 = 1 - MSE_test2 / np.var(y_test)

print(f"Test MSE for basic model = {MSE_test2:.4f}")
print(f"Test R2 for basic model = {R2_test2:.4f}")

Test MSE for basic model = 0.2742
Test R2 for basic model = 0.1204


In [21]:
lasso = Pipeline([("scale", StandardScaler()), ("lasso", LassoCV())])
lasso.fit(X_train[:, 1:], y_train)

yhat_reg_lasso = lasso.predict(X_test[:, 1:])

MSE_test3 = np.mean((y_test - yhat_reg_lasso)**2)
R2_test3 = 1. - MSE_test3 / np.var(y_test)

print(f"Test MSE for the lasso model: {MSE_test3}")
print(f"Test R2 for the lasso model: {R2_test3}")

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Test MSE for the lasso model: 0.22594930005846384
Test R2 for the lasso model: 0.275292858650252


In [22]:
res_df = pd.DataFrame(
    dict(
        model = ["basic reg", "flexbiel reg", "flexible lasso"],
        mse_test = [MSE_test1, MSE_test2, MSE_test3],
        r2_test = [R2_test1, R2_test2, R2_test3]
    )
)

res_df

Unnamed: 0,model,mse_test,r2_test
0,basic reg,0.215955,0.295597
1,flexbiel reg,0.274243,0.120397
2,flexible lasso,0.225949,0.275293


### Wage Gab based on Sex

https://colab.research.google.com/github/CausalAIBook/MetricsMLNotebooks/blob/main/PM1/python-ols-and-lasso-for-wage-gap-inference.ipynb

In [23]:
file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv"
df = pd.read_csv(file)

In [31]:
# male: sex == 0
# female: sex == 1

df.groupby("sex").agg("mean")

Unnamed: 0_level_0,wage,lwage,shs,hsg,scl,clg,ad,mw,so,we,ne,exp1,exp2,exp3,exp4,occ,occ2,ind,ind2
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,24.019261,2.98783,0.031807,0.294303,0.273331,0.293953,0.106606,0.259,0.298148,0.220902,0.22195,13.783992,2.991466,8.100175,24.598909,6121.125481,12.700454,6061.956309,12.03705
1,22.649413,2.949485,0.012669,0.180865,0.283967,0.347313,0.175186,0.260376,0.294452,0.210135,0.235037,13.731324,3.053246,8.405467,25.766892,4297.840979,10.38401,7338.091306,14.916557


In [38]:
#female - male
df.query("sex==1")["lwage"].mean() - df.query("sex==0")["lwage"].mean()

-0.03834473367441449

In [40]:
smf.ols("lwage ~ sex", data=df).fit().summary()

0,1,2,3
Dep. Variable:,lwage,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,5.752
Date:,"Wed, 06 Mar 2024",Prob (F-statistic):,0.0165
Time:,14:49:12,Log-Likelihood:,-4412.7
No. Observations:,5150,AIC:,8829.0
Df Residuals:,5148,BIC:,8843.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9878,0.011,280.316,0.000,2.967,3.009
sex,-0.0383,0.016,-2.398,0.017,-0.070,-0.007

0,1,2,3
Omnibus:,236.523,Durbin-Watson:,1.733
Prob(Omnibus):,0.0,Jarque-Bera (JB):,406.245
Skew:,0.376,Prob(JB):,6.0899999999999995e-89
Kurtosis:,4.153,Cond. No.,2.51


In [55]:
(alt.Chart(df.sample(5_000),
          title="Log wage for male (sex=0) and female (sex=1)")
    .mark_boxplot()
    .encode(x="sex:N", y="lwage")
    .properties(width=200, height=200)
)

In [81]:
(alt.Chart(df.sample(5_000),
          title="Log wage for male (sex=0) and female (sex=1)")
    .transform_density("lwage", as_=["lwage", "density"], extent=[0, 5], groupby=["sex"])
    .mark_area(orient="horizontal")
    .encode(alt.X("density:Q").stack("center").impute(None).title(None).axis(labels=False, values=[0], grid=False, ticks=True),
            alt.Y("lwage:Q"),
            alt.Column("sex").header(titleOrient="bottom", labelOrient="bottom", labelPadding=0),
            alt.Color("sex:N"))
    .configure_view(stroke=None)
)

Flexible model

In [82]:
flex = "lwage ~ sex + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)"

control_fit = smf.ols(flex, data=df).fit()
control_est = control_fit.params["sex"]
control_se = control_fit.HC3_se["sex"]

control_fit.summary()

0,1,2,3
Dep. Variable:,lwage,R-squared:,0.351
Model:,OLS,Adj. R-squared:,0.319
Method:,Least Squares,F-statistic:,10.83
Date:,"Wed, 06 Mar 2024",Prob (F-statistic):,2.69e-305
Time:,15:05:00,Log-Likelihood:,-3301.9
No. Observations:,5150,AIC:,7096.0
Df Residuals:,4904,BIC:,8706.0
Df Model:,245,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.8603,0.429,9.006,0.000,3.020,4.701
C(occ2)[T.2],0.1613,0.130,1.244,0.214,-0.093,0.416
C(occ2)[T.3],0.2102,0.169,1.246,0.213,-0.121,0.541
C(occ2)[T.4],0.0709,0.184,0.386,0.700,-0.289,0.431
C(occ2)[T.5],-0.3960,0.189,-2.100,0.036,-0.766,-0.026
C(occ2)[T.6],-0.2311,0.187,-1.236,0.217,-0.598,0.135
C(occ2)[T.7],0.3147,0.194,1.621,0.105,-0.066,0.695
C(occ2)[T.8],-0.1875,0.169,-1.108,0.268,-0.519,0.144
C(occ2)[T.9],-0.3390,0.167,-2.027,0.043,-0.667,-0.011

0,1,2,3
Omnibus:,395.012,Durbin-Watson:,1.898
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1529.25
Skew:,0.303,Prob(JB):,0.0
Kurtosis:,5.6,Cond. No.,99900.0


In [86]:
print(f"The estimated sex coefficient is {control_est:.4f} with robust SE of {control_se:.4f}")

The estimated sex coefficient is -0.0696 with robust SE of 0.0157


In [100]:
flex_without_sex = "lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)"
lm0 = smf.ols(flex_without_sex, data=df.query("sex==0"))
lm1 = smf.ols(flex_without_sex, data=df.query("sex==1"))

XX0 = lm0.exog
y0 = lm0.endog
XX1 =  lm1.exog
y1 = lm1.endog

betarest = control_fit.params[control_fit.params.index!="sex"][1:]

print(f"Marginal gap = {y1.mean() - y0.mean()}")
#diff da media das fts (exclui intercept '[1:]')
diff_explained = betarest.dot(XX1.mean(0)[1:] - XX0.mean(0)[1:])
print(f"Explained difference: {diff_explained}")
print(f"Unexplained difference: {control_est}")
print(f"Sum of differences: {diff_explained + control_est}")

Marginal gap = -0.03834473367441449
Explained difference: 0.03120846962241225
Unexplained difference: -0.06955320329684149
Sum of differences: -0.03834473367442924


Frisch-Waugh-Lovell

In [101]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

flex_interact_sex = "lwage ~ sex*((exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we))"
lmi = smf.ols(flex_interact_sex, data=df)

vfi = np.array([variance_inflation_factor(lmi.exog, i) for i in range(1, lmi.exog.shape[1])])


  vif = 1. / (1. - r_squared_i)


In [121]:
# Concatena False para remover intercept
np.array(lmi.exog_names)[np.concatenate(((False, ), vfi>1e8))]

array(['C(occ2)[T.18]', 'sex:C(occ2)[T.18]', 'exp1:C(occ2)[T.18]',
       'exp2:C(occ2)[T.18]', 'exp3:C(occ2)[T.18]', 'exp4:C(occ2)[T.18]',
       'sex:exp1:C(occ2)[T.18]', 'sex:exp2:C(occ2)[T.18]', 'sex:exp3',
       'sex:exp3:C(occ2)[T.18]', 'sex:exp4:C(occ2)[T.18]'], dtype='<U22')

In [123]:
flex_y = "lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)"
flex_d = "sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)"

t_Y = smf.ols(flex_y, data=df).fit().resid
t_D = smf.ols(flex_d, data=df).fit().resid

partial_fit = sm.OLS(t_Y, t_D).fit()
partial_est = partial_fit.params["x1"]

print(f"Coefficient for D via partialling-out {partial_est:.4f}")

partial_se = partial_fit.HC3_se["x1"]
print(f"95% CI = {partial_fit.conf_int().values[0]}")

Coefficient for D via partialling-out -0.0696
95% CI = [-0.09866859 -0.04043781]


In [147]:
chart = alt.Chart(pd.DataFrame(
    dict(
        y_resid = t_Y,
        d_resid = t_D
    )
).sample(5_000),
title="Linear regression Y_resid ~ D_resid").mark_point(opacity=0.1).encode(y="y_resid", x="d_resid")

chart + chart.transform_regression("d_resid", "y_resid").mark_line()

## Study Questions

### 1. Sample splitting to evaluate the performance of predictions

The concept of sample splitting for evaluating prediction performance involves dividing the data into two sets: one for training the model and another for testing it. This allows us to gauge how well our trained model performs on unseen data, or out-of-sample data, providing an estimation of its generalization capability. This step is crucial in machine learning, especially for models with high flexibility, as they may inadvertently learn from noise patterns in the training data that do not generalize well to new data.

Consider a scenario where we have a linear relationship $Y_i = \beta X_i + \epsilon_i$, with $\epsilon \sim N(0, 1)$. In this case, training a highly flexible model could lead it to capture the noise from $\epsilon$ rather than the true underlying pattern. This model would have a perfect fit on the training data ($MSE = 0$), but a terrible performance on out-of-sample data.

In [196]:
true_beta = 2
x = np.arange(0, 5, step = .05)
n_samples = len(x)
random_noise = np.random.normal(0, 1, n_samples)
y = 5 + true_beta * x + random_noise


In [197]:
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor()
dt_reg.fit(x.reshape(-1, 1), y)

lm = sm.OLS(y, sm.add_constant(x.reshape(-1, 1))).fit()


df_pred = pd.DataFrame(dict(x=x, y=y, y_lm = lm.predict(sm.add_constant(x.reshape(-1, 1))), y_dt = dt_reg.predict(x.reshape(-1, 1))))

base = alt.Chart(df_pred, title=f"Compara OLS (valor estimado de {lm.params[1]:.4f}) com DT")


base.mark_point().encode(alt.X("x"), alt.Y("y")) + base.mark_line().encode(alt.X("x"), alt.Y("y_dt")) + base.mark_line(color="red", strokeDash=[2, 2]).encode(alt.X("x"), alt.Y("y_lm"))

### 2. Wage gap analysis, focusing on subset of college-educated workers

In [3]:
file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv"
df = pd.read_csv(file)
df_college = df.query("clg == 1")

In [11]:
flex_y = "lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+C(occ2)+C(ind2)+mw+so+we)"
flex_d = "sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+C(occ2)+C(ind2)+mw+so+we)"

t_Y = smf.ols(flex_y, data=df_college).fit().resid
t_D = smf.ols(flex_d, data=df_college).fit().resid

partial_fit = sm.OLS(t_Y, t_D).fit()
partial_est = partial_fit.params["x1"]

print(f"Coefficient for D via partialling-out {partial_est:.4f}")

partial_se = partial_fit.HC3_se["x1"]
print(f"95% CI = {partial_fit.conf_int().values[0]}")

Coefficient for D via partialling-out -0.0462
95% CI = [-0.09291928  0.00047468]


In [13]:
chart = alt.Chart(pd.DataFrame(
    dict(
        y_resid = t_Y,
        d_resid = t_D
    )
), title="Linear regression Y_resid ~ D_resid (college educated)").mark_point(opacity=.2).encode(x=alt.X("d_resid"), y=alt.Y("y_resid"))

#chart + chart.transform_regression("d_resid", "y_resid").mark_line()
chart + chart.transform_loess("d_resid", "y_resid").mark_line()

### 3. Linear regression with $n = 2$, $p = 1$ ($Y_i = \hat{\beta}X_i + \hat{\epsilon_i}$). Geometric interpretation of $\hat Y$

https://www.youtube.com/watch?v=PbyP3goun2Y