In [1]:
data <- read.csv("https://raw.githubusercontent.com/d2cml-ai/CausalAI-Course/main/data/wage2015_subsample_inference.csv")

7. In some situations it can be useful to generate arrays (matrices) in R. However, for regressions, it is standard to use formulas. We will still be generating collections and matrices just to get used to doing so in R

In [2]:
y <- data$lwage

8.
8.1

In [3]:
X_basic <- model.matrix(~ sex + hsg + scl + clg + ad + so + we + ne + exp1 + C(occ2) + C(ind2), data = data)

8.2

In [4]:
X_flexible <- model.matrix(~ sex + (exp1 + exp2 + exp3 + exp4) * (hsg + scl + clg + ad + so + we + ne + C(occ2) + C(ind2)), data = data)

8.3

In [5]:
X_extra_flexible <- model.matrix(~ sex + (exp1 + exp2 + exp3 + exp4 + hsg + scl + clg + ad + so + we + ne + C(occ2) + C(ind2)) ^ 2, data = data)

9.

In [6]:
train_sample <- runif(length(y)) < 0.8

In [7]:
test_sample <- !train_sample

10.

In [8]:
basic_model <- lm(lwage ~ sex + hsg + scl + clg + ad + so + we + ne + exp1 + C(occ2) + C(ind2), data = data, subset = train_sample)

In [9]:
flexible_model <- lm(lwage ~ sex + (exp1 + exp2 + exp3 + exp4) * (hsg + scl + clg + ad + so + we + ne + C(occ2) + C(ind2)), data = data, subset = train_sample)

In [10]:
extra_flexible_model <- lm(lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + hsg + scl + clg + ad + so + we + ne + C(occ2) + C(ind2)) ^ 2, data = data, subset = train_sample)

11.

In [22]:
mse_train_basic <- mean(basic_model$residuals ^ 2)
r2_train_basic <- 1 - mse_train_basic / var(y[train_sample])
adjr2_train_basic <- 1 - length(y) / (length(y) - ncol(X_basic)) * mse_train_basic / var(y)
mse_test_basic <- mean((predict(basic_model, subset(data, subset = test_sample)) - y[test_sample]) ^ 2)
r2_test_basic <- 1 - mse_test_basic / var(y[test_sample])

print(paste("Training MSE for basic model:", mse_train_basic))
print(paste("Training R2 for basic model:", r2_train_basic))
print(paste("Adjusted training R2 for basic model:", adjr2_train_basic))
print(paste("Testing MSE for basic model:", mse_test_basic))
print(paste("Testing R2 for basic model:", r2_test_basic))

"contrasts dropped from factor C(occ2)"
"contrasts dropped from factor C(ind2)"


[1] "Training MSE for basic model: 0.228388454971171"
[1] "Training R2 for basic model: 0.297669962190106"
[1] "Adjusted training R2 for basic model: 0.290976708113303"
[1] "Training MSE for basic model: 0.211464023825894"
[1] "Training R2 for basic model: 0.351842534553317"


In [23]:
mse_train_flexible <- mean(flexible_model$residuals ^ 2)
r2_train_flexible <- 1 - mse_train_flexible / var(y)
adjr2_train_flexible <- 1 - length(y) / (length(y) - ncol(X_flexible)) * mse_train_flexible / var(y)
mse_test_flexible <- mean((predict(flexible_model, subset(data, subset = test_sample)) - y[test_sample]) ^ 2)
r2_test_flexible <- 1 - mse_test_flexible / var(y[test_sample])

print(paste("Training MSE for flexible model:", mse_train_flexible))
print(paste("Training R2 for flexible model:", r2_train_flexible))
print(paste("Adjusted training R2 for flexible model:", adjr2_train_flexible))
print(paste("Testing MSE for flexible model:", mse_test_flexible))
print(paste("Testing R2 for flexible model:", r2_test_flexible))

"contrasts dropped from factor C(occ2)"
"contrasts dropped from factor C(ind2)"


[1] "Training MSE for flexible model: 0.21151321437185"
[1] "Training R2 for flexible model: 0.349867848727106"
[1] "Adjusted training R2 for flexible model: 0.317255183716272"
[1] "Testing MSE for flexible model: 0.225423023234478"
[1] "Testing R2 for flexible model: 0.309056865799144"


In [24]:
mse_train_extra_flexible <- mean(extra_flexible_model$residuals ^ 2)
r2_train_extra_flexible <- 1 - mse_train_extra_flexible / var(y)
adjr2_train_extra_flexible <- 1 - length(y) / (length(y) - ncol(X_extra_flexible)) * mse_train_extra_flexible / var(y)
mse_test_extra_flexible <- mean((predict(extra_flexible_model, subset(data, subset = test_sample)) - y[test_sample]) ^ 2)
r2_test_extra_flexible <- 1 - mse_test_extra_flexible / var(y[test_sample])

print(paste("Training MSE for flexible model:", mse_train_extra_flexible))
print(paste("Training R2 for flexible model:", r2_train_extra_flexible))
print(paste("Adjusted training R2 for flexible model:", adjr2_train_extra_flexible))
print(paste("Testing MSE for flexible model:", mse_test_extra_flexible))
print(paste("Testing R2 for flexible model:", r2_test_extra_flexible))

"contrasts dropped from factor C(occ2)"
"contrasts dropped from factor C(ind2)"
"prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases"


[1] "Training MSE for flexible model: 0.172215806128951"
[1] "Training R2 for flexible model: 0.470657032685561"
[1] "Adjusted training R2 for flexible model: 0.34625508832869"
[1] "Testing MSE for flexible model: 0.291440620841393"
[1] "Testing R2 for flexible model: 0.10670661271303"
