# Assigment 2

In [5]:
# install.packages("tidyverse")

In [170]:
library('tidyverse')

# 1. Loading and processing the data

### 1. Load the data, either from the url or from the path provided

In [171]:
data <- read_csv('/Users/gabriel/Documents/GitHub/CausalAI-Course/data/wage2015_subsample_inference.csv')
data <- select(data, -rownames)
dim(data)

[1mRows: [22m[34m5150[39m [1mColumns: [22m[34m21[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[32mdbl[39m (21): rownames, wage, lwage, sex, shs, hsg, scl, clg, ad, mw, so, we, ne...

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


### 2. As in Group Assignment 1 2024 - 2 #1044 , generate the extra-flexible model. This means that it contains all two-way interactions between the experience polynomials and the indicator variables

In [172]:
# Convertir las variables occ2 e ind2 a factores
data$occ2 <- factor(data$occ2)
data$ind2 <- factor(data$ind2)

In [173]:
X_extra_flexible <- model.matrix(~ sex + (exp1 + exp2 + exp3 + exp4 + hsg + scl + clg + ad + so + we + ne + C(occ2) + C(ind2)) ^ 2, data = data)

#### 2.1. Generate the array for the outcome variable Y and normalize it

In [174]:
Y <- data$lwage
Y <- (Y-mean(Y))/ sd(Y)
Y <- as.vector(Y)
length(Y)

#### 2.2. Generate the array for the predictors X (do not generate an intercept) and normalize its colums.

In [175]:
library(dplyr)

In [176]:
X <- model.matrix(~ 0 + sex + (exp1 + exp2 + exp3 + exp4 +
                    hsg + scl + clg + ad + so + we + ne + C(occ2) + C(ind2))^2, data = data)
X <- as.data.frame(X)

vars_norm <- c("exp1", "exp2", "exp3", "exp4")

for (var in vars_norm) {
  X[[var]] <- (X[[var]] - mean(X[[var]])) / sd(X[[var]])
}
X <- as.matrix(X)
dim(X)

### 3. Split between training and testing samples. The testing sample should be 10% of the total.

In [177]:
data <- cbind(Y, X)

set.seed(123)

n <- nrow(data)
test_size <- floor(0.1 * n)

test_indices <- sample(seq_len(n), size = test_size)

train_sample <- data[-test_indices, ]
test_sample <- data[test_indices, ]

dim(train_sample)
dim(test_sample)

# 2. Creating the Lasso Cross-Validation Procedure

In [178]:
library(glmnet) 
library(caret)

### 4. Program a function that generates a logarithmically spaced grid. The input arguments should be the lower and upper bounds of the grid, as well as the natural logarithm of the spacing between each element of the grid. The output should be the logarithmically spaced grid, meaning that if we take the natural logarithm of each entry in the grid, they will be equally spaced. This will be the grid of values for λ values to try during cross-validation.

In [180]:
log_grid <- function(lower_bound, upper_bound, niter) {
  exp(seq(log(lower_bound), log(upper_bound), length.out = niter))
}

### 5. Program a function to generate k folds. It should take as input the array to be split rowwise and the number of folds desired. It should output a list of k 1d arrays of booleans; these arrays should all be the same length as the number of rows in the input array, and when they are all summed together they should add up to an array of all true values. Create your own procedure for splitting. You can aid yourself with third party packages like numpy in Python or Statistics in Julia, but do not use a pre-programmed third party splitting procedure like sk-learns's KFolds in Python.

In [181]:
generate_folds <- function(Y, k) {
  n <- length(Y)
  fold_indices <- sample(rep(1:k, length.out = n))
  folds <- lapply(1:k, function(i) {
    fold <- fold_indices == i
    return(fold)
  })
  return(folds)
}

### 6. Program a function that integrates those that you programmed in the last two items to find the value of λ that minimizes the testing mean square error across folds. It should take the following inputs:
- Y: an array for the outcome variable.
- X: an array of predictors.
- lambda_bounds: the lower and upper bounds of the grid of lambda values.
- k: number of folds

### The output should be a dictionary (a list in R) with the following entries:
- optimal_lambda: The lambda that minimizes the testing MSE across folds.
- optimal_coef: An array with the coefficients found for the optimal lambda
- all_lambdas: The grid of lambdas.
- all_mse: An array with the testing MSE across folds for each lambda.

### The procedure goes as follows: With each lambda in the grid -> For each split -> Train and test a lasso model.

With this, you can get the testing MSE associated with the value of lambda in each iteration. Feel free to use a third party Lasso estimator, as long as it is not one that calculates the optimal lambda value. Instead, the third party estimator you use should only estimate the regression with the penalty weight provided so you can get the testing MSE with that specific weight.

In [184]:
optimize_lambda <- function(Y, X, lambda_bounds, k) {
  lambdas <- log_grid(lambda_bounds[1], lambda_bounds[2], niter)
  folds <- generate_folds(Y, k)
  avg_mse <- numeric(length(lambdas))

  for (i in seq_along(lambdas)) {
    lambda <- lambdas[i]
    fold_pe <- numeric(k)

    for (j in seq_along(folds)) {
      fold <- folds[[j]]
      X_train <- X[!fold, ]
      Y_train <- Y[!fold]
      X_test <- X[fold, ]
      Y_test <- Y[fold]

      model <- glmnet(X_train, Y_train, alpha = 1, lambda = lambda)
      Y_pred <- predict(model, newx = X_test)
      fold_pe[j] <- sum((Y_test - Y_pred)^2)
    }

    avg_mse[i] <- mean(fold_pe)
  }

  optimal_lambda <- lambdas[which.min(avg_mse)]
  optimal_model <- glmnet(X, Y, alpha = 1, lambda = optimal_lambda)

  return(list(optimal_lambda = optimal_lambda,
              optimal_coef = coef(optimal_model),
              model = optimal_model,  # Guardar el modelo aquí
              all_lambdas = lambdas,
              all_mse = avg_mse))
}

### 7. Program a function for predicting the outcome variable through model estimated with the optimal lambda. It should take as inputs
- optimal_model: A dictionary with the values outputed by the function defined for the previous point.
- X: an array of predictors.

### The output should be an array of predicted values.

In [185]:
predict_outcome <- function(optimal_model, X) {
  
  lambda_min <- optimal_model$optimal_lambda
  predictions <- predict(optimal_model$model, newx = X, s = lambda_min)
  
  return(predictions)
}

Probando las funciones

In [186]:
k <- 5
lambda_bounds <- c(0.001, 100)
niter = 10
dim(X)
length(Y)
Model_opt <- optimize_lambda(Y, X, lambda_bounds, k)

In [187]:
Model_opt
dim(X)

$optimal_lambda
[1] 0.0129155

$optimal_coef
981 x 1 sparse Matrix of class "dgCMatrix"
                               s0
(Intercept)         -1.927849e-01
sex                 -9.563414e-02
exp1                 1.436376e-01
exp2                 .           
exp3                 .           
exp4                 .           
hsg                 -8.261459e-02
scl                  .           
clg                  3.786961e-01
ad                   5.942474e-01
so                   .           
we                   .           
ne                   .           
C(occ2)1             4.351887e-01
C(occ2)2             3.114924e-01
C(occ2)3             3.683006e-01
C(occ2)4             6.617894e-02
C(occ2)5             .           
C(occ2)6            -1.545589e-02
C(occ2)7             .           
C(occ2)8            -1.004318e-01
C(occ2)9             .           
C(occ2)10            1.757558e-01
C(occ2)11           -3.042777e-01
C(occ2)12            .           
C(occ2)13            .      

In [188]:
Predict_lambdaopt <- predict_outcome(Model_opt, X)
head(Predict_lambdaopt)

Unnamed: 0,s1
1,-0.38411354
2,0.92224224
3,-0.19706873
4,1.30077109
5,0.3109969
6,0.08060316


# 3. Applying the Lasso Cross-Validation Procedure

### 8. Fit a simple OLS model with the training sample.

In [164]:
data <- as.data.frame(data)
data

Unnamed: 0_level_0,Y,sex,exp1,exp2,exp3,exp4,hsg,scl,clg,ad,...,C(occ2)13:C(ind2)22,C(occ2)14:C(ind2)22,C(occ2)15:C(ind2)22,C(occ2)16:C(ind2)22,C(occ2)17:C(ind2)22,C(occ2)18:C(ind2)22,C(occ2)19:C(ind2)22,C(occ2)20:C(ind2)22,C(occ2)21:C(ind2)22,C(occ2)22:C(ind2)22
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,-1.24025455,1,-0.6372218,-0.6320884,-0.54475036,-0.46474562,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1.58141594,0,1.6249092,1.6473962,1.48769339,1.25600186,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.99522357,0,0.3995882,0.0552562,-0.16591024,-0.27312491,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.58882841,1,1.0593765,0.8075861,0.50998359,0.26049699,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0.68583521,1,0.7766101,0.4551658,0.16648072,-0.03161649,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,-0.89162874,1,-1.2027546,-0.7520613,-0.56835453,-0.46922907,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,-0.02502721,1,2.6617193,3.6544424,4.54498628,5.34373921,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,-0.02502721,0,2.1904420,2.6671656,2.92754810,3.03189576,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,-0.85184610,1,1.6249092,1.6473962,1.48769339,1.25600186,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,-0.02502721,1,-0.9199882,-0.7145697,-0.56400639,-0.46875270,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [167]:
str(train_sample)
train_sample <- as.data.frame(train_sample)

 num [1:4635, 1:981] -1.24 1.581 -0.995 -0.589 0.686 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:4635] "1" "2" "3" "4" ...
  ..$ : chr [1:981] "Y" "sex" "exp1" "exp2" ...


In [168]:
X_extra_flex <- paste("sex + (exp1 + exp2 + exp3 + exp4 + hsg + scl + clg + ad + so + we + ne + C(occ2) + C(ind2)) ^ 2")
formula_extra_flex <- as.formula(paste("Y", "~", X_extra_flex))
model_X_extra_flex_train <- model.matrix(formula_extra_flex, train_sample)
model_X_extra_flex_test <- model.matrix(formula_extra_flex, test_sample)
p_extra_flex <- dim(model_X_extra_flex_train)[2]


ERROR: Error in levels(x): objeto 'occ2' no encontrado


In [None]:
fit_lm_extraflexible <- lm(, data_train)
# Compute the Out-Of-Sample Performance
yhat_lm_basic <- predict(fit_lm_basic, newdata = data_test)
cat("Basic model MSE (OLS): ", mean((y_test - yhat_lm_basic)^2)) # MSE OLS (basic model)