In [None]:
library(readxl)
library(ggplot2)
library(caret)
library(glmnet)

In [None]:
# Load the district-wise literacy workbook into `df`.
# The file location can be overridden with the LITERACY_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original absolute path is used unchanged.
data_path <- Sys.getenv(
  "LITERACY_DATA_PATH",
  unset = "C:/Users/Matias/OneDrive/Documentos/jupyter/CausalAI-Course/data/Districtwise_literacy_rates.xlsx"
)
df <- read_excel(data_path)

In [None]:
# Discard every row of `df` that has a missing value in any column.
df <- stats::na.omit(object = df)

In [None]:
# Side-by-side histograms of the FEMALE_LIT and MALE_LIT columns.
# par() returns the previous settings, which are restored at the end so the
# 1x2 layout does not leak into plots drawn later on the same device.
old_par <- par(mfrow = c(1, 2))
hist(df$FEMALE_LIT, breaks = 20, col = "pink", border = "black",
     main = "Distribución para mujeres", xlab = "Tasa (%)", ylab = "Frecuencia")
# Both panels carry the same y-axis label; without `ylab` the second panel
# would fall back to hist()'s default label.
hist(df$MALE_LIT, breaks = 20, col = "skyblue", border = "black",
     main = "Distribución para hombres", xlab = "Tasa (%)", ylab = "Frecuencia")
par(old_par)

In [None]:
# ---- Train/test split -------------------------------------------------------
# Seeded so the partition is reproducible; 70% of the rows go to `train`,
# the remaining rows to `test`.
set.seed(123)
trainIndex <- createDataPartition(y = df$FEMALE_LIT, p = 0.7, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]

# ---- Low-dimensional model, option 1 ----------------------------------------
# Linear regression of FEMALE_LIT on MALE_LIT, SEXRATIO, P_URB_POP and
# GROWTHRATE, fitted on the training rows only.
modelo_low_1 <- lm(
  formula = FEMALE_LIT ~ MALE_LIT + SEXRATIO + P_URB_POP + GROWTHRATE,
  data = train
)
summary(modelo_low_1)

# Fitted values on both partitions.
pred_train <- predict(object = modelo_low_1, newdata = train)
pred_test <- predict(object = modelo_low_1, newdata = test)

# In-sample vs. out-of-sample fit.
# NOTE(review): caret's R2() appears to default to the squared correlation
# (formula = "corr") rather than 1 - SSE/SST; confirm this is the intended
# metric for the test set.
r2_low1_train <- R2(pred = pred_train, obs = train$FEMALE_LIT)
r2_low1_test <- R2(pred = pred_test, obs = test$FEMALE_LIT)
cat("Modelo low (opción 1) R² train:", r2_low1_train, "\n")
cat("Modelo low (opción 1) R² test:", r2_low1_test, "\n")

In [None]:
# ---- Low-dimensional model, option 2 ----------------------------------------
# Linear regression of FEMALE_LIT on MALE_LIT, SCHTOT and P_SC_POP, fitted on
# the `train` partition created earlier in the notebook.
modelo_low_2 <- lm(
  formula = FEMALE_LIT ~ MALE_LIT + SCHTOT + P_SC_POP,
  data = train
)
summary(modelo_low_2)

# Fitted values on both partitions.
pred_train2 <- predict(object = modelo_low_2, newdata = train)
pred_test2 <- predict(object = modelo_low_2, newdata = test)

# In-sample vs. out-of-sample fit.
# NOTE(review): caret's R2() appears to default to the squared correlation
# (formula = "corr"); confirm this is the intended metric.
r2_low2_train <- R2(pred = pred_train2, obs = train$FEMALE_LIT)
r2_low2_test <- R2(pred = pred_test2, obs = test$FEMALE_LIT)
cat("Modelo low (opción 2) R² train:", r2_low2_train, "\n")
cat("Modelo low (opción 2) R² test:", r2_low2_test, "\n")

In [None]:
# ---- High-dimensional model -------------------------------------------------
# The `(...)^2` term expands to the five main effects plus all of their
# pairwise interactions; squared terms for MALE_LIT, SEXRATIO and P_URB_POP
# are added on top via I().
formula_high <- FEMALE_LIT ~
  (MALE_LIT + SEXRATIO + P_URB_POP + GROWTHRATE + TOTPOPULAT)^2 +
  I(MALE_LIT^2) + I(SEXRATIO^2) + I(P_URB_POP^2)

# Ordinary least squares on the training partition.
modelo_high <- lm(formula = formula_high, data = train)
summary(modelo_high)

# Fitted values on both partitions.
pred_train_high <- predict(object = modelo_high, newdata = train)
pred_test_high <- predict(object = modelo_high, newdata = test)

# In-sample vs. out-of-sample fit.
# NOTE(review): caret's R2() appears to default to the squared correlation
# (formula = "corr"); confirm this is the intended metric.
r2_high_train <- R2(pred = pred_train_high, obs = train$FEMALE_LIT)
r2_high_test <- R2(pred = pred_test_high, obs = test$FEMALE_LIT)
cat("Modelo high R² train:", r2_high_train, "\n")
cat("Modelo high R² test:", r2_high_test, "\n")

In [None]:
# ---- LASSO sparsity along a lambda grid -------------------------------------
# Response and design matrix built from the high-dimensional formula.
y <- train$FEMALE_LIT
X <- model.matrix(formula_high, data = train)[, -1]  # drop the intercept column

set.seed(123)
# 100 penalty values, log-spaced between 1e-3 and 1e4 (increasing order).
lambdas <- 10^seq(-3, 4, length.out = 100)

# Fit the whole regularisation path in one call instead of one cold-start fit
# per lambda: glmnet warm-starts each solution from the previous one, which is
# both faster and what the package documentation recommends.
# `maxit` in glmnet is a budget shared by all lambda values, so it is scaled by
# the grid size to keep the per-lambda allowance of the original loop.
# NOTE(review): thresh = 1e-2 is kept from the original but is much looser
# than glmnet's default convergence threshold; confirm it is intentional.
fit <- glmnet(X, y, alpha = 1, lambda = lambdas, standardize = TRUE,
              maxit = 20000 * length(lambdas), thresh = 1e-2)

# glmnet stores the path from the largest to the smallest lambda, and may
# return fewer solutions than requested if it stops early. Map the counts back
# to the positions of `lambdas`; entries without a fit stay NA.
# `fit$df` is the number of non-zero coefficients excluding the unpenalised
# intercept, so a fully shrunk model is reported as 0 rather than 1.
desc_order <- order(lambdas, decreasing = TRUE)
nonzero_counts <- rep(NA_real_, length(lambdas))
nonzero_counts[desc_order[seq_along(fit$df)]] <- fit$df

In [None]:
# Number of non-zero coefficients as a function of the penalty, with lambda
# drawn on a logarithmic x axis; points and connecting lines are both shown.
plot(
  x = lambdas,
  y = nonzero_counts,
  type = "b",
  pch = 19,
  log = "x",
  xlab = "Lambda (escala log)",
  ylab = "Número de coeficientes ≠ 0",
  main = "Trayectorias de LASSO"
)
# Overlay a reference grid on the finished plot.
grid()