# Multicollinearity


Multicollinearity occurs when predictor variables in a regression model are highly correlated, making it difficult to distinguish their individual effects. In regression analysis, multicollinearity complicates coefficient estimation and interpretation because it can obscure the unique contribution of each predictor variable.

## Examples:

- Temperature and humidity are correlated; using both to predict the probability of rain can lead to collinearity.
- Gender and income can be correlated, leading to collinearity in some regressions.


## Finding the inverse of a matrix with a linearly dependent column in R


In [1]:
set.seed(12)

# Build a 10 x 9 matrix of random integers drawn uniformly from 0..99.
# One vectorised draw, filled row by row, visits the cells in the same order
# as a row-major double loop that draws a single value per cell.
matriz <- matrix(
  sample(0:99, 10 * 9, replace = TRUE),
  nrow = 10, ncol = 9, byrow = TRUE
)

# Tenth column: the sum of columns 2, 6 and 8 (R is 1-indexed; these are
# columns 1, 5 and 7 in a 0-indexed language). Being an exact linear
# combination of existing columns, it makes the 10 x 10 matrix singular.
suma_columnas <- matriz[, 2] + matriz[, 6] + matriz[, 8]

# Append the sum column to the matrix.
matriz <- cbind(matriz, suma_columnas)

# solve() raises an error for a computationally singular matrix. Catch it so
# the failure is reported without aborting the rest of the script; `inversa`
# is NULL whenever no inverse could be computed.
inversa <- tryCatch(
  solve(matriz),
  error = function(e) {
    message("solve() failed: ", conditionMessage(e))
    NULL
  }
)
if (!is.null(inversa)) {
  print(inversa)
}

ERROR: Error in solve.default(matriz): sistema es computacionalmente singular: número de condición recíproco = 3.94281e-18


### Differences between Python, R and Julia

The main difference is that Python and Julia return a result, while R stops with the following error: "Error in solve.default(matriz): sistema es computacionalmente singular: número de condición recíproco = 3.94281e-18" (that is, the system is computationally singular; reciprocal condition number = 3.94281e-18).
One possible explanation is that Python and Julia return a pseudoinverse, whereas R only attempts to compute the ordinary inverse and refuses to do so when the matrix is numerically singular.


# Analyzing RCT data with Precision Adjustment

In [None]:
# Replication of the lab in R.

# Install the required packages only when they are missing, so that re-running
# this cell does not reinstall them every time.
required_pkgs <- c("lmtest", "sandwich", "hdm")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

# coeftest() (lmtest) and vcovHC() (sandwich) are called further down, so both
# packages have to be attached before the first model is tested.
library(lmtest)
library(sandwich)

# First bullet point ----
# NOTE(review): machine-specific absolute path; adjust it to the local copy of
# penn_jae.dat.
Penn <- read.table("C:/Users/Frank/Downloads/penn_jae.dat", header = TRUE)

# Keep only the control group (tg == 0) and treatment group 2 (tg == 2).
Penn <- subset(Penn, tg == 2 | tg == 0)

# Logical treatment indicator. It is computed from Penn directly instead of
# through attach(): an attached copy is not updated when Penn is modified
# later, which makes stale values easy to pick up by accident.
T2 <- Penn$tg == 2

# Second bullet point ----
# Side-by-side histograms of inuidur1 for the two groups; the previous
# graphics layout is restored afterwards.
old_par <- par(mfrow = c(1, 2))

# Histogram for the control group (tg == 0)
hist(
  Penn$inuidur1[Penn$tg == 0],
  main = "Histograma de inuidur1 (Control)",
  xlab = "Duración del Desempleo", col = "blue", border = "black"
)

# Histogram for treatment group 2 (tg == 2)
hist(
  Penn$inuidur1[Penn$tg == 2],
  main = "Histograma de inuidur1 (Tratamiento2)",
  xlab = "Duración del Desempleo", col = "red", border = "black"
)

par(old_par)

# Third bullet point ----
# Regress the treatment indicator on the covariates and all their pairwise
# interactions, and test the coefficients with HC1 robust standard errors.
m <- lm(
  T2 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 +
    agelt35 + agegt54 + durable + lusd + husd)^2,
  data = Penn
)
coeftest(m, vcov = vcovHC(m, type = "HC1"))

# Keep the design matrix of this regression on disk.
X <- model.matrix(m)
save(X, file = "C:/Users/Frank/Downloads/m_reg.RData")

# Classical two-sample approach, no adjustment (CL):
# log(inuidur1) on the treatment indicator, HC1 robust standard errors.
formula_cl <- log(inuidur1) ~ T2
ols.cl_reg <- lm(formula_cl, data = Penn)
ols.cl <- coeftest(ols.cl_reg, vcov = vcovHC(ols.cl_reg, type = "HC1"))
print(ols.cl)

# Classical linear regression adjustment (CRA):
# adds the covariates and all their pairwise interactions as controls.
formula_cra <- log(inuidur1) ~ T2 +
  (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 +
    agelt35 + agegt54 + durable + lusd + husd)^2
ols.cra_reg <- lm(formula_cra, data = Penn)
ols.cra <- coeftest(ols.cra_reg, vcov = vcovHC(ols.cra_reg, type = "HC1"))
print(ols.cra)

# Interactive regression adjustment (IRA):
# the treatment indicator is interacted with the demeaned controls.
# The intercept column is dropped before demeaning; otherwise it becomes an
# all-zero regressor whose coefficients cannot be estimated.
X <- model.matrix(
  ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 +
    agelt35 + agegt54 + durable + lusd + husd)^2,
  data = Penn
)[, -1]
demean <- function(x) {
  x - mean(x)
}
X <- apply(X, 2, demean)

ols.ira_reg <- lm(log(inuidur1) ~ T2 * X, data = Penn)
ols.ira <- coeftest(ols.ira_reg, vcov = vcovHC(ols.ira_reg, type = "HC1"))
print(ols.ira)

# Save the matrix of the interactive model. The original call saved an
# object named `A`, which is never defined in this script; `X` is the
# (demeaned) control matrix used by the IRA regression above.
save(X, file = "C:/Users/Frank/Downloads/ols_ira_reg.RData")

# Interactive regression adjustment using Lasso (IRA with Lasso)

# hdm provides rlassoEffects(); install it only if it is not available yet.
if (!requireNamespace("hdm", quietly = TRUE)) {
  install.packages("hdm")
}
library(hdm)

# Demeaned numeric treatment indicator (1 for tg == 2, 0 otherwise before
# centring).
Penn$T22 <- as.numeric(Penn$tg == 2)
Penn$T22 <- Penn$T22 - mean(Penn$T22)

# Control matrix (main effects only). The intercept column is dropped:
# keeping it would create the interaction column Penn$T22:X(Intercept),
# which is an exact copy of the treatment column being analysed.
X <- model.matrix(
  ~ female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 +
    agelt35 + agegt54 + durable + lusd + husd,
  data = Penn
)[, -1]

# DX holds T22, the controls X and the T22:X interactions, without an
# intercept. T22 is the first column, which is what `index = 1` refers to.
DX <- model.matrix(~ Penn$T22 * X - 1, data = Penn)

rlasso.ira <- summary(rlassoEffects(DX, log(Penn$inuidur1), index = 1))
print(rlasso.ira)
save(DX, file = "C:/Users/Frank/Downloads/rlasso_ira_reg.RData")

# Fourth bullet point ----
# Refit the interactive regression (IRA) and plot selected treatment-by-
# covariate interaction coefficients.

# Demeaned controls with all pairwise interactions. The intercept column is
# dropped before demeaning; otherwise it becomes an all-zero regressor.
X <- model.matrix(
  ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 +
    agelt35 + agegt54 + durable + lusd + husd)^2,
  data = Penn
)[, -1]
demean <- function(x) {
  x - mean(x)
}
X <- apply(X, 2, demean)

ols.ira_reg <- lm(log(inuidur1) ~ T2 * X, data = Penn)
ols.ira <- coeftest(ols.ira_reg, vcov = vcovHC(ols.ira_reg, type = "HC1"))
print(ols.ira)

coef_ira <- summary(ols.ira_reg)$coefficients
print(coef_ira)

# Keep only the coefficients of interest. T2 is a logical vector (tg == 2),
# so lm() labels its terms "T2TRUE"; looking up "T2:X..." would fail with a
# "subscript out of bounds" error.
interest_vars <- paste0(
  "T2TRUE:X", c("female", "black", "agelt35", "factor(dep)1")
)
bar_cols <- c("purple", "red", "pink", "brown")

# summary() leaves out coefficients that could not be estimated, so check
# which of the requested terms are actually present before indexing.
found <- interest_vars %in% rownames(coef_ira)
if (!all(found)) {
  warning(
    "Coefficients not available and therefore not plotted: ",
    paste(interest_vars[!found], collapse = ", "),
    call. = FALSE
  )
}
coef_interest <- coef_ira[interest_vars[found], , drop = FALSE]

# Bar plot of the selected coefficients.
if (any(found)) {
  barplot(
    coef_interest[, "Estimate"],
    main = "Coeficientes de IRA para Variables Interactuadas",
    ylab = "Coeficiente", col = bar_cols[found],
    names.arg = interest_vars[found]
  )
}

# A Crash Course in Good and Bad Controls

In [2]:
library(dagitty)
library(lmtest)

# Model 1 ----
# DAG: z causes both x and y, and x causes y.
sprinkler <- dagitty("dag {
    z -> y
    z -> x
    x -> y
}")
plot(sprinkler)

# Simulate n draws that follow the DAG (standard normal noise throughout).
set.seed(420)
n <- 1000
z <- rnorm(n)
x <- 0.42 * z + rnorm(n)
y <- x + 4.2 * z + rnorm(n)

data <- data.frame(z, x, y)

# Regress y on x without and with z as a control, then print both summaries.
no_control <- lm(y ~ x, data = data)
using_control <- lm(y ~ x + z, data = data)

invisible(lapply(
  list(no_control, using_control),
  function(fit) print(summary(fit))
))

# Real data the model can fit: if we study the relationship between police
# intervention (X) and crime rates (Y), economic conditions (Z) can be an
# important control variable. Wealthier neighborhoods may have lower crime
# rates and more resources for police, so controlling for economic conditions
# helps clarify the relationship between police intervention and crime rates.

# Model 8 ----
# DAG: x and z are independent causes of y.
sprinkler <- dagitty("dag {
    x -> y
    z -> y
}")
plot(sprinkler)

# Simulate n draws that follow the DAG. z must enter the equation for y,
# otherwise the simulated data contradict the `z -> y` arrow drawn above and
# controlling for z cannot change anything. The coefficient on z (1.3) is an
# illustrative choice.
set.seed(1313)
n <- 1000
x <- rnorm(n, mean = 0, sd = 1)
z <- rnorm(n, mean = 0, sd = 1)
y <- 0.7 * x + 1.3 * z + rnorm(n, mean = 0, sd = 1)

data <- data.frame(x = x, z = z, y = y)

# Regress y on x without and with z as a control.
no_control <- lm(y ~ x, data = data)
using_control <- lm(y ~ x + z, data = data)

print(summary(no_control))
print(summary(using_control))

# Real data the model can fit: for studies in the education sector, we can
# explore the relationship between class size (X) and academic performance
# (Y), including a neutral variable for socioeconomic status (Z). Including Z
# allows us to improve the model estimation and its precision. Because Z is
# simulated independently of X, the coefficient on X should stay close to its
# true value in both regressions; what Z absorbs is part of the unexplained
# variation in Y, which reflects that variables other than class size also
# explain academic performance.

# Model 11 ----
# DAG: x affects y only through z.
sprinkler <- dagitty("dag {
    x -> z
    z -> y
}")
plot(sprinkler)

# Simulate n draws that follow the DAG (standard normal noise throughout).
set.seed(6969)

n <- 1000
x <- rnorm(n)
z <- 0.69 * x + rnorm(n)
y <- 6.9 * z + rnorm(n)

data <- data.frame(z, x, y)

# Regress y on x without and with z as a control, then print both summaries.
no_control <- lm(y ~ x, data = data)
using_control <- lm(y ~ x + z, data = data)

invisible(lapply(
  list(no_control, using_control),
  function(fit) print(summary(fit))
))

# Real data the model can fit: when studying the effect of participation in
# youth intervention programs (X) on the rate of criminal recidivism (Y), we
# could consider family stability (Z) as a mediator between youth intervention
# and recidivism. Controlling for family stability would block the full effect
# of the youth intervention on recidivism, biasing our estimates.

# Model 12 ----
# DAG: x affects m, and m affects both z and y.
sprinkler <- dagitty("dag {
    x -> m
    m -> z
    m -> y
}")
plot(sprinkler)

# Simulate n draws that follow the DAG (standard normal noise throughout).
set.seed(666)

n <- 1000
x <- rnorm(n)
m <- 6 * x + rnorm(n)
z <- 0.6 * m + rnorm(n)
y <- 0.66 * m + rnorm(n)

data <- data.frame(m, z, x, y)

# Regress y on x without and with z as a control, then print both summaries.
no_control <- lm(y ~ x, data = data)
using_control <- lm(y ~ x + z, data = data)

invisible(lapply(
  list(no_control, using_control),
  function(fit) print(summary(fit))
))

# Real data the model can fit: in a study on the effect of job training (X) on
# the salary of workers (Y), we could consider the level of education of
# workers (Z) as a variable that is related to both job training and salary.
# Controlling for the level of education could bias our estimates, since it is
# equivalent to partially controlling for the mediating effect of the
# experience acquired during job training on workers' wages.

# Model 13 ----
# DAG: x affects y directly and through m; z is a second cause of m.
sprinkler <- dagitty("dag {
    x -> m
    z -> m
    m -> y
    x -> y
}")
plot(sprinkler)

# Simulate n draws that follow the DAG. z is drawn before m because it must
# enter the equation for m; leaving it out contradicts the `z -> m` arrow
# drawn above and makes z pure noise. The coefficient on z (2) is an
# illustrative choice.
set.seed(123)
n <- 1000
x <- rnorm(n, mean = 0, sd = 1)
z <- rnorm(n, mean = 0, sd = 1)
m <- 3 * x + 2 * z + rnorm(n, mean = 0, sd = 1)
y <- 0.4 * m + 0.6 * x + rnorm(n, mean = 0, sd = 1)

data <- data.frame(x = x, m = m, z = z, y = y)

# Regress y on x without and with z as a control.
no_control <- lm(y ~ x, data = data)
using_control <- lm(y ~ x + z, data = data)

print(summary(no_control))
print(summary(using_control))

# Real data the model can fit: this model can be applied in the health sector.
# For child nutrition, we define X as the time mothers breastfed their
# children, M as nutrition intake, Z as socioeconomic status and, finally, Y
# as child growth. Here X affects Y both directly and through M. When
# controlling for Z, the estimated effect of X should not really change;
# however, it does help the model to be more precise.


ERROR: Error in library(dagitty): there is no package called 'dagitty'
