In [None]:
# haven provides read_dta(), used below to import the Stata .dta files
library(haven)
# A positive scipen penalises scientific notation, so printed numbers
# favour fixed notation
options(scipen = 5)

In [None]:
# Kenyan labor-supply data collected in 2020, during Covid
kenya_path <- "kenya_labor_covid.dta"
kenya <- read_dta(kenya_path)
# Preview the first rows
head(kenya)

In [None]:
# OLS of the employment indicator on individual and household characteristics
reg1 <- lm(
  employed ~ age + gender + ishead + marital +
    current_num_adults + current_num_5_17 + current_num_0_4,
  data = kenya
)
summary(reg1)

In [None]:
# How to interpret the intercept?
# How to interpret the coefficient on gender (dummy where female=1)?

In [None]:
# Predicted value for 30 year old woman with some characteristics
# The covariate values are listed in the same order as the coefficients of
# reg1. The names only label the positions; the product below is positional.
beta_hat <- coef(reg1)
profile_30 <- c(
  intercept = 1, age = 30, gender = 1, ishead = 0, marital = 1,
  current_num_adults = 4, current_num_5_17 = 5, current_num_0_4 = 0
)
# Fail with a clear message if the model and the profile no longer line up
stopifnot(length(profile_30) == length(beta_hat))
predempl <- profile_30 %*% beta_hat
as.numeric(round(predempl, 3))
# And for a 60 year old with otherwise identical characteristics
profile_60 <- replace(profile_30, "age", 60)
predempl2 <- profile_60 %*% beta_hat
as.numeric(round(predempl2, 3))

In [None]:
# Any issue here?
# Not specific to this particular model
# Back to lecture

In [None]:
# Logit model: binary outcome (employed) explained by the gender dummy only
reg2 <- glm(
  employed ~ gender,
  data = kenya,
  family = "binomial"
)
summary(reg2)

In [None]:
# How to interpret the coefficient for gender?
# Share employed within each gender group (gender == 1 is female)
emp_rate_f <- mean(kenya$employed[kenya$gender == 1])
emp_rate_m <- mean(kenya$employed[kenya$gender == 0])
emp_rate_f
emp_rate_m
# Odds of employment within each group: p / (1 - p)
orf <- emp_rate_f / (1 - emp_rate_f)
orm <- emp_rate_m / (1 - emp_rate_m)
# Odds ratio of women relative to men, then its log, to set against the
# gender coefficient of reg2
orf / orm
log(orf / orm)

In [None]:
# Converting log odds
# Work from the unrounded coefficient and round only for display; rounding
# before exponentiating would carry the rounding error into the odds ratio.
gender_coef <- summary(reg2)$coefficients[2, 1]
logodds <- round(gender_coef, 3) # log odds ratio (coefficient on gender)
oddsratio <- round(exp(gender_coef), 3) # odds ratio
logodds
oddsratio

In [None]:
# Logit model adding a continuous regressor (age); the outcome is still binary
reg3 <- glm(
  employed ~ gender + age,
  data = kenya,
  family = "binomial"
)
summary(reg3)

In [None]:
# How to interpret coefficient on age?
# Exponentiate the estimate in row 3 of the coefficient table
# (rows: intercept, gender, age)
age_logit_coef <- summary(reg3)$coefficients[3, 1]
round(exp(age_logit_coef), 3)
# Back to lecture

In [None]:
# Proxy variables
# Import the monthly wage data
wage_path <- "WAGE2.DTA"
wagedata2 <- read_dta(wage_path)
head(wagedata2)

In [None]:
# Suppose unobserved ability=delta_0 + \delta_1 IQ + v
# Seed the RNG so the simulated ability, and every estimate built on it, is
# the same on each run.
set.seed(1000)
noise_v <- runif(nrow(wagedata2)) # v drawn from U(0, 1)
wagedata2$ability <- -10 + 0.1 * wagedata2$IQ + noise_v
summary(wagedata2$ability)

In [None]:
# Biased model: ability is left out of the regression
reg4 <- lm(
  lwage ~ educ + exper,
  data = wagedata2
)
summary(reg4)
# True model: ability is included as a regressor
reg5 <- lm(
  lwage ~ educ + exper + ability,
  data = wagedata2
)
summary(reg5)

In [None]:
# How is the coefficient on educ biased? 
# What does that imply about the relationship between educ and ability?

In [None]:
# Proxy-variable regression: IQ takes the place of ability
reg6 <- lm(
  lwage ~ educ + exper + IQ,
  data = wagedata2
)
summary(reg6)

In [None]:
# What if IQ had been less of a good proxy?
# Suppose unobserved ability=delta_0 + \delta_1 IQ + \delta_2 educ + v_2
# Seed the RNG so the simulated ability2 is the same on each run.
set.seed(1000)
noise_v2 <- runif(nrow(wagedata2)) # v_2 drawn from U(0, 1)
wagedata2$ability2 <- -11 + 0.1 * wagedata2$IQ + 0.1 * wagedata2$educ + noise_v2
summary(wagedata2$ability2)

In [None]:
# True model, now with ability2 as the ability measure
reg7 <- lm(
  lwage ~ educ + exper + ability2,
  data = wagedata2
)
summary(reg7)
# Proxy-variable regression (IQ), shown again for comparison
summary(reg6)

In [None]:
# Including the proxy helps, but not as much

In [None]:
# What if IQ is a pretty bad proxy?
# ability3 loads weakly on IQ (0.01), more heavily on educ (0.25), and has a
# noise term seven times wider than before.
# Seed the RNG so the simulated ability3 is the same on each run.
set.seed(1000)
noise_v3 <- runif(nrow(wagedata2)) * 7
wagedata2$ability3 <- -8 + 0.01 * wagedata2$IQ + 0.25 * wagedata2$educ + noise_v3
summary(wagedata2$ability3)
# True model
reg9 <- lm(lwage ~ educ + exper + ability3, data = wagedata2)
summary(reg9)
# Regression with proxy variable
summary(reg6)
# Original biased regression
summary(reg4)

In [None]:
# We actually increase our bias with this bad proxy
# Back to lecture

In [None]:
# Measurement error in the dependent variable
# True model: wage as recorded in the data, without added error
reg10 <- lm(
  wage ~ age + educ + exper,
  data = wagedata2
)
summary(reg10)

In [None]:
# Summary statistics of the monthly wage variable
monthly_wage <- wagedata2$wage
summary(monthly_wage)

In [None]:
# Suppose we measure wages with classical random error of -250 to +500
# NOTE(review): U(-250, 500) has mean 125 rather than 0 - confirm that the
# non-zero mean is intended.
# Seed the RNG so the noisy wage and the estimates below are the same on
# each run.
set.seed(1000)
wage_error <- runif(nrow(wagedata2), -250, 500)
wagedata2$wage2 <- wagedata2$wage + wage_error
summary(wagedata2$wage2)
# Estimates with classical error in wage
reg11 <- lm(wage2 ~ age + educ + exper, data = wagedata2)
summary(reg11)

In [None]:
# Let's run some simulations with classical measurement error
library(ggplot2)

In [None]:
# Monte Carlo: redraw the measurement error n_sims times and store the educ
# coefficient and its t statistic from each re-estimated wage regression.
n_sims <- 1000
# Preallocate only the two result columns (no placeholder NA column)
estimates <- data.frame(
  coef = numeric(n_sims),
  tstat = numeric(n_sims)
)
set.seed(1000)
for (i in seq_len(n_sims)) {
  # Fresh U(-250, 500) error added to the recorded wage
  wagedata2$error <- runif(nrow(wagedata2), -250, 500)
  wagedata2$wage2 <- wagedata2$wage + wagedata2$error
  rege <- lm(wage2 ~ age + educ + exper, data = wagedata2)
  # Summarise once per draw and pull the educ row by name
  educ_row <- summary(rege)$coefficients["educ", ]
  estimates$coef[i] <- educ_row[["Estimate"]]
  estimates$tstat[i] <- educ_row[["t value"]]
}

In [None]:
# Density of the simulated educ coefficients (beta_2); the vertical line marks
# the estimate from reg10, the regression on the wage without added error
truth <- summary(reg10)$coefficients[3, 1]
p <- ggplot(data = estimates, mapping = aes(x = coef)) +
  geom_density(fill = "blue", alpha = 0.5) +
  geom_vline(xintercept = truth)
p
# Simulation average next to the benchmark value
mean(estimates$coef)
truth

In [None]:
# Density of the simulated t statistics for beta_2; the vertical line marks
# the t statistic from reg10, the regression on the wage without added error
truth <- summary(reg10)$coefficients[3, 3]
p <- ggplot(data = estimates, mapping = aes(x = tstat)) +
  geom_density(fill = "blue", alpha = 0.5) +
  geom_vline(xintercept = truth)
p
# Simulation average next to the benchmark value
mean(estimates$tstat)
truth

In [None]:
# Suppose we measure wages with non-classical error, correlated with educ
# The U(-500, 500) draw is scaled by 4 / educ, so the size of the error
# depends on each person's education.
# NOTE(review): assumes educ is never 0 (it is a divisor) - confirm against
# the data.
# Seed the RNG so the noisy wage and the estimates below are the same on
# each run.
set.seed(1000)
educ_error <- runif(nrow(wagedata2), -500, 500) / wagedata2$educ * 4
wagedata2$wage3 <- wagedata2$wage + educ_error
summary(wagedata2$wage3)
reg12 <- lm(wage3 ~ age + educ + exper, data = wagedata2)
summary(reg12)
# Benchmark: the same regression on the wage without added error
summary(reg10)

In [None]:
# Suppose we measure wages with non-classical error, correlated with age
# The U(-100, 100) draw is scaled by age / 2, so the size of the error
# depends on each person's age. This replaces any existing wage3 column.
# Seed the RNG so the noisy wage and the estimates below are the same on
# each run.
set.seed(1000)
age_error <- runif(nrow(wagedata2), -100, 100) * wagedata2$age / 2
wagedata2$wage3 <- wagedata2$wage + age_error
summary(wagedata2$wage3)
reg13 <- lm(wage3 ~ age + educ + exper, data = wagedata2)
summary(reg13)
# Benchmark: the same regression on the wage without added error
summary(reg10)