In [None]:
# haven supplies read_dta(), which the cells below use to load Stata .dta files
library(haven)
# Penalise scientific notation so printed numbers favour fixed notation
options(scipen = 5)

In [None]:
# Load the monthly wage data from the Stata file and preview the first rows
wagedata2 <- read_dta(file = "WAGE2.DTA")
head(wagedata2)

In [None]:
# Measurement error in the dependent variable
# Baseline ("true") model: wage as recorded in the data, regressed on age,
# education and experience. Later cells compare noisy estimates against it.
reg10 <- lm(formula = wage ~ age + educ + exper, data = wagedata2)
summary(reg10)

In [None]:
# Five-number summary (plus mean) of the recorded monthly wage
summary(wagedata2[["wage"]])

In [None]:
# Add measurement error to wage: uniform noise on [-250, 500], drawn
# independently of the regressors. The noise has mean 125 rather than zero.
wagedata2[["wage2"]] <- wagedata2[["wage"]] +
  runif(nrow(wagedata2), min = -250, max = 500)
summary(wagedata2[["wage2"]])
# Same specification as reg10, but with the noisy wage as the outcome
reg11 <- lm(formula = wage2 ~ age + educ + exper, data = wagedata2)
summary(reg11)

In [None]:
# Let's run some simulations with classical measurement error
library(ggplot2)

In [None]:
# Monte Carlo with measurement error in the outcome: redraw uniform noise on
# [-250, 500], rebuild wage2, re-estimate the regression, and store the educ
# coefficient and its t statistic from every replication.
n_sims <- 1000

# Preallocate one row per replication; only the two columns used downstream.
estimates <- data.frame(coef = numeric(n_sims), tstat = numeric(n_sims))

# Fix the seed so the simulated distribution is reproducible.
set.seed(1000)
for (i in seq_len(n_sims)) {
  # Same draw order as before, so results match the original under this seed.
  wagedata2$error <- runif(nrow(wagedata2), -250, 500)
  wagedata2$wage2 <- wagedata2$wage + wagedata2$error
  rege <- lm(wage2 ~ age + educ + exper, data = wagedata2)

  # Build the coefficient table once per replication and pick the educ row by
  # name: a positional index would silently point at another term if a
  # regressor were ever dropped from the table.
  coef_table <- summary(rege)$coefficients
  estimates[i, "coef"] <- coef_table["educ", "Estimate"]
  estimates[i, "tstat"] <- coef_table["educ", "t value"]
}

In [None]:
# Density of the simulated educ coefficient, with a vertical line at the
# estimate from the baseline regression (reg10) for comparison
truth <- coef(summary(reg10))[3, 1]
p <- ggplot(data = estimates, mapping = aes(x = coef)) +
  geom_density(fill = "blue", alpha = 0.5) +
  geom_vline(xintercept = truth)
p
# Simulation average versus the baseline estimate
mean(estimates[["coef"]])
truth

In [None]:
# Density of the simulated t statistic on educ, with a vertical line at the
# t statistic from the baseline regression (reg10) for comparison
truth <- coef(summary(reg10))[3, 3]
p <- ggplot(data = estimates, mapping = aes(x = tstat)) +
  geom_density(fill = "blue", alpha = 0.5) +
  geom_vline(xintercept = truth)
p
# Simulation average versus the baseline t statistic
mean(estimates[["tstat"]])
truth

In [None]:
# Non-classical measurement error in wage: uniform noise on [-50, 50] scaled
# by 2 * educ, so the spread of the error grows with education.
# NOTE(review): the uniform multiplier is mean zero and drawn independently
# of educ, so the error's variance depends on educ but the error itself is
# not linearly correlated with educ -- confirm this is the intended design.
wagedata2[["wage3"]] <- wagedata2[["wage"]] +
  runif(nrow(wagedata2), min = -50, max = 50) * wagedata2[["educ"]] * 2
summary(wagedata2[["wage3"]])
# Noisy-outcome regression, followed by the baseline for side-by-side reading
reg12 <- lm(formula = wage3 ~ age + educ + exper, data = wagedata2)
summary(reg12)
summary(reg10)

In [None]:
# Non-classical measurement error in wage: uniform noise on [-100, 100] scaled
# by age / 2, so the spread of the error grows with age. Overwrites wage3.
# NOTE(review): the uniform multiplier is mean zero and drawn independently
# of age, so the error's variance depends on age but the error itself is
# not linearly correlated with age -- confirm this is the intended design.
wagedata2[["wage3"]] <- wagedata2[["wage"]] +
  runif(nrow(wagedata2), min = -100, max = 100) * wagedata2[["age"]] / 2
summary(wagedata2[["wage3"]])
# Noisy-outcome regression, followed by the baseline for side-by-side reading
reg13 <- lm(formula = wage3 ~ age + educ + exper, data = wagedata2)
summary(reg13)
summary(reg10)

In [None]:
# Labor supply data for Kenya in 2020 (during Covid); load and preview
kenya <- read_dta(file = "kenya_labor_covid.dta")
head(kenya)

In [None]:
# Measurement error in a regressor: add uniform noise on [0, 1], drawn
# independently of the data, to current_num_5_17
kenya[["current_5_17"]] <- kenya[["current_num_5_17"]] +
  runif(nrow(kenya), min = 0, max = 1)

In [None]:
# Regression using the regressor as recorded (current_num_5_17)
reg1 <- lm(
  formula = workinghrs_l7 ~ age + gender + ishead + marital +
    current_num_adults + current_num_5_17 + current_num_0_4,
  data = kenya
)
summary(reg1)
# Same regression with the noisy version of the regressor (current_5_17)
reg2 <- lm(
  formula = workinghrs_l7 ~ age + gender + ishead + marital +
    current_num_adults + current_5_17 + current_num_0_4,
  data = kenya
)
summary(reg2)

In [None]:
# Larger measurement error in the regressor: uniform noise on [0, 4] replaces
# the earlier noisy column, then the regression is re-estimated
kenya[["current_5_17"]] <- kenya[["current_num_5_17"]] +
  runif(nrow(kenya), min = 0, max = 4)
reg3 <- lm(
  formula = workinghrs_l7 ~ age + gender + ishead + marital +
    current_num_adults + current_5_17 + current_num_0_4,
  data = kenya
)
summary(reg3)

In [None]:
# Back to lecture

In [None]:
# RCT: load the experiment dataset from the Stata file and preview it
jobdata <- read_dta(file = "jip_dataset_forclass.dta")
head(jobdata)

In [None]:
# Convert the categorical columns to factors so lm() expands them into
# indicator variables instead of treating them as numeric
factor_cols <- c("geo_stata_zone", "trade_strata", "caste")
jobdata[factor_cols] <- lapply(jobdata[factor_cols], as.factor)

In [None]:
# Simple regression of emp on access and priority, restricted to the rows
# with survey_round equal to 3 (the endline survey)
reg1 <- lm(
  formula = emp ~ access + priority,
  data = jobdata,
  subset = survey_round == 3
)
summary(reg1)

In [None]:
# Interpret intercept and coefficient on access

In [None]:
# Same endline regression with controls added: zone and trade strata, sex,
# age, education and caste
reg2 <- lm(
  formula = emp ~ access + priority + geo_stata_zone + trade_strata +
    a_sex + age + educ + caste,
  data = jobdata,
  subset = survey_round == 3
)
summary(reg2)

In [None]:
# Back to lecture