In [None]:
library(foreign)
# Read the Stata-format wage file into the data frame used by every cell below
wagedata <- read.dta(file = "WAGE1.DTA")

In [None]:
# Multicollinearity

# Look at the first 10 observations of the data
head(wagedata, n = 10)

# Baseline model: log wages on education, experience, and the female
# indicator. Later cells compare their fits against this one.
Reg1 <- lm(formula = lwage ~ educ + exper + female, data = wagedata)
summary(Reg1)

In [None]:
# What about multicollinear variables?
# Build male as the complement of female (male = 1 - female), which makes it
# an exact linear function of female and the intercept.
wagedata[["male"]] <- 1 - wagedata[["female"]]

# Regress log wages on education, experience, and both gender indicators
Reg2 <- lm(formula = lwage ~ educ + exper + female + male, data = wagedata)
summary(Reg2)

In [None]:
# What about more complicated multicollinearity?
# Age can be backed out of these data if we assume people start education at
# 6 and are never out of school or work afterwards.
# Add that implied age (exper + educ + 6) to the data
wagedata$age <- with(wagedata, exper + educ + 6)

# What happens when the regression controls for age, educ, and exper at once?
Reg3 <- lm(formula = lwage ~ educ + exper + female + age, data = wagedata)
summary(Reg3)

In [None]:
# What if we allow between 0-5 years away from education or work?
# *Almost* perfect collinearity: age2 is the implied age (exper + educ + 6)
# plus a uniform draw on [0, 5].

# Fix the RNG seed so the random draw, and with it the Reg4 estimates and
# standard errors, are the same every time this cell is run.
set.seed(20)
wagedata$age2 <- with(wagedata, exper + educ + 6) +
  runif(nrow(wagedata), min = 0, max = 5)
Reg4 <- lm(lwage ~ educ + exper + female + age2, data = wagedata)

# Show the near-collinear fit next to the baseline fit (Reg1) without age2
summary(Reg4)
summary(Reg1)

In [None]:
# Back to lecture

In [None]:
# R^2 and Adjusted R^2

# Adding relevant variables one at a time: each model nests the previous one,
# so the three summaries can be compared in order.

# Education only
Reg5 <- lm(formula = lwage ~ educ, data = wagedata)
summary(Reg5)

# Education and experience
Reg6 <- lm(formula = lwage ~ educ + exper, data = wagedata)
summary(Reg6)

# Education, experience, and the female indicator
Reg7 <- lm(formula = lwage ~ educ + exper + female, data = wagedata)
summary(Reg7)

In [None]:
# Adding non-relevant variables

# Step 1: add nonwhite to the Reg7 specification
Reg8 <- lm(lwage ~ educ + exper + female + nonwhite, data = wagedata)

# Step 2: add a regressor that is pure noise, a uniform(0, 1) draw per row.
# Fix the RNG seed so the draw, and with it the Reg9 R^2 and adjusted R^2,
# are the same every time this cell is run.
set.seed(56)
wagedata$noise <- runif(nrow(wagedata))
Reg9 <- lm(lwage ~ educ + exper + female + nonwhite + noise, data = wagedata)

# Compare the fits as the extra regressors are added (Reg7 is the reference)
summary(Reg7)
summary(Reg8)
summary(Reg9)

In [None]:
# Back to lecture

In [None]:
# MLR standard errors

# Simple regression of log wages on education alone
Reg0 <- lm(formula = lwage ~ educ, data = wagedata)

# What does the difference in SE(educ) when adding controls imply about the
# components of SE(educ)? Compare the simple fit with the baseline fit (Reg1).
summary(Reg0)
summary(Reg1)

In [None]:
# What about adding a highly-correlated variable?
# The noisy age (exper + educ + 6 plus a uniform draw on [0, 5]) goes in its
# own column, age_noisy, so that the exactly collinear wagedata$age is not
# overwritten.

# Fix the RNG seed so the draw, and with it the Reg10 standard errors, are the
# same every time this cell is run.
set.seed(75)
wagedata$age_noisy <- with(wagedata, exper + educ + 6) +
  runif(nrow(wagedata), min = 0, max = 5)
Reg10 <- lm(lwage ~ educ + exper + female + age_noisy, data = wagedata)

# What does the difference in SE(educ) when adding this variable imply about
# how the components of SE(educ) changed?
summary(Reg1)
summary(Reg10)