In [None]:
# haven supplies read_dta(), used below to read the Stata .dta files
library(haven)
# A positive scipen biases printed numbers toward fixed over scientific notation
options(scipen = 5)

In [None]:
# Read the GPA dataset from its Stata file and preview the first rows
gpadata <- read_dta(file = "gpa2.dta")
head(gpadata, n = 6L)

In [None]:
# Baseline model: college GPA regressed on SAT score, high school class
# percentile, high school class size, and class size squared
reg1 <- lm(
  formula = colgpa ~ sat + hsperc + hsize + hsizesq,
  data = gpadata
)
summary(reg1)

In [None]:
# Predicted value: intercept plus each slope times the chosen regressor value
# (sat = 1200, hsperc = 30, hsize = 5, hsizesq = 25)
regco <- coef(summary(reg1))
regco
predgpa1 <- regco["(Intercept)", "Estimate"] +
  1200 * regco["sat", "Estimate"] +
  regco["hsperc", "Estimate"] * 30 +
  regco["hsize", "Estimate"] * 5 +
  regco["hsizesq", "Estimate"] * 25
# Report the prediction rounded to three decimals
paste0(
  "The predicted college gpa for someone with a SAT of 1200,",
  " hsperc = 30, and hsize = 5 is ", round(predgpa1, 3)
)

In [None]:
# Same prediction written as the dot product of the regressor values
# (leading 1 for the intercept) with the vector of coefficient estimates
predgpa2 <- c(1, 1200, 30, 5, 25) %*% coef(summary(reg1))[, "Estimate"]
# %*% returns a 1x1 matrix; convert to a plain number for display
round(as.numeric(predgpa2), 3)

In [None]:
# Re-centre every regressor at the prediction point (sat = 1200, hsperc = 30,
# hsize = 5, hsizesq = 25) so that the intercept of the re-estimated model is
# the predicted value and its standard error is the SE of that prediction
gpadata[["sat0"]] <- gpadata[["sat"]] - 1200
gpadata[["hsperc0"]] <- gpadata[["hsperc"]] - 30
gpadata[["hsize0"]] <- gpadata[["hsize"]] - 5
gpadata[["hsizesq0"]] <- gpadata[["hsizesq"]] - 25

reg2 <- lm(
  formula = colgpa ~ sat0 + hsperc0 + hsize0 + hsizesq0,
  data = gpadata
)
summary(reg2)

In [None]:
# Does the intercept of the re-centred model match the earlier prediction
# (both rounded to three decimals)?
round(coef(summary(reg2))["(Intercept)", "Estimate"], 3) == round(predgpa1, 3)
# 95% confidence interval: intercept estimate +/- 1.96 standard errors
paste0(
  "A 95% confidence interval around the predicted GPA is [",
  round(
    coef(summary(reg2))["(Intercept)", "Estimate"] -
      1.96 * coef(summary(reg2))["(Intercept)", "Std. Error"],
    3
  ),
  ",",
  round(
    coef(summary(reg2))["(Intercept)", "Estimate"] +
      1.96 * coef(summary(reg2))["(Intercept)", "Std. Error"],
    3
  ),
  "]."
)
# Total width of that interval
paste0(
  "The range is ",
  round(2 * 1.96 * coef(summary(reg2))["(Intercept)", "Std. Error"], 3)
)

In [None]:
# How do we interpret this?

In [None]:
# Does the point at which we predict change the SE of the prediction?
# Consider predicting for a student from one of the biggest high schools:
# first look at the distribution of hsize
summary(gpadata[["hsize"]])
# Will the SE for the predicted value be the same, smaller, or larger?

In [None]:
# Re-centre the regressors at a new prediction point: identical to the
# previous one except that hsize is 9 (so hsizesq is 81)
gpadata[["sat0"]] <- gpadata[["sat"]] - 1200          # unchanged
gpadata[["hsperc0"]] <- gpadata[["hsperc"]] - 30      # unchanged
gpadata[["hsize0"]] <- gpadata[["hsize"]] - 9         # new value
gpadata[["hsizesq0"]] <- gpadata[["hsizesq"]] - 81    # new value (9^2)
reg3 <- lm(
  formula = colgpa ~ sat0 + hsperc0 + hsize0 + hsizesq0,
  data = gpadata
)
summary(reg3)

In [None]:
# 95% confidence interval for the prediction at hsize = 9:
# intercept estimate of reg3 +/- 1.96 standard errors
paste0(
  "A 95% confidence interval around the predicted GPA is [",
  round(
    coef(summary(reg3))["(Intercept)", "Estimate"] -
      1.96 * coef(summary(reg3))["(Intercept)", "Std. Error"],
    3
  ),
  ",",
  round(
    coef(summary(reg3))["(Intercept)", "Estimate"] +
      1.96 * coef(summary(reg3))["(Intercept)", "Std. Error"],
    3
  ),
  "]."
)
# Total width of that interval
paste0(
  "The range is ",
  round(2 * 1.96 * coef(summary(reg3))["(Intercept)", "Std. Error"], 3)
)

In [None]:
# Why is this estimate less precise?
# Where should the predicted value SE be smallest?

In [None]:
# Centre each regressor at its sample median; the squared term is centred at
# the square of the median class size so the point stays internally consistent
gpadata[["sat0"]] <- gpadata[["sat"]] - median(gpadata[["sat"]])
gpadata[["hsperc0"]] <- gpadata[["hsperc"]] - median(gpadata[["hsperc"]])
gpadata[["hsize0"]] <- gpadata[["hsize"]] - median(gpadata[["hsize"]])
gpadata[["hsizesq0"]] <- gpadata[["hsizesq"]] - median(gpadata[["hsize"]])^2
reg4 <- lm(
  formula = colgpa ~ sat0 + hsperc0 + hsize0 + hsizesq0,
  data = gpadata
)
summary(reg4)

In [None]:
# Back to lecture

In [None]:
# Confidence interval for a new observation
# Use same approach as before to get the point estimate and prediction error
# reg2 is the model with regressors centred at sat = 1200, hsperc = 30,
# hsize = 5, so its intercept row carries the point prediction and its SE
summary(reg2)

In [None]:
# Retrieve the estimate of the error variance (sigma-hat squared)
summary(reg2)$sigma^2
# Standard error of the prediction error for a NEW observation: it combines
# the sampling variance of the predicted mean (squared SE of reg2's intercept)
# with the estimated error variance. This is not the SE of a residual, despite
# the variable name, which is kept so later references keep working.
uhat_se <- sqrt(summary(reg2)$coef[1, 2]^2 + summary(reg2)$sigma^2)
uhat_se
# Prediction interval: point prediction +/- 1.96 * SE of the prediction error.
# The message says "prediction interval" to distinguish it from the (much
# narrower) confidence interval for the predicted mean reported earlier.
paste0(
  "A 95% prediction interval for a new observation's GPA is [",
  round(summary(reg2)$coef[1, 1] - 1.96 * uhat_se, 3), ",",
  round(summary(reg2)$coef[1, 1] + 1.96 * uhat_se, 3), "]."
)
# Total width of the prediction interval
paste0("The range is ", round(2 * 1.96 * uhat_se, 3))

In [None]:
# Back to lecture

In [None]:
# Read the wage dataset from its Stata file
wagedata <- read_dta(file = "WAGE1.DTA")

In [None]:
# Wage regression with a binary indicator for female plus education,
# experience and tenure.
# NOTE: the name reg4 was already used for the median-centred GPA model
# earlier in this file; this assignment replaces that object.
reg4 <- lm(
  formula = wage ~ female + educ + exper + tenure,
  data = wagedata
)
summary(reg4)

In [None]:
# wage is average hourly earnings in $, for this sample from 1979
# How to interpret intercept? Coefficient on female?

In [None]:
# Back to lecture

In [None]:
# Create the edcat categorical variable by binning years of education into
# [0, 8], (8, 11], (11, 15], (15, 18].
# include.lowest = TRUE closes the first interval on the left: with cut()'s
# default right-closed intervals, the lowest break (0) is otherwise excluded,
# so any educ value of exactly 0 would be turned into NA and silently dropped
# from every regression that uses edcat or the dummies derived from it.
wagedata$edcat <- cut(
  wagedata$educ,
  breaks = c(0, 8, 11, 15, 18),
  labels = c("No primary", "Compl. prim.", "Compl. sec.", "Compl. post-sec."),
  include.lowest = TRUE
)
# Inspect the first rows and confirm the new column is a factor
wagedata[1:10, c("educ", "edcat")]
class(wagedata$edcat)

In [None]:
# Build one 0/1 indicator per education category from the edcat factor
wagedata[["noprim"]] <- as.numeric(wagedata[["edcat"]] == "No primary")
wagedata[["prim"]] <- as.numeric(wagedata[["edcat"]] == "Compl. prim.")
wagedata[["sec"]] <- as.numeric(wagedata[["edcat"]] == "Compl. sec.")
wagedata[["postsec"]] <- as.numeric(wagedata[["edcat"]] == "Compl. post-sec.")
# Check the indicators against the source columns for the first rows
wagedata[1:10, c("educ", "edcat", "noprim", "prim", "sec", "postsec")]

In [None]:
# Wage regression with education dummies; noprim is left out, so
# "No primary" is the reference category
reg5 <- lm(
  formula = wage ~ prim + sec + postsec + female + exper + tenure,
  data = wagedata
)
summary(reg5)

In [None]:
# How to interpret coefficients?
# Change the reference category: prim is now the omitted dummy, so
# "Compl. prim." becomes the baseline
reg6 <- lm(
  formula = wage ~ noprim + sec + postsec + female + exper + tenure,
  data = wagedata
)
summary(reg6)

In [None]:
# Let R expand the categorical (factor) variable into dummies itself,
# instead of building them by hand
reg7 <- lm(
  formula = wage ~ as.factor(edcat) + female + exper + tenure,
  data = wagedata
)
summary(reg7)

In [None]:
# Back to lecture

In [None]:
# Two binary variables interacting
# 1) interaction model: married * female expands to both main effects plus
#    the married:female interaction term
reg8 <- lm(
  formula = wage ~ educ + exper + tenure + married * female,
  data = wagedata
)
summary(reg8)

In [None]:
# 2) combined categorical model: one 0/1 indicator for each sex-by-marital
#    status group
wagedata[["marrmale"]] <- with(wagedata, as.numeric(female == 0 & married == 1))
wagedata[["singmale"]] <- with(wagedata, as.numeric(female == 0 & married == 0))
wagedata[["marrfem"]] <- with(wagedata, as.numeric(female == 1 & married == 1))
wagedata[["singfem"]] <- with(wagedata, as.numeric(female == 1 & married == 0))
# singmale is left out, so single men are the reference group
reg9 <- lm(
  formula = wage ~ educ + exper + tenure + marrmale + singfem + marrfem,
  data = wagedata
)
summary(reg9)

In [None]:
# In the interaction model, married women differ from single men by the sum
# of the married, female and married:female coefficients
sum(
  coef(summary(reg8))[c("married", "female", "married:female"), "Estimate"]
)
# That sum should match the marrfem coefficient of the categorical model
# (compared after rounding to three decimals)
round(
  sum(
    coef(summary(reg8))[c("married", "female", "married:female"), "Estimate"]
  ),
  3
) == round(coef(summary(reg9))["marrfem", "Estimate"], 3)

In [None]:
# Back to lecture

In [None]:
# Hypothesis tests
library(car)
# Does sex matter?
# H0: no wage gap between women and men at either marital status.
#   reg8: the gap is `female` for singles and `female + married:female` for
#         married people, so both `female` and `married:female` must be zero.
#   reg9: single women equal single men (the reference group) and married
#         women equal married men.
# Testing `married + married:female = 0` (or `marrfem = 0`) instead would only
# say that all women earn the same as single men, which still lets married
# men and married women differ.
linearHypothesis(reg8, c("female = 0", "married:female = 0"))
linearHypothesis(reg9, c("singfem = 0", "marrfem = marrmale"))
# Does marital status matter for women?
# H0: married women earn the same as single women.
linearHypothesis(reg8, "married:female+married=0")
linearHypothesis(reg9, "singfem = marrfem")

In [None]:
# Other ways to test whether marital status matters for women
# 1) Make single women the reference category (singfem omitted), so the
#    marrfem coefficient is the married-vs-single gap for women
reg10 <- lm(
  formula = wage ~ educ + exper + tenure + marrmale + singmale + marrfem,
  data = wagedata
)
summary(reg10)
# 2) Plug the linear combination into the model: with female included,
#    the marrfem coefficient is again the married-vs-single gap for women
reg11 <- lm(
  formula = wage ~ educ + exper + tenure + marrmale + female + marrfem,
  data = wagedata
)
summary(reg11)