In [None]:
library(foreign)
# Read the Stata file WAGE1.DTA into a data frame (read.dta() is provided by
# the `foreign` package attached above).
wagedata <- read.dta(file = "WAGE1.DTA")

In [None]:
# R^2 versus adjusted R^2

# Step 1: add relevant variables.
# Reg1 regresses lwage on educ alone; Reg2 adds exper and female.
Reg1 <- lm(formula = lwage ~ educ, data = wagedata)
Reg2 <- lm(formula = lwage ~ educ + exper + female, data = wagedata)

# Display both fits so their R-squared and adjusted R-squared can be compared.
summary(object = Reg1)
summary(object = Reg2)

In [None]:
# Step 2: add a non-relevant variable.
# Reg3 is Reg2's specification with nonwhite appended.
Reg3 <- lm(
  formula = lwage ~ educ + exper + female + nonwhite,
  data = wagedata
)

# Display Reg2 next to Reg3 to see how R-squared and adjusted R-squared move.
summary(object = Reg2)
summary(object = Reg3)

In [None]:
# Back to lecture

In [None]:
# MLR standard errors
# Reg1 regresses lwage on educ only; Reg2 adds exper and female as controls.
# Question: what does the difference in SE(educ) between the two fits imply
# about the components of SE(educ)?
summary(Reg1)
summary(Reg2)

In [None]:
# What about adding a highly correlated variable?
# Build a synthetic `age` equal to exper + educ + 6 plus uniform noise between
# 0 and 5, so it is strongly (but not perfectly) collinear with educ and exper.
# Seed the generator first: without it the noise, and therefore every number
# in the Reg4 output, would change on each run.
set.seed(12345)
wagedata$age <- wagedata$exper + wagedata$educ + 6 + runif(nrow(wagedata)) * 5
Reg4 <- lm(lwage ~ educ + exper + female + age, data = wagedata)
# Question: what does the difference in SE(educ) when adding this variable
# imply about how the components of SE(educ) changed?
summary(Reg2)
summary(Reg4)

In [None]:
# Back to lecture

In [None]:
# Confidence-interval simulation

# Target quantity: the mean height of a population made up of the 100 students
# in the class. X denotes height in inches.
# Fix the seed so the simulated population is reproducible, then generate the
# 100 heights as 65 plus Uniform(-8, 10) noise, i.e. uniform on 57 to 75 inches.
set.seed(seed = 12345)
X <- 65 + runif(n = 100, min = -8, max = 10)
# True population mean (known here because the whole population is simulated)
mu <- mean(x = X)
round(x = mu, digits = 3)

In [None]:
# Draw a simple random sample of exactly 25 students, without replacement.
# Flagging each student independently with probability 0.25 would make the
# sample size random (25 only on average), whereas the confidence-interval
# calculation further down uses n = 25.
data <- data.frame(X)
# `sample` is TRUE for the 25 selected rows and FALSE for the other 75.
data$sample <- seq_len(nrow(data)) %in% sample(nrow(data), size = 25)
head(data, 10)

In [None]:
# Sample mean and sample standard deviation of height, computed over the rows
# flagged as sampled
bar_x <- mean(data$X[data$sample])
sd_x <- sd(data$X[data$sample])
c(bar_x, sd_x)

In [None]:
# Goal: a 95% confidence interval for the population mean height in the class.
# sigma_x is unknown and is estimated by the sample SD, so the standardized
# sample mean (bar_x - mu) / (sd_x / sqrt(n)) follows a t distribution with
# n - 1 degrees of freedom, where n is the SAMPLE size (not the population
# size of 100). With n = 25 that is 24 degrees of freedom and a critical value
# of about 2.064; the value 1.984 belongs to 99 degrees of freedom and would
# make the interval too narrow.
# Writing t* for the critical value, we know
#   Pr(-t* < (bar_x - mu) / (sd_x / sqrt(n)) < t*) = 0.95
# and rearranging gives
#   Pr(bar_x - t* * sd_x / sqrt(n) < mu < bar_x + t* * sd_x / sqrt(n)) = 0.95
# Interpretation: mu is fixed and the interval is what varies from sample to
# sample, so 95% of the intervals built this way will contain mu.
# Substitute in what we've calculated for this sample.
n <- 25                              # intended sample size; the simulation loop reads n
n_obs <- sum(data$sample)            # number of students actually in this sample
t_crit <- qt(0.975, df = n_obs - 1)  # two-sided 95% critical value
half_width <- t_crit * sd_x / sqrt(n_obs)
lci <- round(bar_x - half_width, 3)
rci <- round(bar_x + half_width, 3)
print(paste0("The 95% confidence interval is [", lci, ",", rci, "]."))
print(paste0("True mu of ", round(mu, 3), " is in this interval? ",
             (mu >= lci && mu <= rci)))

In [None]:
# Repeat the procedure for 1000 different samples of n students each and
# calculate a 95% confidence interval for every sample.
n_sims <- 1000
bar_xs <- numeric(n_sims)  # sample mean of each simulated sample
inside <- numeric(n_sims)  # 1 if that sample's interval contains mu, else 0
# Two-sided 95% critical value for n - 1 degrees of freedom. It depends only
# on the sample size, so compute it once instead of inside the loop.
t_crit <- qt(0.975, df = n - 1)
for (s in seq_len(n_sims)) {
    # Draw exactly n students without replacement, so the n in the standard
    # error matches the number of observations actually in the sample.
    data$sample <- seq_len(nrow(data)) %in% sample(nrow(data), size = n)
    x_s <- data$X[data$sample]
    bar_x <- mean(x_s)
    sd_x <- sd(x_s)
    bar_xs[s] <- bar_x
    # Compare against the unrounded bounds; rounding is only for display and
    # could flip the result when mu lies right at an endpoint.
    half_width <- t_crit * sd_x / sqrt(n)
    lci <- bar_x - half_width
    rci <- bar_x + half_width
    inside[s] <- (mu >= lci && mu <= rci)
}

In [None]:
# Distribution of the simulated sample means
hist(x = bar_xs, breaks = 50)
# Red line: average of the simulated sample means
abline(v = mean(bar_xs), col = "red", lwd = 2)
# Blue line: true population mean
abline(v = mu, col = "blue", lwd = 2)
# The histogram looks roughly normal and centered on mu, as the Central Limit
# Theorem suggests. With a larger sample size the shape would be even closer
# to normal.

In [None]:
# For what share of samples was the true population mean inside our 95%
# confidence interval? `inside` holds one indicator per simulated sample, so
# its mean is the empirical coverage rate.
# We would expect it to be very close to 95%
# NOTE(review): each sample is drawn without replacement from only 100
# students, and the interval uses sd_x / sqrt(n) with no finite-population
# correction, so observed coverage may differ somewhat from 95% -- confirm
# against the output.
mean(inside)
# This illustrates what it means to estimate a 95% confidence interval: in
# about 95% of repeated samples, the interval will contain mu