In [None]:
# Confidence intervals recap
#
# We want the mean height of a population: the 100 students in the class.
# X denotes the random variable height, measured in inches.
set.seed(12345) # fix the seed so the simulated heights are reproducible
# Simulate the 100 heights: 65 plus a uniform shift between -8 and +10,
# i.e. heights uniformly distributed from 57 to 75 inches
X <- 65 + runif(n = 100, min = -8, max = 10)
data <- data.frame(X)
# True population mean (known here because we observe the entire population)
mu <- mean(data[["X"]])
round(mu, digits = 3)

In [None]:
# Let's simulate drawing 1000 different samples of 25 students
n=25
# We will calculate a 95% confidence interval for each sample
# With n=25 the t distribution has n-1=24 degrees of freedom, so c_0.025=2.064
# (1.984 would be the critical value for roughly 100 degrees of freedom)
c_025=qt(0.975,df=n-1)
bar_xs<-numeric(1000) # we'll record the sample mean for each sample
inside<-numeric(1000) # we'll record whether mu is within the CI for each sample
for(s in 1:1000) {
    # Flag exactly n students, drawn without replacement, so the sample size
    # matches the n used in the standard error and in the degrees of freedom.
    # (runif(100)<0.25 only yields 25 students on average.)
    data$sample<-seq_len(nrow(data)) %in% sample(nrow(data),n)
    x_s=data[data$sample==TRUE,"X"]
    bar_x=mean(x_s)
    sd_x=sd(x_s)
    bar_xs[s]=bar_x
    # Interval bounds: sample mean +/- critical value * estimated standard error
    lci=bar_x-c_025*sd_x/sqrt(n)
    rci=bar_x+c_025*sd_x/sqrt(n)
    # Stored as 1/0 so that mean(inside) gives the coverage share
    inside[s]=(mu>=lci & mu<=rci)
}

In [None]:
# Distribution of the sample means across the simulated samples
hist(bar_xs, breaks = 50)
# Red line: average of the simulated sample means
abline(v = mean(bar_xs), lwd = 2, col = "red")
# Blue line: true population mean
abline(v = mu, lwd = 2, col = "blue")

In [None]:
# In what share of the samples was the true population mean inside our 95% confidence interval?
# We would expect it to be very close to 95%
# inside holds 1 (TRUE) or 0 (FALSE) for each sample, so its mean is that share
mean(inside)

In [None]:
# Back to lecture

In [None]:
# Hypothesis test
# H_0: mu>= 67 -> use 67 because if can reject one-sided test at 67, can reject for anything larger
mu_0=67
# One-sided 5% critical value of the t distribution with n-1=24 degrees of freedom (-1.711)
c=round(qt(0.05,df=n-1),3)
# Draw sample and get estimates
# Flag exactly n students (without replacement) so the sample size matches
# the n used in the test statistic and in the critical value
data$sample<-seq_len(nrow(data)) %in% sample(nrow(data),n)
bar_x=mean(data[data$sample==TRUE,"X"])
sd_x=sd(data[data$sample==TRUE,"X"])
# Calculate test statistic
t=(bar_x-mu_0)/(sd_x/sqrt(n))
round(t,3)
# Reject H_0 when t falls below the (negative) critical value
print(paste0("Is ",round(t,3), " less than ",c,"? ",t<c))

In [None]:
# What about another sample?
# Draw sample and get estimates
# Flag exactly n students (without replacement) so the sample size matches
# the n used in the test statistic below
data$sample<-seq_len(nrow(data)) %in% sample(nrow(data),n)
bar_x=mean(data[data$sample==TRUE,"X"])
sd_x=sd(data[data$sample==TRUE,"X"])
# Calculate test statistic
t=(bar_x-mu_0)/(sd_x/sqrt(n))
round(t,3)
# Reject H_0 when t falls below the critical value c set with the hypothesis
print(paste0("Is ",round(t,3), " less than ",c,"? ",t<c))

In [None]:
# Simulated t tests for 1000 samples
t_s<-numeric(1000) # we'll record the test statistic for each sample
reject_s<-numeric(1000) # we'll record whether we reject the null at the 5% significance level for each sample
for(s in 1:1000) {
    # Flag exactly n students (without replacement) so the sample size matches
    # the n used in the test statistic and behind the critical value c
    data$sample<-seq_len(nrow(data)) %in% sample(nrow(data),n)
    bar_x=mean(data[data$sample==TRUE,"X"])
    sd_x=sd(data[data$sample==TRUE,"X"])
    t=round((bar_x-mu_0)/(sd_x/sqrt(n)),3)
    t_s[s]=t
    # One-sided test: reject when t is below the critical value
    reject_s[s]=(t<c)
}

In [None]:
# What share of tests rejected the null?
summary(t_s) # distribution of the simulated test statistics
mean(reject_s) # reject_s is 1/0 per sample, so this is the share of rejections
# But the null (mu>=67) is false! The true mu is 66.169,
# so every sample in which we failed to reject is a Type II error

In [None]:
# Back to lecture

In [None]:
# Hypothesis: mu=66
mu_0=66
# Two-sided 5% critical value of the t distribution with n-1=24 degrees of freedom (2.064)
c=round(qt(0.975,df=n-1),3)
# Simulated t tests for 1000 samples
t_s<-numeric(1000) # we'll record the absolute test statistic for each sample
reject_s<-numeric(1000) # we'll record whether we reject the null at the 5% significance level for each sample
for(s in 1:1000) {
    # Flag exactly n students (without replacement) so the sample size matches
    # the n used in the test statistic and in the critical value
    data$sample<-seq_len(nrow(data)) %in% sample(nrow(data),n)
    bar_x=mean(data[data$sample==TRUE,"X"])
    sd_x=sd(data[data$sample==TRUE,"X"])
    t=round((bar_x-mu_0)/(sd_x/sqrt(n)),3)
    t_s[s]=abs(t)
    # Two-sided test: reject in either tail, so compare |t| (not t) with c
    reject_s[s]=(abs(t)>c)
}

In [None]:
# What share of tests rejected the null?
summary(t_s) # here t_s holds the absolute value of each test statistic
mean(reject_s) # reject_s is 1/0 per sample, so this is the share of rejections

In [None]:
# Back to lecture

In [None]:
# load some needed packages and data
library(haven) # provides read_dta() for reading Stata .dta files
library(tidyverse) # provides the pipe (%>%) and the dplyr verbs used below
yields <- read_dta("irr_yields.dta") # path is relative to the working directory

In [None]:
# Preview the first few rows of the yields data
head(yields)

In [None]:
# Mean log yield by irrigation status
means <- yields %>%
    # form one group per value of irr_loc
    group_by(irr_loc) %>%
    # average ln_tot_yield within each group, dropping missing values
    summarize(meanyield = mean(ln_tot_yield, na.rm = TRUE))
means

In [None]:
# Standard deviation of log yield by irrigation status
sds <- yields %>%
    # form one group per value of irr_loc
    group_by(irr_loc) %>%
    # standard deviation of ln_tot_yield within each group, dropping missing values
    summarize(sdyield = sd(ln_tot_yield, na.rm = TRUE))
sds

In [None]:
#find out sample sizes
# Count the non-missing ln_tot_yield observations in each irr_loc group.
# (is.na(ln_tot_yield == F) kept only the rows where the yield is missing, and
# count(ln_tot_yield) tallied per yield value instead of per group.)
ns<-yields %>%
group_by(irr_loc) %>%
filter(!is.na(ln_tot_yield)) %>%
summarize(n = n())
ns

In [None]:
# calculate t
# Two-sample t statistic with unequal variances:
# (mean_2 - mean_1) / sqrt(sd_2^2/n_2 + sd_1^2/n_1)
# Rows 1 and 2 of means, sds and ns are the first two irr_loc groups
# (presumably not irrigated, then irrigated -- confirm against the coding of irr_loc)
t<-(means[2,2]-means[1,2])/sqrt((sds[2,2]^2/ns$n[2] + sds[1,2]^2/ns$n[1]))
t