## A.1 Import libraries

In [2]:
#install.packages('dplyr')
library(dplyr)

## A.2 Import data

In [4]:
# Location of the problem-set data (raw CSV hosted on GitHub)
url <- "https://raw.githubusercontent.com/corybaird/PLCY_610_public/master/Discussion_sections/Disc7_PS4/ps4data.csv"
# Read the remote CSV straight into the data frame `df`
df <- read.csv(file = url)

In [5]:
# Preview the first three rows of the data frame
head(df, n = 3)

abd,age,fthr_ed,educ,log.wage
1,21,7,6,8.517393
1,29,7,3,6.370472
1,23,7,4,7.670362


# 1. Standard error and confidence intervals
- `PS4 corresponding problem 1.a`


$$\bar{x}\pm t \cdot \frac{s}{\sqrt{n}}$$

## 1.1 Calculation: by hand

### 1.1.1 Standard error

$$ \frac{s}{\sqrt{n}}$$

In [9]:
# Sample standard deviation of age.
# Stored as `age_sd` (not `sd`) so the value does not mask stats::sd().
age_sd <- sd(df$age)

# n (sample size)
n <- length(df$age)

# Standard error of the mean: s / sqrt(n)
standard_error <- age_sd / sqrt(n)
standard_error

### 1.1.2 t-score

In [11]:
# t-score for a 95% confidence level (two-sided, hence the 0.975 quantile).
# Degrees of freedom are n - 1 for the column being analysed, so count the
# age observations directly: `df$edu` is not a real column and only resolves
# through `$` partial matching to `educ`.
# Note: the `df =` argument means degrees of freedom, not the data frame!
tscore <- qt(0.975, df = length(df$age) - 1)

# x-bar (sample mean of age)
x_bar <- mean(df$age)

### 1.1.3 Confidence intervals

In [12]:
# Lower limit of the interval: mean minus the margin of error (t * SE).
# The outer parentheses make R display the value as it is assigned.
(lower_confidence <- x_bar - tscore * standard_error)

In [13]:
# Upper limit of the interval: mean plus the margin of error (t * SE).
# The outer parentheses make R display the value as it is assigned.
(upper_confidence <- x_bar + tscore * standard_error)

## 1.2 Calculation: by function (OPTIONAL)

In [38]:
#' Print a two-sided t-based confidence interval for a sample mean
#'
#' Computes `mean(column) +/- t * s / sqrt(n)`, where `t` is the
#' `(1 + conf_level) / 2` quantile of the t distribution with `n - 1`
#' degrees of freedom, and prints two lines: the formula with its numeric
#' pieces, then the upper and lower bounds rounded to 6 decimals.
#'
#' @param column Numeric vector with at least two observations
#'   (e.g. `df$age`).
#' @param conf_level Single confidence level strictly between 0 and 1.
#'   The default of 0.95 corresponds to the 0.975 quantile.
#' @return The "Upper bound ... Lower bound ..." string, returned invisibly
#'   because it is the value of the final `print()` call.
answer_function <- function(column, conf_level = 0.95) {
  stopifnot(
    is.numeric(column),
    length(column) >= 2,
    is.numeric(conf_level),
    length(conf_level) == 1,
    conf_level > 0,
    conf_level < 1
  )

  n <- length(column)
  # Named `sample_sd` rather than `sd` so the local does not mask stats::sd().
  sample_sd <- sd(column)
  standard_error <- sample_sd / sqrt(n)

  # Two-sided interval: the tail area 1 - conf_level is split across both
  # tails. Note that qt()'s `df` argument is degrees of freedom, not a
  # data frame.
  tscore <- qt((1 + conf_level) / 2, df = n - 1)
  margin <- tscore * standard_error

  x_mean <- mean(column)
  conf_lower <- round(x_mean - margin, 6)
  conf_upper <- round(x_mean + margin, 6)

  print(paste0(
    "Mean:", x_mean, "+-", round(tscore, 3), "*", round(standard_error, 3)
  ))
  print(paste0("Upper bound: ", conf_upper, " Lower bound: ", conf_lower))
}

In [40]:
# Report the 95% confidence interval for the age column
answer_function(column = df$age)

[1] "Mean:20.9082321187584+-1.963*0.184"
[1] "Upper bound: 21.269234 Lower bound: 20.54723"


# 2. t-tests

## 2.1 Two sided t-test: ONE MEAN
- `PS4 corresponding problem 1.b`

$$
\text{Null Hypothesis}--H_0: \mu = 30\\
\text{Alt Hypothesis}--H_a: \mu \neq 30
$$

In [21]:
# H0: the population mean of age equals 30 (two-sided alternative)
age <- 30

# One-sample t-test of the age column against the hypothesised mean
t.test(x = df$age, mu = age)


	One Sample t-test

data:  df$age
t = -49.442, df = 740, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 30
95 percent confidence interval:
 20.54723 21.26923
sample estimates:
mean of x 
 20.90823 


## 2.2 Two sided t-test: ONE MEAN
- `PS4 corresponding problem 1.c`

$$
\text{Null Hypothesis}--H_0: \mu = 22\\
\text{Alt Hypothesis}--H_a: \mu \neq 22
$$

In [22]:
# H0: the population mean of age equals 22 (two-sided alternative)
age <- 22

# One-sample t-test of the age column against the hypothesised mean
t.test(x = df$age, mu = age)


	One Sample t-test

data:  df$age
t = -5.9372, df = 740, p-value = 4.457e-09
alternative hypothesis: true mean is not equal to 22
95 percent confidence interval:
 20.54723 21.26923
sample estimates:
mean of x 
 20.90823 


## 2.3 Two-sided t-test: TWO MEANS

- `PS4 corresponding problem 1.d`


### 2.3.1 Separate two groups (filter data)
- We separated our data frame into two groups (data frames) in the 3rd video under number 3. Link [here](https://github.com/corybaird/PLCY_610_public/blob/master/Discussion_sections/Disc3_PS2/Disc3_PS2.ipynb)

In [23]:
# Keep only the rows where abd == 1 (the abducted group)
df_abducted <- dplyr::filter(df, abd == 1)

# Peek at the first two rows of the subset
head(df_abducted, n = 2)

abd,age,fthr_ed,educ,log.wage
1,21,7,6,8.517393
1,29,7,3,6.370472


In [24]:
# Keep only the rows where abd == 0 (the non-abducted group)
df_non_abducted <- dplyr::filter(df, abd == 0)

# Peek at the first two rows of the subset
head(df_non_abducted, n = 2)

abd,age,fthr_ed,educ,log.wage
0,29,7,7,7.404801
0,14,0,6,4.900324


### 2.3.2 Conduct t-test: Compare ages
- select data from each newly created df

$$
\text{Null Hypothesis}-- H_0: \quad \text{Mean age abducted} - \text{Mean age NONabducted} = 0\\
\text{Alt Hypothesis}-- H_a: \quad \text{Mean age abducted} - \text{Mean age NONabducted} \neq 0 
$$

In [25]:
# Two-sided two-sample t-test: mean age, abducted vs. non-abducted
t.test(x = df_abducted$age, y = df_non_abducted$age)


	Welch Two Sample t-test

data:  df_abducted$age and df_non_abducted$age
t = 3.239, df = 595.88, p-value = 0.001266
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.4783922 1.9521343
sample estimates:
mean of x mean of y 
 21.36580  20.15054 


#### 2.3.2.1 Conduct t-test: Compare ages (Alternative method)

In [33]:
# Same comparison without intermediate data frames: subset the age column
# with a logical mask on abd for each group
t.test(x = df$age[df$abd == 1], y = df$age[df$abd == 0])


	Welch Two Sample t-test

data:  df$age[df$abd == 1] and df$age[df$abd == 0]
t = 3.239, df = 595.88, p-value = 0.001266
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.4783922 1.9521343
sample estimates:
mean of x mean of y 
 21.36580  20.15054 


## 2.3 One-sided t-test: TWO MEANS
- `PS4 corresponding problem 1.f`

- Usually we conduct a two-sided test for two means


### 2.3.1 Alternative hypothesis: greater than 0
$$
\text{Null Hypothesis}-- H_0: \quad \text{Mean educ abducted} - \text{Mean educ NONabducted} \leq 0\\
\text{Alt Hypothesis}-- H_a: \quad \text{Mean educ abducted} - \text{Mean educ NONabducted} > 0 
$$

In [34]:
# One-sided two-sample t-test on educ.
# Alternative: mean(educ | abducted) - mean(educ | non-abducted) > 0
t.test(
  x = df_abducted$educ,
  y = df_non_abducted$educ,
  alternative = "greater"
)


	Welch Two Sample t-test

data:  df_abducted$educ and df_non_abducted$educ
t = -2.6798, df = 551.58, p-value = 0.9962
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 -0.9615124        Inf
sample estimates:
mean of x mean of y 
 6.820346  7.415771 


## 2.4 Two-sided t-test: TWO MEANS
- `PS4 corresponding problem 1.g`


In [35]:
# Two-sided two-sample t-test: father's education (fthr_ed) by group
t.test(x = df_abducted$fthr_ed, y = df_non_abducted$fthr_ed)


	Welch Two Sample t-test

data:  df_abducted$fthr_ed and df_non_abducted$fthr_ed
t = -1.1125, df = 572.99, p-value = 0.2664
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.8408032  0.2327410
sample estimates:
mean of x mean of y 
 5.764069  6.068100 
