# Two Sample t-test

In [140]:
library(dplyr)
library(ggplot2)

# NOTE(review): hard-coded, machine-specific working directory. None of the
# visible cells read or write files, so this call may be unnecessary — confirm
# before removing.
setwd('/Users/chaitanyaanand/Code/Learning/RWorkingDir/')

### Important Formulas

Degrees of freedom for the equal variance case<br>
$
\begin{align}
df = n_1 + n_2 - 2 \\
\end{align}
$<br>

Degrees of freedom for the unequal variance case<br>
$
\begin{align}
df = \frac{\left(\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}\right)^2}{\frac{\left(\frac{s_1^2}{n_1}\right)^2}{n_1-1} + \frac{\left(\frac{s_2^2}{n_2}\right)^2}{n_2-1}}\\
\end{align}
$<br>

Standard deviation for the equal variance case<br>
$
\begin{align}
s_{\bar{X_1}-\bar{X_2}}=\sqrt{\frac{(n_1-1)s_1^2+(n_2-1)s_2^2}{n_1+n_2-2}}\\
\end{align}
$<br>

Standard deviation for the unequal variance case<br>
$
\begin{align}
s_{\bar{X_1}-\bar{X_2}}=\sqrt{s_1^2+s_2^2}
\end{align}
$

Standard error for the equal variance case<br>
$
\begin{align}
SE_{\bar{X_1}-\bar{X_2}}=\sqrt{\left(\frac{(n_1-1)s_1^2+(n_2-1)s_2^2}{n_1+n_2-2}\right)\left(\frac{1}{n_1}+\frac{1}{n_2}\right)}\\
\end{align}
$<br>

Standard error for the unequal variance case<br>
$
\begin{align}
SE_{\bar{X_1}-\bar{X_2}}=\sqrt{\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}}
\end{align}
$

### Data Generation Process

In [280]:
# Experiment design: per-group sample sizes and the significance level
n1 <- 100
n2 <- 100
alpha <- 0.05

# True population parameters (used only to simulate the data)
mu1 <- 11
mu2 <- 12
sigma1 <- 3
sigma2 <- 2

In [281]:
# Simulate the observed outcomes of the two random processes.
# Group 1 is drawn before group 2, so the random-number stream is consumed
# in the same order on every run.
treatment1_data <- stats::rnorm(n1, mean = mu1, sd = sigma1)
treatment2_data <- stats::rnorm(n2, mean = mu2, sd = sigma2)

In [282]:
# Drop the true population parameters from the workspace: the hypothesis test
# below must treat them as unknown and work from the samples alone.
rm(list = c("mu1", "mu2", "sigma1", "sigma2"))

### Calculating Sample Statistics

In [283]:
# Compact preview of the first sample (type, length, leading values)
dplyr::glimpse(treatment1_data)

 num [1:100] 7.86 10.91 6.4 14.19 15.7 ...


In [284]:
# Compact preview of the second sample (type, length, leading values)
dplyr::glimpse(treatment2_data)

 num [1:100] 13 11.43 9.79 10.16 15.2 ...


In [285]:

# Sample means of the two groups
x1_bar <- mean(treatment1_data)
x2_bar <- mean(treatment2_data)

# Sample standard deviations of the two groups
s1 <- sd(treatment1_data)
s2 <- sd(treatment2_data)

# Pooled variance (equal-variance case): the sample variances weighted by
# their degrees of freedom.
pooled_variance <- ((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)

# Pooled standard deviation. The variable name `sd` is kept for compatibility
# with the rest of the notebook even though it shares its name with the
# sd() function.
sd <- sqrt(pooled_variance)

# Standard error of the difference in sample means
se <- sqrt(pooled_variance * (1 / n1 + 1 / n2))

# Degrees of freedom for the pooled two-sample t-test: one degree of freedom
# is lost per estimated group mean, hence n1 + n2 - 2 (not n1 + n2 - 1).
df <- n1 + n2 - 2

observed_dfference <- x1_bar - x2_bar

# Test statistic under H0: mu1 - mu2 = 0. Despite the name `z` it is a
# t statistic; its sign is the negative of the observed difference, which is
# immaterial for the two-sided test performed below.
z <- (0 - observed_dfference) / se

In [286]:
# Display the two sample means and their difference
x1_bar
x2_bar
x1_bar-x2_bar

In [287]:
# Display the two sample standard deviations
s1
s2

In [288]:
# Display the standard error of the difference and the test statistic
se
z

### Step by Step Procedure

H0: Treatment 1 and Treatment 2 are not different, i.e. mu1 = mu2 or mu1 - mu2 = 0 <br>
H1: Treatment 1 and Treatment 2 are different, i.e. mu1 != mu2 or mu1 - mu2 != 0 <br>

In [289]:
# Two-sided p-value: twice the lower-tail probability at -|z|.
# Evaluating the lower tail at -|z| avoids computing 1 - p for a p close to 1,
# which loses precision (and can collapse to exactly 0) for large statistics.
p <- 2 * pt(q = -abs(z), df = df, lower.tail = TRUE, log.p = FALSE)
p

In [290]:
# Decision rule: reject H0 only when the p-value is strictly below alpha
ifelse(test = alpha > p,
       yes = "We reject the null hypothesis",
       no = "We fail to reject the null hypothesis")

### Alternative Approach - Confidence Interval

In [356]:
# Lower-tail alpha/2 quantile of the t distribution; it is negative for any
# alpha in (0, 1), so its absolute value is used as the multiplier below.
critical_value <- qt(p = alpha / 2, df = df, lower.tail = TRUE, log.p = FALSE)

# Half-width of the (1 - alpha) confidence interval for mu1 - mu2.
# The bounds are always estimate -/+ margin; neither the sign of the quantile
# nor the sign of the observed difference changes which bound is the lower one.
margin_of_error <- abs(critical_value) * se
lower_limit <- observed_dfference - margin_of_error
upper_limit <- observed_dfference + margin_of_error

# (lower bound, point estimate, upper bound)
ci <- c(lower_limit, observed_dfference, upper_limit)

In [359]:
# Display the interval as (lower bound, point estimate, upper bound)
ci

In [317]:
# Does the interval [ci[1], ci[3]] cover the hypothesised difference of zero?
ifelse(test = ci[1] <= 0 & ci[3] >= 0,
       yes = "The confidence interval contains zero",
       no = "The confidence interval (corresponding to chosen alpha) does not contain 0")

### Encapsulating the above process in a function (to enable iterative experimentation)

In [588]:
#' Simulate two normal samples and run a pooled (equal-variance) two-sample t-test
#'
#' Draws `n1` observations from N(mu1, sigma1^2) and `n2` observations from
#' N(mu2, sigma2^2), then tests H0: mu1 - mu2 = 0 against the two-sided
#' alternative in two ways: via the p-value and via the (1 - alpha) confidence
#' interval for the difference in means.
#'
#' @param n1,n2 Sample sizes of the two groups; each must be at least 2 so
#'   that a sample standard deviation exists.
#' @param alpha Significance level, strictly between 0 and 1.
#' @param mu1,mu2 Population means used to simulate the data.
#' @param sigma1,sigma2 Population standard deviations used to simulate the data.
#' @return A list. The first (unnamed) element is "Reject" when both methods
#'   reject H0, "Fail to reject" when neither does, "ERROR" when they disagree,
#'   and NA when the statistic is undefined. It is followed by the named
#'   elements `p` (two-sided p-value), `ci` (lower bound, point estimate,
#'   upper bound), `x1_bar`, `x2_bar`, `s1`, `s2` and `se`.
two_sample_t <- function(n1, n2, alpha, mu1, mu2, sigma1, sigma2) {
    stopifnot(
        is.numeric(n1), length(n1) == 1, n1 >= 2,
        is.numeric(n2), length(n2) == 1, n2 >= 2,
        is.numeric(alpha), length(alpha) == 1, alpha > 0, alpha < 1
    )

    # Generating data (group 1 first, so the random stream order is stable)
    treatment1_data <- rnorm(n = n1, mean = mu1, sd = sigma1)
    treatment2_data <- rnorm(n = n2, mean = mu2, sd = sigma2)

    # Calculating sample statistics
    x1_bar <- mean(treatment1_data)
    x2_bar <- mean(treatment2_data)

    s1 <- sd(treatment1_data)
    s2 <- sd(treatment2_data)

    # Pooled variance: sample variances weighted by their degrees of freedom
    pooled_variance <- ((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)
    se <- sqrt(pooled_variance * (1 / n1 + 1 / n2))

    # Pooled t-test degrees of freedom: n1 + n2 - 2 (one lost per group mean)
    df <- n1 + n2 - 2

    observed_difference <- x1_bar - x2_bar
    t_statistic <- observed_difference / se

    # Two-sided p-value, evaluated in the lower tail at -|t| so that no
    # precision is lost to a `1 - p` subtraction for large statistics.
    p <- 2 * pt(q = -abs(t_statistic), df = df)

    # (1 - alpha) confidence interval: estimate -/+ margin of error.
    # The lower-tail alpha/2 quantile is negative, hence the abs().
    margin_of_error <- abs(qt(p = alpha / 2, df = df)) * se
    ci <- c(observed_difference - margin_of_error,
            observed_difference,
            observed_difference + margin_of_error)

    # Results of the hypothesis test, once per method
    reject_by_p <- p <= alpha
    zero_in_ci <- ci[1] <= 0 & 0 <= ci[3]

    # The two methods are expected to agree; "ERROR" flags a disagreement.
    decision <- if (is.na(reject_by_p) || is.na(zero_in_ci)) {
        NA_character_
    } else if (reject_by_p && !zero_in_ci) {
        "Reject"
    } else if (!reject_by_p && zero_in_ci) {
        "Fail to reject"
    } else {
        "ERROR"
    }

    # The decision stays unnamed and first so callers can keep using res[[1]].
    list(decision,
         p = p,
         ci = ci,
         x1_bar = x1_bar,
         x2_bar = x2_bar,
         s1 = s1,
         s2 = s2,
         se = se)
}

In [592]:
# Single run: very large samples and a true difference in means of 1

# Experiment design
n1 <- 1e6
n2 <- 1e6
alpha <- 0.05

# Population parameters
mu1 <- 12
mu2 <- 11
sigma1 <- 3
sigma2 <- 3

two_sample_t(n1 = n1, n2 = n2, alpha = alpha,
             mu1 = mu1, mu2 = mu2,
             sigma1 = sigma1, sigma2 = sigma2)

In [596]:
# Iterate: repeat the experiment with H0 true (mu1 == mu2) and n = 1000 per
# group. Every "Reject" here is a false positive (Type I error).

# Parameters of the experiment
n1 <- 1000
n2 <- 1000
alpha <- 0.05

# Population parameters
mu1 <- 11
mu2 <- 11
sigma1 <- 3
sigma2 <- 3

# Preallocate the outcome vector instead of growing it inside the loop
n_simulations <- 100
results <- character(n_simulations)
for (i in seq_len(n_simulations)) {
    res <- two_sample_t(n1 = n1,
                        n2 = n2,
                        alpha = alpha,
                        mu1 = mu1,
                        mu2 = mu2,
                        sigma1 = sigma1,
                        sigma2 = sigma2)
    results[i] <- res[[1]]
    # Show the full details of every run that rejected H0
    if (identical(results[i], "Reject")) {
        print(res)
    }
}
table(results)

[[1]]
[1] "Reject"

$p
[1] 0.02074681

$ci
[1] 0.04734167 0.31017281 0.57300394

$x1_bar
[1] 11.26788

$x2_bar
[1] 10.95771

$s1
[1] 3.047416

$s2
[1] 2.945215

$se
[1] 0.1340188



results
Fail to reject         Reject 
            99              1 

In [545]:
# NOTE(review): `d` is not defined in any visible cell; this looks like a
# leftover from an earlier session and will fail in a fresh one — confirm.
unique(d)

In [595]:
# Iterate: repeat the experiment with H0 true (mu1 == mu2) and n = 1,000,000
# per group. Every "Reject" here is a false positive (Type I error).

# Parameters of the experiment
n1 <- 1000000
n2 <- 1000000
alpha <- 0.05

# Population parameters
mu1 <- 11
mu2 <- 11
sigma1 <- 3
sigma2 <- 3

# Preallocate the outcome vector instead of growing it inside the loop
n_simulations <- 100
results <- character(n_simulations)
for (i in seq_len(n_simulations)) {
    res <- two_sample_t(n1 = n1,
                        n2 = n2,
                        alpha = alpha,
                        mu1 = mu1,
                        mu2 = mu2,
                        sigma1 = sigma1,
                        sigma2 = sigma2)
    results[i] <- res[[1]]
    # Show the full details of every run that rejected H0
    if (identical(results[i], "Reject")) {
        print(res)
    }
}
table(results)

[[1]]
[1] "Reject"

$p
[1] 0.01090106

$ci
[1] 0.002487168 0.010807409 0.019127650

$x1_bar
[1] 11.00524

$x2_bar
[1] 10.99443

$s1
[1] 3.000715

$s2
[1] 3.002757

$se
[1] 0.004245096

[[1]]
[1] "Reject"

$p
[1] 0.03115618

$ci
[1] 0.0008279042 0.0091446213 0.0174613384

$x1_bar
[1] 11.00581

$x2_bar
[1] 10.99667

$s1
[1] 3.000814

$s2
[1] 3.000116

$se
[1] 0.004243298



results
Fail to reject         Reject 
            98              2 

In [600]:
# Iterate: repeat the experiment with H0 false (mu1 - mu2 = 1) and
# n = 1,000,000 per group.

# Parameters of the experiment
n1 <- 1000000
n2 <- 1000000
alpha <- 0.05

# Population parameters
mu1 <- 12
mu2 <- 11
sigma1 <- 3
sigma2 <- 3

# Preallocate the outcome vector instead of growing it inside the loop
n_simulations <- 100
results <- character(n_simulations)
for (i in seq_len(n_simulations)) {
    res <- two_sample_t(n1 = n1,
                        n2 = n2,
                        alpha = alpha,
                        mu1 = mu1,
                        mu2 = mu2,
                        sigma1 = sigma1,
                        sigma2 = sigma2)
    results[i] <- res[[1]]
    # Printing of rejecting runs is intentionally disabled for this scenario:
    # if (identical(results[i], "Reject")) print(res)
}
table(results)

results
Reject 
   100 

In [607]:
# Iterate: repeat the experiment with H0 false (mu1 - mu2 = 1) and only
# n = 100 per group. Every "Fail to reject" here is a missed true difference
# (Type II error).

# Parameters of the experiment
n1 <- 100
n2 <- 100
alpha <- 0.05

# Population parameters
mu1 <- 12
mu2 <- 11
sigma1 <- 3
sigma2 <- 3

# Preallocate the outcome vector instead of growing it inside the loop
n_simulations <- 100
results <- character(n_simulations)
for (i in seq_len(n_simulations)) {
    res <- two_sample_t(n1 = n1,
                        n2 = n2,
                        alpha = alpha,
                        mu1 = mu1,
                        mu2 = mu2,
                        sigma1 = sigma1,
                        sigma2 = sigma2)
    results[i] <- res[[1]]
    # Printing of rejecting runs is intentionally disabled for this scenario:
    # if (identical(results[i], "Reject")) print(res)
}
table(results)

results
Fail to reject         Reject 
            34             66 

In [599]:
# Iterate: another batch with H0 true (mu1 == mu2) and n = 1,000,000 per
# group. Every "Reject" here is a false positive (Type I error).

# Parameters of the experiment
n1 <- 1000000
n2 <- 1000000
alpha <- 0.05

# Population parameters
mu1 <- 11
mu2 <- 11
sigma1 <- 3
sigma2 <- 3

# Preallocate the outcome vector instead of growing it inside the loop
n_simulations <- 100
results <- character(n_simulations)
for (i in seq_len(n_simulations)) {
    res <- two_sample_t(n1 = n1,
                        n2 = n2,
                        alpha = alpha,
                        mu1 = mu1,
                        mu2 = mu2,
                        sigma1 = sigma1,
                        sigma2 = sigma2)
    results[i] <- res[[1]]
    # Show the full details of every run that rejected H0
    if (identical(results[i], "Reject")) {
        print(res)
    }
}
table(results)

[[1]]
[1] "Reject"

$p
[1] 0.02054402

$ci
[1] 0.001512685 0.009833840 0.018154996

$x1_bar
[1] 11.00748

$x2_bar
[1] 10.99764

$s1
[1] 3.001501

$s2
[1] 3.002632

$se
[1] 0.004245563

[[1]]
[1] "Reject"

$p
[1] 0.01428403

$ci
[1] -0.018713623 -0.010396619 -0.002079615

$x1_bar
[1] 10.99783

$x2_bar
[1] 11.00822

$s1
[1] 3.00171

$s2
[1] 2.999427

$se
[1] 0.004243445

[[1]]
[1] "Reject"

$p
[1] 0.03259038

$ci
[1] 0.0007516223 0.0090684211 0.0173852199

$x1_bar
[1] 11.0017

$x2_bar
[1] 10.99263

$s1
[1] 3.001089

$s2
[1] 2.9999

$se
[1] 0.00424334



results
Fail to reject         Reject 
            97              3 