# Two Sample t-test

In [0]:
library(dplyr)
library(ggplot2)

### Important Formulas

Degrees of freedom for the equal variance case<br>
$
\begin{align}
df = n_1 + n_2 - 2 \\
\end{align}
$<br>

Degrees of freedom for the unequal variance case<br>
$
\begin{align}
df = \frac{\left(\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}\right)^2}{\frac{\left(\frac{s_1^2}{n_1}\right)^2}{n_1-1} + \frac{\left(\frac{s_2^2}{n_2}\right)^2}{n_2-1}}\\
\end{align}
$<br>

Standard deviation for the equal variance case<br>
$
\begin{align}
s_{\bar{X_1}-\bar{X_2}}=\sqrt{\frac{(n_1-1)s_1^2-(n_2-2)s_2^2}{n_1+n_2-2}}\\
\end{align}
$<br>

Standard deviation for the unequal variance case<br>
$
\begin{align}
s_{\bar{X_1}-\bar{X_2}}=\sqrt{s_1^2+s_2^2}
\end{align}
$

Standard error for the equal variance case<br>
$
\begin{align}
SE_{\bar{X_1}-\bar{X_2}}=\sqrt{\left(\frac{(n_1-1)s_1^2-(n_2-2)s_2^2}{n_1+n_2-2}\right)\left(\frac{1}{n_1}+\frac{1}{n_2}\right)}\\
\end{align}
$<br>

Standard error for the unequal variance case<br>
$
\begin{align}
SE_{\bar{X_1}-\bar{X_2}}=\sqrt{\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}}
\end{align}
$

### Data Generation Process

In [0]:
# Parameters of the experiment
n1 = 100
n2 = 100
alpha = 0.05

# Population parameters

mu1 = 11
mu2 = 12
sigma1 = 3
sigma2 = 2

In [0]:
# Let us say we have some data from two random processes

treatment1_data <- rnorm(n = n1, mean = mu1, sd = sigma1)
treatment2_data <- rnorm(n = n2, mean = mu2, sd = sigma2)

In [0]:
# Parameters of the population distribution to be treated as unknown throughout the hypothesis testing process
rm(mu1, mu2, sigma1, sigma2)

### Calculating Sample Statistics

In [7]:
glimpse(treatment1_data)

 num [1:100] 14.69 10.85 8.87 11.19 7.71 ...


In [8]:
glimpse(treatment2_data)

 num [1:100] 10.5 14.4 14.2 12.7 12.6 ...


In [0]:

x1_bar = mean(treatment1_data)
x2_bar = mean(treatment2_data)

s1 = sd(treatment1_data)
s2 = sd(treatment2_data)

sd = (((((n1 -1)*(s1^2)) + ((n2 -1)*(s2^2)))/(n1+n2-2)))^(1/2)
se = (((((n1 -1)*(s1^2)) + ((n2 -1)*(s2^2)))/(n1+n2-2))*((1/n1) + (1/n2)))^(1/2)
df = n1 + n2 - 1
observed_dfference = x1_bar - x2_bar
z = (0 - observed_dfference)/se

In [10]:
x1_bar
x2_bar
x1_bar-x2_bar

In [11]:
s1
s2

In [12]:
se
z

### Step by Steb Procedure

H0: Treatment 1 and Treatment 2 are not diferent i.e. mu1 = mu2 or mu1 - mu2 = 0 <br>
H1: Treatment 1 and Treatment 2 are the same i.e. mu1 != mu2 or mu1 - mu2 != 0 <br>

In [13]:
p = pt(q = z, df = df, lower.tail = TRUE, log.p = FALSE)
p = ifelse(p > 0.5, (1-p), p)*2
p

In [14]:
ifelse(p < alpha, "We reject the null hypothesis", "We fail to reject the null hypothesis")

### Alternative Approaoch - Confidence Interval

In [0]:
critical_value = qt(p = alpha/2, df = df, lower.tail = TRUE, log.p = FALSE)
lower_limit = ifelse(sign(critical_value) == 1 & sign (observed_dfference) == 1,
                 observed_dfference - (critical_value*se),
                 observed_dfference + (critical_value*se))
upper_limit = ifelse(sign(critical_value) == 1 & sign (observed_dfference) == 1,
                 observed_dfference + (critical_value*se),
                 observed_dfference - (critical_value*se))
ci = c(lower_limit, observed_dfference, upper_limit)

In [16]:
ci

In [17]:
ifelse(0 >= ci[1] & 0 <= ci[3],
       "The confidence interval contains zero",
       "The confidence interval (corresponding to chosen alpha) does not contain 0")

### Encapsulating the above process in a function (to enable iterative experimentation)

In [0]:
two_sample_t <- function(n1, n2, alpha, mu1, mu2, sigma1, sigma2) {
    # Generating data
    treatment1_data <- rnorm(n = n1, mean = mu1, sd = sigma1)
    treatment2_data <- rnorm(n = n2, mean = mu2, sd = sigma2)
    
    # Calculating sample statistics
    x1_bar = mean(treatment1_data)
    x2_bar = mean(treatment2_data)
    
    s1 = sd(treatment1_data)
    s2 = sd(treatment2_data)
    
    sd = (((((n1 -1)*(s1^2)) + ((n2 -1)*(s2^2)))/(n1+n2-2)))^(1/2)
    se = (((((n1 -1)*(s1^2)) + ((n2 -1)*(s2^2)))/(n1+n2-2))*((1/n1) + (1/n2)))^(1/2)
    df = n1 + n2 - 1
    observed_dfference = x1_bar - x2_bar
    z = (0 - (observed_dfference))/se
    
    # Calculating p-value
    p = pt(q = z, df = df, lower.tail = TRUE, log.p = FALSE)
#     p = pnorm(q = z, mean = 0, sd = sd, lower.tail = TRUE, log.p = FALSE)
    p = ifelse(p > 0.5, (1-p), p)*2
    
    # Calculating confidence interval
    critical_value = qt(p = alpha/2, df = df, lower.tail = TRUE, log.p = FALSE)
#     critical_value = qnorm(p = alpha/2, mean = observed_dfference, sd = sd, lower.tail = TRUE, log.p = FALSE)
    lower_limit = ifelse(sign(critical_value) == 1 & sign (observed_dfference) == 1,
                     observed_dfference - (critical_value*se),
                     observed_dfference + (critical_value*se))
    upper_limit = ifelse(sign(critical_value) == 1 & sign (observed_dfference) == 1,
                     observed_dfference + (critical_value*se),
                     observed_dfference - (critical_value*se))
    ci = c(lower_limit, observed_dfference, upper_limit)
    
    # Results of the hypothesis test
    null_hypothesis_test_method <- ifelse(p <= alpha, "We reject the null hypothesis", "We fail to reject the null hypothesis")
    confidence_interval_method <- ifelse(ci[1] <= 0 & 0 <= ci[3],
       "The confidence interval contains zero",
       "The confidence interval (corresponding to chosen alpha) does not contain 0")
    
    ret <- list(ifelse(null_hypothesis_test_method == "We reject the null hypothesis" &
                  confidence_interval_method == "The confidence interval (corresponding to chosen alpha) does not contain 0", "Reject",
                  
                  ifelse(null_hypothesis_test_method == "We fail to reject the null hypothesis" &
                         confidence_interval_method == "The confidence interval contains zero", "Fail to reject",
                         "ERROR")),
                p = p,
                ci = ci,
                x1_bar = x1_bar,
                x2_bar = x2_bar,
                s1 = s1,
                s2 = s2,
                se = se)
    
    return(ret)
}

In [19]:
# Parameters of the experiment
n1 = 1000000
n2 = 1000000
alpha = 0.05

# Population parameters

mu1 = 12
mu2 = 11
sigma1 = 3
sigma2 = 3

two_sample_t(n1 = n1,
             n2 = n2,
             alpha = alpha,
             mu1 = mu1,
             mu2 = mu2,
             sigma1 = sigma1,
             sigma2 = sigma2)

In [20]:
# Iterate

# Parameters of the experiment
n1 = 1000
n2 = 1000
alpha = 0.05

# Population parameters

mu1 = 11
mu2 = 11
sigma1 = 3
sigma2 = 3



results <- c()
for(i in 1:100){
    res = two_sample_t(n1 = n1,
                 n2 = n2,
                 alpha = alpha,
                 mu1 = mu1,
                 mu2 = mu2,
                 sigma1 = sigma1,
                 sigma2 = sigma2)
    results[i] = res[[1]]
    ifelse(results[i] == "Reject", print(res),NA)
}
table(results)

[[1]]
[1] "Reject"

$p
[1] 0.04729006

$ci
[1] -0.533240427 -0.268226038 -0.003211649

$x1_bar
[1] 10.95133

$x2_bar
[1] 11.21956

$s1
[1] 2.987032

$s2
[1] 3.055864

$se
[1] 0.135132

[[1]]
[1] "Reject"

$p
[1] 0.006452329

$ci
[1] -0.6376329 -0.3708826 -0.1041324

$x1_bar
[1] 10.7197

$x2_bar
[1] 11.09058

$s1
[1] 2.98983

$s2
[1] 3.092182

$se
[1] 0.1360172

[[1]]
[1] "Reject"

$p
[1] 0.0317272

$ci
[1] -0.54514184 -0.28504988 -0.02495791

$x1_bar
[1] 10.99483

$x2_bar
[1] 11.27988

$s1
[1] 2.95902

$s2
[1] 2.972006

$se
[1] 0.1326221

[[1]]
[1] "Reject"

$p
[1] 0.02669849

$ci
[1] -0.57125798 -0.30315390 -0.03504982

$x1_bar
[1] 10.88856

$x2_bar
[1] 11.19171

$s1
[1] 2.987701

$s2
[1] 3.124513

$se
[1] 0.1367075



results
Fail to reject         Reject 
            96              4 

In [21]:
unique(d)

ERROR: ignored

In [22]:
# Iterate

# Parameters of the experiment
n1 = 1000000
n2 = 1000000
alpha = 0.05

# Population parameters

mu1 = 11
mu2 = 11
sigma1 = 3
sigma2 = 3



results <- c()
for(i in 1:100){
    res = two_sample_t(n1 = n1,
                 n2 = n2,
                 alpha = alpha,
                 mu1 = mu1,
                 mu2 = mu2,
                 sigma1 = sigma1,
                 sigma2 = sigma2)
    results[i] = res[[1]]
    ifelse(results[i] == "Reject", print(res),NA)
}
table(results)

[[1]]
[1] "Reject"

$p
[1] 0.02038729

$ci
[1] 0.001523773 0.009838668 0.018153564

$x1_bar
[1] 11.00501

$x2_bar
[1] 10.99518

$s1
[1] 2.999904

$s2
[1] 2.999712

$se
[1] 0.004242369

[[1]]
[1] "Reject"

$p
[1] 0.04461064

$ci
[1] 0.0002050516 0.0085183584 0.0168316651

$x1_bar
[1] 11.00113

$x2_bar
[1] 10.99261

$s1
[1] 3.000262

$s2
[1] 2.998207

$se
[1] 0.004241558

[[1]]
[1] "Reject"

$p
[1] 0.0450028

$ci
[1] 0.0001895973 0.0085095203 0.0168294432

$x1_bar
[1] 11.00405

$x2_bar
[1] 10.99554

$s1
[1] 3.00193

$s2
[1] 3.001314

$se
[1] 0.004244934

[[1]]
[1] "Reject"

$p
[1] 0.02370396

$ci
[1] -0.017914332 -0.009597746 -0.001281161

$x1_bar
[1] 10.99468

$x2_bar
[1] 11.00427

$s1
[1] 2.999693

$s2
[1] 3.001142

$se
[1] 0.004243231

[[1]]
[1] "Reject"

$p
[1] 0.01798894

$ci
[1] -0.01836275 -0.01004285 -0.00172294

$x1_bar
[1] 10.99596

$x2_bar
[1] 11.006

$s1
[1] 3.000402

$s2
[1] 3.002828

$se
[1] 0.004244925

[[1]]
[1] "Reject"

$p
[1] 0.01223184

$ci
[1] 0.002312151 0.010620835

results
Fail to reject         Reject 
            93              7 

In [23]:
# Iterate

# Parameters of the experiment
n1 = 1000000
n2 = 1000000
alpha = 0.05

# Population parameters

mu1 = 12
mu2 = 11
sigma1 = 3
sigma2 = 3



results <- c()
for(i in 1:100){
    res = two_sample_t(n1 = n1,
                 n2 = n2,
                 alpha = alpha,
                 mu1 = mu1,
                 mu2 = mu2,
                 sigma1 = sigma1,
                 sigma2 = sigma2)
    results[i] = res[[1]]
#     ifelse(results[i] == "Reject", print(res),NA)
}
table(results)

results
Reject 
   100 

In [24]:
# Iterate

# Parameters of the experiment
n1 = 100
n2 = 100
alpha = 0.05

# Population parameters

mu1 = 12
mu2 = 11
sigma1 = 3
sigma2 = 3



results <- c()
for(i in 1:100){
    res = two_sample_t(n1 = n1,
                 n2 = n2,
                 alpha = alpha,
                 mu1 = mu1,
                 mu2 = mu2,
                 sigma1 = sigma1,
                 sigma2 = sigma2)
    results[i] = res[[1]]
#     ifelse(results[i] == "Reject", print(res),NA)
}
table(results)

results
Fail to reject         Reject 
            33             67 

In [25]:
# Iterate

# Parameters of the experiment
n1 = 1000000
n2 = 1000000
alpha = 0.05

# Population parameters

mu1 = 11
mu2 = 11
sigma1 = 3
sigma2 = 3



results <- c()
for(i in 1:100){
    res = two_sample_t(n1 = n1,
                 n2 = n2,
                 alpha = alpha,
                 mu1 = mu1,
                 mu2 = mu2,
                 sigma1 = sigma1,
                 sigma2 = sigma2)
    results[i] = res[[1]]
    ifelse(results[i] == "Reject", print(res),NA)
}
table(results)

[[1]]
[1] "Reject"

$p
[1] 0.02748067

$ci
[1] -0.017671337 -0.009354739 -0.001038141

$x1_bar
[1] 10.99412

$x2_bar
[1] 11.00347

$s1
[1] 2.998412

$s2
[1] 3.002431

$se
[1] 0.004243238

[[1]]
[1] "Reject"

$p
[1] 0.04710852

$ci
[1] -0.0167584510 -0.0084330851 -0.0001077191

$x1_bar
[1] 10.99617

$x2_bar
[1] 11.0046

$s1
[1] 3.002929

$s2
[1] 3.004241

$se
[1] 0.004247711

[[1]]
[1] "Reject"

$p
[1] 0.04025563

$ci
[1] -0.0170346425 -0.0087108786 -0.0003871146

$x1_bar
[1] 10.99333

$x2_bar
[1] 11.00204

$s1
[1] 3.002304

$s2
[1] 3.00371

$se
[1] 0.004246894

[[1]]
[1] "Reject"

$p
[1] 0.04693487

$ci
[1] -0.0167350516 -0.0084246049 -0.0001141582

$x1_bar
[1] 11.00089

$x2_bar
[1] 11.00932

$s1
[1] 2.998222

$s2
[1] 2.998183

$se
[1] 0.004240099



results
Fail to reject         Reject 
            96              4 