# Statistical Tests

In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()



In [2]:
# Fix the RNG seed so the random samples drawn with rnorm()/runif() are
# reproducible from run to run.
set.seed(seed = 100)

# One Sample t-test

Null Hypothesis: $\mu = \mu_0$, where $\mu_0$ is the hypothesized mean (here $\mu_0 = 10$)

In [3]:
# Simulate 50 draws from a normal distribution with mean 10 and sd 0.5, then
# test whether the population mean differs from 10 (two-sided by default).
x1 <- rnorm(50, mean = 10, sd = 0.5)
t.test(x = x1, mu = 10)


	One Sample t-test

data:  x1
t = 0.70372, df = 49, p-value = 0.4849
alternative hypothesis: true mean is not equal to 10
95 percent confidence interval:
  9.924374 10.157135
sample estimates:
mean of x 
 10.04075 


# Wilcoxon Signed Rank Test
Null Hypothesis: $m = m_0$, where $m$ is the population (pseudo)median and $m_0$ the hypothesized location (here $m_0 = 20$)

In [4]:
# Small hand-entered sample. It contains repeated values (ties) and values
# equal to the hypothesised location (zero differences), so wilcox.test()
# cannot compute exact results and warns accordingly.
x2 <- c(
  20, 29, 24, 19, 20,
  22, 28, 23, 19, 19
)
wilcox.test(x = x2, mu = 20, conf.int = TRUE)

“requested conf.level not achievable”
“cannot compute exact p-value with ties”
“cannot compute exact confidence interval with ties”
“cannot compute exact p-value with zeroes”
“cannot compute exact confidence interval with zeroes”



	Wilcoxon signed rank test with continuity correction

data:  x2
V = 30, p-value = 0.1056
alternative hypothesis: true location is not equal to 20
90 percent confidence interval:
 19.00006 25.99999
sample estimates:
(pseudo)median 
      23.00002 


# Two Sample Wilcoxon Test

Null Hypothesis: $m_1 - m_2 = 0$

In [5]:
# Two independent samples of unequal size (10 and 5 observations).
x3 <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46)
x4 <- c(1.15, 0.88, 0.90, 0.74, 1.21)

# One-sided rank-sum test: is x3 shifted towards larger values than x4?
# The alternative is spelled out in full rather than relying on partial
# matching of the abbreviation "g"; the result is unchanged.
wilcox.test(x3, x4, alternative = "greater")


	Wilcoxon rank sum test

data:  x3 and x4
W = 35, p-value = 0.1272
alternative hypothesis: true location shift is greater than 0


# Two Sample t-Test

Null Hypothesis: $\mu_1 - \mu_2 = 0$

In [6]:
# Pair 1: both samples share a mean of 10 but have different spreads.
x5 <- rnorm(n = 50, mean = 10, sd = 1.5)
x6 <- rnorm(n = 50, mean = 10, sd = 0.5)

# Pair 2: both samples share sd 1.5 but their means are 10 units apart.
x7 <- rnorm(n = 50, mean = 15, sd = 1.5)
x8 <- rnorm(n = 50, mean = 5, sd = 1.5)

# With two samples and default arguments, t.test() runs the Welch variant,
# which does not assume equal variances.
t.test(x = x5, y = x6)
t.test(x = x7, y = x8)


	Welch Two Sample t-test

data:  x5 and x6
t = -0.24561, df = 53.812, p-value = 0.8069
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.5830865  0.4558216
sample estimates:
mean of x mean of y 
 9.886474  9.950107 



	Welch Two Sample t-test

data:  x7 and x8
t = 34.787, df = 86.831, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
  9.721004 10.899201
sample estimates:
mean of x mean of y 
  15.1831    4.8730 


# Shapiro-Wilk Test

Null Hypothesis: $X \sim \mathcal{N}(\mu, \sigma^2)$, i.e. the sample was drawn from a normal distribution

In [7]:
# x2 is the hand-entered sample, i.e. not drawn from a normal distribution.
shapiro.test(x = x2)

# x8 was simulated with rnorm(), so it does come from a normal distribution.
shapiro.test(x = x8)

# A uniform sample on [5, 7.5]: definitely not normal.
x9 <- runif(50, min = 5, max = 7.5)
shapiro.test(x = x9)


	Shapiro-Wilk normality test

data:  x2
W = 0.84135, p-value = 0.04579



	Shapiro-Wilk normality test

data:  x8
W = 0.97511, p-value = 0.3684



	Shapiro-Wilk normality test

data:  x9
W = 0.93965, p-value = 0.01305


# Kolmogorov-Smirnov Test

Null Hypothesis: $X, Y \sim F$, i.e. both samples are drawn from the same (continuous) probability distribution $F$

In [8]:
# Different families: a standard normal sample vs. a uniform sample on [-1, 1].
x10 <- rnorm(50, mean = 0, sd = 1)
x11 <- runif(50, min = -1, max = 1)
ks.test(x = x10, y = x11)

# Same family and mean (0), but twice the standard deviation.
x12 <- rnorm(50, mean = 0, sd = 2)
ks.test(x = x10, y = x12)

# Same family and standard deviation (1), but the mean shifted to 5.
x13 <- rnorm(50, mean = 5, sd = 1)
ks.test(x = x10, y = x13)

# The same sample compared against itself: the two empirical CDFs coincide,
# so D = 0. Every value is tied with its copy, hence the ties warning.
ks.test(x = x10, y = x10)


	Two-sample Kolmogorov-Smirnov test

data:  x10 and x11
D = 0.16, p-value = 0.5487
alternative hypothesis: two-sided



	Two-sample Kolmogorov-Smirnov test

data:  x10 and x12
D = 0.28, p-value = 0.03919
alternative hypothesis: two-sided



	Two-sample Kolmogorov-Smirnov test

data:  x10 and x13
D = 1, p-value < 2.2e-16
alternative hypothesis: two-sided


“cannot compute exact p-value with ties”



	Two-sample Kolmogorov-Smirnov test

data:  x10 and x10
D = 0, p-value = 1
alternative hypothesis: two-sided


# Fisher's F-Test

Null Hypothesis: $\operatorname{Var}(X_1) = \operatorname{Var}(X_2)$

In [9]:
# x12 was simulated with twice the standard deviation of x10.
var.test(x = x10, y = x12)

# x13 was simulated with the same standard deviation as x10 (only the mean
# differs).
var.test(x = x10, y = x13)


	F test to compare two variances

data:  x10 and x12
F = 0.15636, num df = 49, denom df = 49, p-value = 1.257e-09
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.0887287 0.2755300
sample estimates:
ratio of variances 
         0.1563567 



	F test to compare two variances

data:  x10 and x13
F = 0.838, num df = 49, denom df = 49, p-value = 0.5385
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.4755475 1.4767215
sample estimates:
ratio of variances 
         0.8380043 


# Chi Squared Test

Null Hypothesis: categorical variables are independent

In [32]:
# Toy data set of two logical columns, one row per observation.
# Going by the column names, presumably: is the person at least 7 ft tall, and
# is the person a professional player.
bball_df <- tibble(
  gte_7_ft = c(
    TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE,
    TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE
  ),
  pro_player = c(
    TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE,
    FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE
  )
)
bball_df

gte_7_ft,pro_player
<lgl>,<lgl>
True,True
False,False
True,True
False,False
True,False
False,True
True,True
True,False
False,False
False,False


In [33]:
# Test independence of the two logical columns via their 2x2 contingency
# table.
chisq.test(x = table(bball_df))

“Chi-squared approximation may be incorrect”



	Pearson's Chi-squared test with Yates' continuity correction

data:  table(bball_df)
X-squared = 1.3672, df = 1, p-value = 0.2423


In [31]:
# Cross-tabulate the two columns of bball_df (rows: gte_7_ft, columns:
# pro_player).
table(bball_df)

        pro_player
gte_7_ft FALSE TRUE
   FALSE     5    1
   TRUE      3    5

# Correlation Test

Null Hypothesis: $\operatorname{cor}(X_1, X_2) = 0$

In [36]:
# Convert the built-in `cars` data set to a tibble and run the test on that
# tibble, so that `cars_df` is actually used rather than left as dead code.
cars_df <- as_tibble(cars)

# Correlation test between the `dist` and `speed` columns; cor.test() uses
# Pearson's product-moment correlation by default.
cor.test(cars_df$dist, cars_df$speed)


	Pearson's product-moment correlation

data:  cars$dist and cars$speed
t = 9.464, df = 48, p-value = 1.49e-12
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6816422 0.8862036
sample estimates:
      cor 
0.8068949 


# Fisher's Exact Test

Null Hypothesis: rows and columns are independent

In [39]:
# Toy data set of two logical columns (smoker, athlete), one row per
# observation.
athlete_df <- tibble(
  smoker = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE),
  athlete = c(FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE)
)
athlete_df

smoker,athlete
<lgl>,<lgl>
True,False
True,True
False,True
False,True
True,False
False,False
True,False
False,True
True,False


In [42]:
# Fisher's exact test on the 2x2 contingency table of each toy data set.
fisher.test(x = table(athlete_df))
fisher.test(x = table(bball_df))


	Fisher's Exact Test for Count Data

data:  table(athlete_df)
p-value = 0.2063
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.001288082 3.255946155
sample estimates:
odds ratio 
 0.1196876 



	Fisher's Exact Test for Count Data

data:  table(bball_df)
p-value = 0.1375
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
   0.4514166 469.3251580
sample estimates:
odds ratio 
  7.054394 
