In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import numpy as np
import random as rd

# How to decide whether the means of two samples are significantly different?

## Generate a Population of 10,000,000 elements

In [2]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same numbers.
# Both generators are seeded: NumPy's global RNG creates the population here,
# and the stdlib `random` module (imported as `rd`) is what later cells use
# for `rd.sample`.
SEED = 42
np.random.seed(SEED)
rd.seed(SEED)

# Population: 10,000,000 draws from a normal distribution N(mean=100, sd=20).
# It is kept as a plain Python list because `rd.sample` expects a sequence;
# `.tolist()` converts in one C-level pass instead of boxing every element
# through Python-level iteration as `list(ndarray)` does.
pop = np.random.normal(100, 20, 10000000).tolist()

### Mean and standard deviation from the Population:

In [3]:
# Parameters of the full population. np.std defaults to ddof=0, i.e. the
# population formula (divide by N), which is what we want here.
pop_mean, pop_sd = np.mean(pop), np.std(pop)

(pop_mean, pop_sd)

(99.9899433671507, 19.999733628535555)

In [None]:
# Density histogram of the population with the mean (orange) and
# mean +/- one standard deviation (red) marked as vertical lines.
fig, ax = plt.subplots()
ax.hist(pop, bins=100, density=True)
ax.axvline(x=pop_mean, color="orange")
for sd_edge in (pop_mean - pop_sd, pop_mean + pop_sd):
    ax.axvline(x=sd_edge, color="red")
ax.set_xlim(-50, 250)
plt.show()


## Drawing two random samples of 30 elements

In [None]:
# Size of each sample, followed by two draws without replacement from the
# population (x1 is drawn first, then x2).
sample_n = 30
x1, x2 = (rd.sample(pop, sample_n) for _draw in range(2))

### Mean and standard deviation from the samples:

In [None]:
# Estimates from the first sample; ddof=1 makes np.std divide by n - 1.
x1_mean_hat, x1_sd_hat = np.mean(x1), np.std(x1, ddof=1)

(x1_mean_hat, x1_sd_hat)

In [None]:
# Estimates from the second sample; ddof=1 makes np.std divide by n - 1.
x2_mean_hat, x2_sd_hat = np.mean(x2), np.std(x2, ddof=1)

(x2_mean_hat, x2_sd_hat)

In [None]:
# Density histogram of sample 1 on the same x-range as the population plot,
# with the sample mean (orange) and mean +/- one sample sd (red).
fig, ax = plt.subplots()
ax.hist(x1, bins=100, density=True)
ax.set_xlim(-50, 250)
ax.axvline(x=x1_mean_hat, color="orange")
for sd_edge in (x1_mean_hat - x1_sd_hat, x1_mean_hat + x1_sd_hat):
    ax.axvline(x=sd_edge, color="red")
plt.show()

In [None]:
# Density histogram of sample 2 on the same x-range as the population plot,
# with the sample mean (orange) and mean +/- one sample sd (red).
fig, ax = plt.subplots()
ax.hist(x2, bins=100, density=True)
ax.set_xlim(-50, 250)
ax.axvline(x=x2_mean_hat, color="orange")
for sd_edge in (x2_mean_hat - x2_sd_hat, x2_mean_hat + x2_sd_hat):
    ax.axvline(x=sd_edge, color="red")
plt.show()

## Drawing 1000 independent random samples from the population

In [None]:
# Repeat the sampling experiment n_samples times and keep only the mean of
# each sample of size sample_n.
n_samples = 1000
xn_means = []
for draw_no in range(n_samples):
    one_draw = rd.sample(pop, sample_n)
    xn_means.append(np.mean(one_draw))

### Mean and standard deviation from the sample means

In [None]:
# Centre and spread of the simulated sample means (np.std with its default
# ddof=0).
xn_means_mean_hat, xn_means_sd_hat = (stat(xn_means) for stat in (np.mean, np.std))

(xn_means_mean_hat, xn_means_sd_hat)

In [None]:
# Density histogram of the sample means, drawn on the same x-range as the
# earlier plots, with their mean (orange) and mean +/- one sd (red).
fig, ax = plt.subplots()
ax.hist(xn_means, bins=100, density=True)
ax.set_xlim(-50, 250)
ax.axvline(x=xn_means_mean_hat, color="orange")
for sd_edge in (
    xn_means_mean_hat - xn_means_sd_hat,
    xn_means_mean_hat + xn_means_sd_hat,
):
    ax.axvline(x=sd_edge, color="red")
plt.show()

## The standard deviation of the means can be derived differently and is generally called standard error!

### True standard error

In [None]:
# Standard error of the mean from the known population sd: sigma / sqrt(n).
x_se = np.divide(pop_sd, np.sqrt(sample_n))

(x_se)

### Estimated standard error purely from sample

In [None]:
# Standard error of the mean estimated from a single sample: s / sqrt(n).
# x1_sd_hat and x2_sd_hat are computed with ddof=1, so the n - 1 correction
# is already contained in s. Dividing by sqrt(n - 1) on top of that would
# apply the correction twice and overstate the standard error.
x1_se_hat = x1_sd_hat / np.sqrt(sample_n)
x2_se_hat = x2_sd_hat / np.sqrt(sample_n)

x1_se_hat, x2_se_hat

In [None]:
# Side by side: true standard error, sd of the simulated sample means, and
# the two single-sample estimates.
(
    x_se,
    xn_means_sd_hat,
    x1_se_hat,
    x2_se_hat,
)

## We can derive the standard error from the sample itself, and thereby the precision of the sample mean!