In [1]:
# PACKAGES
%matplotlib inline
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import random as rd
import statistics as st
import pandas as pd

# SEABORN THEME
# Figures use a 16:9 aspect ratio, shrunk by a common scale factor.
scale = 0.4
W, H = 16*scale, 9*scale
figure_rc = {'figure.figsize': (W, H)}
sns.set(rc=figure_rc)
sns.set_style("white")

- At the end of last class we tested 
<br><br>
$$H_0:\mu=67.5$$
<br>
$$H_1:\mu \neq 67.5$$
<br>
    with (i) known $\sigma = 9.5$, (ii) unknown $\sigma$. We used test statistics and p values to reject $H_0$ at the 5% significance level (i.e. with a 95% confidence level). 
- Exercises:
    1. Test 
<br><br>
$$H_0:\mu=0$$
<br>
$$H_1:\mu \neq 0$$
<br>
        by computing test statistics and p values manually (as done in class) and, for (ii), check that the test statistic and p value are the same as the ones obtained with scipy.stats.ttest_1samp(a = grades, popmean = mu_null).
    2. Do the same test by computing the confidence interval (instead of test statistics and p values) manually and, for (ii), check that the confidence interval is the same as the one obtained with scipy.stats.t.interval(confidence_level, degrees_freedom, sample_mean, sample_standard_error).

- Solution 1:

In [2]:
# Two-sided z test of H0: mu = mu_null, treating the population sd as known.

# get data
df = pd.read_csv("https://raw.githubusercontent.com/ethanweed/pythonbook/main/Data/zeppo.csv")
grades = df['grades']

# set params
sd_true = 9.5   # population standard deviation, assumed known in case (i)
mu_null = 0     # population mean under H0
N = len(grades)

# get sample mean and the standard error of the mean based on the known sd
sample_mean = st.mean(grades)
sem_1 = sd_true / np.sqrt(N)

# get z score
z_score_1 = (sample_mean - mu_null) / sem_1

# get two-sided p value
# -abs(z) always selects the lower tail, so the result is also correct when
# the z score is negative (cdf(-z) alone would then exceed 0.5 and p would exceed 1)
lower_area_1 = st.NormalDist().cdf(-abs(z_score_1))
upper_area_1 = lower_area_1   # standard normal is symmetric
p_value_1 = lower_area_1 + upper_area_1

# display results
(z_score_1, p_value_1)

(34.03530837331259, 0.0)

In [3]:
# Two-sided one-sample t test of H0: mu = mu_null, with the sd estimated from the sample.

# set params
degrees_freedom = N-1

# get sample variance (sum of squared deviations divided by N-1), sample sd and estimated sem
deviations = (grades - sample_mean)
deviations2 = np.square(deviations)
S2 = np.sum(deviations2)/degrees_freedom
S = np.sqrt(S2)
sem_2 = S / np.sqrt(N)

# get t statistic (the name z_score_2 is kept because the comparison cell below reads it)
z_score_2 = (sample_mean - mu_null) / sem_2

# get two-sided p value from Student's t distribution
# -abs(t) always selects the lower tail, so the result is also correct when
# the statistic is negative (cdf(-t) alone would then exceed 0.5 and p would exceed 1)
lower_area_2 = stats.t.cdf(-abs(z_score_2), df = degrees_freedom)
upper_area_2 = lower_area_2   # t distribution is symmetric
p_value_2 = lower_area_2 + upper_area_2

# display results
(z_score_2, p_value_2)

(33.96161255929189, 1.7866093693224965e-18)

In [4]:
# compare to ttest_1samp
z_score_2_autom, p_value_2_autom = stats.ttest_1samp(a = grades, popmean = mu_null)
print(z_score_2_autom, p_value_2_autom)

# Compare floats with a relative tolerance instead of exact ==, which can fail
# on harmless rounding differences. atol=0 is essential here: the p value is
# of order 1e-18, so the default absolute tolerance (1e-8) would declare any
# two tiny p values equal.
(bool(np.isclose(z_score_2, z_score_2_autom, rtol=1e-9, atol=0.0)),
 bool(np.isclose(p_value_2, p_value_2_autom, rtol=1e-9, atol=0.0)))

33.96161255929189 1.7866093693224965e-18


(True, True)

- Solution 2:
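    - Under $H_0$ with $\mu_0=0$ and known variance, we fail to reject $H_0$ at the 5% level if and only if $\mu_0$ lies inside the 95% confidence interval built around the sample mean: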
    <br>
    $$\overline{X} - (1.96 \times SEM) \leq \mu_0 \leq \overline{X} + (1.96 \times SEM)$$
    <br>
    $$\Rightarrow \overline{X} - (1.96 \times SEM) \leq 0 \leq \overline{X} + (1.96 \times SEM)$$
    <br>

In [5]:
# 95% confidence interval with known sd: sample mean +/- 1.96 standard errors
a = 1.96
half_width_1 = a*sem_1
ci_1 = (sample_mean - half_width_1, sample_mean + half_width_1)
ci_1

(68.13644142589538, 76.46355857410461)

- Under $H_0$ with $\mu_0=0$ and unknown variance.
We need the critical values that cut off cumulative probabilities $0.025$ and $0.975$. Since the distribution is symmetric, it is enough to compute one of the two, say the one for $0.975$; the other is its negative:

In [6]:
# critical value: the 0.975 quantile of Student's t with the given degrees of freedom
stats.t.ppf(0.975, df=degrees_freedom)

2.093024054408263

- Under $H_0$ with $\mu_0=0$, unknown variance and 19 degrees of freedom:
    <br>
    $$\overline{X} - (2.09 \times SEM) \leq \mu_0 \leq \overline{X} + (2.09 \times SEM)$$
    <br>
    $$\Rightarrow \overline{X} - (2.09 \times SEM) \leq 0 \leq \overline{X} + (2.09 \times SEM)$$
    <br>

In [7]:
# Confidence interval with unknown sd: sample mean +/- t critical value times the estimated sem
alpha = 0.05
alpha_inv = 1.0 - alpha          # confidence level
q1 = (1 + alpha_inv) / 2         # cumulative probability of the upper critical value
a = stats.t.ppf(q1, df=degrees_freedom)
half_width_2 = a*sem_2
ci_2 = (sample_mean - half_width_2, sample_mean + half_width_2)
ci_2

(67.84421513791415, 76.75578486208585)

In [8]:
# compare the manual interval to scipy's: t distribution centred (loc) on the
# sample mean and scaled by the estimated standard error
ci_2_autom = stats.t.interval(alpha_inv, degrees_freedom, loc=sample_mean, scale=sem_2)
print(ci_2_autom)

# Compare both end-points with a relative tolerance instead of exact ==,
# which can fail on harmless floating-point rounding differences.
np.allclose(ci_2, ci_2_autom, rtol=1e-9, atol=0.0)

(67.84421513791415, 76.75578486208585)


True

In [9]:
# Print the docstring of stats.t.interval to check what its arguments mean
# (probability level, shape parameters, loc, scale).
help(stats.t.interval)

Help on method interval in module scipy.stats._distn_infrastructure:

interval(alpha, *args, **kwds) method of scipy.stats._continuous_distns.t_gen instance
    Confidence interval with equal areas around the median.
    
    Parameters
    ----------
    alpha : array_like of float
        Probability that an rv will be drawn from the returned range.
        Each value should be in the range [0, 1].
    arg1, arg2, ... : array_like
        The shape parameter(s) for the distribution (see docstring of the
        instance object for more information).
    loc : array_like, optional
        location parameter, Default is 0.
    scale : array_like, optional
        scale parameter, Default is 1.
    
    Returns
    -------
    a, b : ndarray of float
        end-points of range that contain ``100 * alpha %`` of the rv's
        possible values.



In [10]:
# IPython-only `??` introspection (not plain Python): asks for the source of t_gen.interval.
?? stats._continuous_distns.t_gen.interval