In [2]:
# import display libs
from IPython.display import Image
%matplotlib inline
from IPython.display import Latex

In [3]:
# import libs
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt


# setup figure params
# Publication-style defaults: large serif fonts, thicker axes/ticks, and all
# text rendered through LaTeX.
figureparams = {'axes.labelsize': 24,
           'axes.titlesize': 20,
           'axes.linewidth': 1.3,
           'font.size': 20,
           'legend.fontsize': 18,
           'figure.figsize': (10,7),
           'font.family': 'serif',
           'font.serif': 'Computer Modern Roman',
           'xtick.labelsize': 18,
           'xtick.major.size': 5.5,
           'xtick.major.width': 1.3,
           'ytick.labelsize': 18,
           'ytick.major.size': 5.5,
           'ytick.major.width': 1.3,
           'text.usetex': True,
           'figure.autolayout': True}
plt.rcParams.update(figureparams)
matplotlib.rcParams['text.usetex']=True
# 'text.latex.unicode' is deprecated since Matplotlib 3.0 and no longer exists
# in later releases, where assigning to an unknown rcParam raises KeyError.
# Only set it on versions that still know the key.
if 'text.latex.unicode' in matplotlib.rcParams:
    matplotlib.rcParams['text.latex.unicode']=True
# last expression of the cell: displays the Matplotlib configuration directory
matplotlib.get_configdir()

The text.latex.unicode rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.


'C:\\Users\\EdwardJansenADC\\.matplotlib'

# Binomial test
### Contents
    1. Binomial test
    2. Confidence Intervals
        a. Clopper-Pearson
        b. Bootstrap
    3. Something else? (TODO)

## 1. Binomial test

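The one-sided binomial test asks how likely it is to observe at least (or at most) $k$ defaults among $N$ observations if the true default probability is $p$. Under $H_0$ the number of defaults $X$ follows a binomial distribution, so the p-value is $P(X \ge k) = \sum_{i=k}^{N} \binom{N}{i} p^i (1-p)^{N-i}$ for the "greater" alternative and $P(X \le k) = \sum_{i=0}^{k} \binom{N}{i} p^i (1-p)^{N-i}$ for the "smaller" alternative.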

In [7]:
# One-sided Binomial test (greater)
# N: number of obs
# k: number of defaults
# p: estimated default probability
N, k, p = 100, 10, 0.05

def binom_test_greater(N, k, p):
    """One-sided (upper-tail) binomial test p-value.

    Computes P(X >= k) for X ~ Binomial(N, p) by summing the probability
    mass function over i = k, ..., N.

    Parameters
    ----------
    N : int
        Number of observations (trials).
    k : int
        Observed number of successes (defaults).
    p : float
        Success probability under the null hypothesis.

    Returns
    -------
    numpy float
        Sum of the binomial probabilities for i in [k, N]; 0.0 if k > N.
    """
    # `np.math` was only an undocumented alias of the stdlib `math` module and
    # is no longer available in recent NumPy releases, so use `math` directly.
    import math

    n_fact = math.factorial(N)  # loop-invariant, computed once
    probs = []
    for i in range(k, N + 1):
        # exact integer binomial coefficient C(N, i); `//` is exact here
        coef = n_fact // (math.factorial(i) * math.factorial(N - i))
        probs.append(coef * p**i * (1 - p)**(N - i))

    return np.sum(probs)
        
# reference value from statsmodels for the same one-sided ("larger") alternative
from statsmodels.stats.proportion import binom_test
p_val_scipy = binom_test(k, N, p, alternative='larger')

# our own implementation
p_val = binom_test_greater(N, k, p)

# print both values so they can be compared by eye
print("Our p-value: ", p_val, sep="")
print("Scipys p-value: ", p_val_scipy, sep="")

Our p-value: 0.028188294163416006
Scipys p-value: 0.028188294163416006


In [8]:
# One-sided Binomial test (smaller)
# N: number of obs
# k: number of defaults
# p: estimated default probability
N, k, p = 100, 10, 0.15

def binom_test_smaller(N, k, p):
    """One-sided (lower-tail) binomial test p-value.

    Computes P(X <= k) for X ~ Binomial(N, p) by summing the probability
    mass function over i = 0, ..., k.

    Parameters
    ----------
    N : int
        Number of observations (trials).
    k : int
        Observed number of successes (defaults).
    p : float
        Success probability under the null hypothesis.

    Returns
    -------
    numpy float
        Sum of the binomial probabilities for i in [0, k].
    """
    # `np.math` was only an undocumented alias of the stdlib `math` module and
    # is no longer available in recent NumPy releases, so use `math` directly.
    import math

    n_fact = math.factorial(N)  # loop-invariant, computed once
    probs = []
    for i in range(0, k + 1):
        # exact integer binomial coefficient C(N, i); `//` is exact here
        coef = n_fact // (math.factorial(i) * math.factorial(N - i))
        probs.append(coef * p**i * (1 - p)**(N - i))

    return np.sum(probs)
        
# reference value from statsmodels for the same one-sided ("smaller") alternative
from statsmodels.stats.proportion import binom_test
p_val_scipy = binom_test(k, N, p, alternative='smaller')

# our own implementation
p_val = binom_test_smaller(N, k, p)

# print both values so they can be compared by eye
print("Our p-value: ", p_val, sep="")
print("Scipys p-value: ", p_val_scipy, sep="")

Our p-value: 0.09944740049806844
Scipys p-value: 0.09944740049806884


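The two-sided binomial test does not fix the direction of the deviation in advance: it sums the probabilities of all outcomes whose distance from the expected count $Np$ is at least as large as the observed distance $|k - Np|$, on both sides of the expectation.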

In [9]:
# Two-sided Binomial test
# Tests whether a die shows too few or too many 6s, judged by the size of the
# deviation np.abs(k - N*p) from the expected count.
# NOTE: if k is explicitly known (instead of only the deviation), then the one-sided test is more appropriate
# Die example: we expect a 6 to have probability 1/6, then
# N: number of dice throws
# k: number of sixes (we would expect an average of 235*1/6 = 39.16666666)
# p: H0
N, k, p = 235, 51, 1/6

def binom_test_two_sided(N, k, p):
    """Two-sided binomial test p-value based on the deviation from N*p.

    Sums the Binomial(N, p) probabilities of every outcome i in 0..N whose
    distance from the expected count N*p is at least as large as the
    observed distance |k - N*p|.

    Each outcome is tested individually instead of deriving tail limits with
    `np.round`, because rounding could (a) include an outcome that is *less*
    extreme than k, and (b) count P(X = k) twice when k equals N*p, giving a
    "p-value" larger than 1.

    NOTE(review): libraries may define the two-sided p-value differently
    (e.g. by comparing outcome probabilities); confirm before treating a
    library result as an exact reference.

    Parameters
    ----------
    N : int
        Number of observations (trials).
    k : int
        Observed number of successes.
    p : float
        Success probability under the null hypothesis.

    Returns
    -------
    numpy float
        The two-sided p-value, capped at 1.0.
    """
    # `np.math` was only an undocumented alias of the stdlib `math` module and
    # is no longer available in recent NumPy releases, so use `math` directly.
    import math

    mean = N * p
    dev = np.abs(k - mean)
    # small slack so the mirror image of k is not lost to floating-point error
    tol = 1e-9 * max(1.0, abs(mean))

    n_fact = math.factorial(N)  # loop-invariant, computed once
    probs = []
    for i in range(0, N + 1):
        if np.abs(i - mean) < dev - tol:
            continue  # strictly closer to the expectation than k: not extreme
        # exact integer binomial coefficient C(N, i); `//` is exact here
        coef = n_fact // (math.factorial(i) * math.factorial(N - i))
        probs.append(coef * p**i * (1 - p)**(N - i))

    # guard against round-off pushing the total marginally above 1
    return np.minimum(np.sum(probs), 1.0)
        
# reference value from statsmodels for the two-sided alternative
from statsmodels.stats.proportion import binom_test
p_val_scipy = binom_test(k, N, p, alternative='two-sided')

# our own implementation
p_val = binom_test_two_sided(N, k, p)

# print both values so they can be compared by eye
print("Our p-value: ", p_val, sep="")
print("Scipys p-value: ", p_val_scipy, sep="")

Our p-value: 0.04374797018240007
Scipys p-value: 0.043747970182413345


## 2. Confidence intervals

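Instead of testing a single hypothesised value of $p$, a confidence interval gives a range of values for the true probability $p$ that is compatible with the observed $k$ successes in $N$ trials at confidence level $1-\alpha$.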

#### 2a. Clopper-Pearson

In [10]:
from scipy.stats import beta

alpha = 0.05
# Clopper-Pearson bounds are taken from Beta-distribution quantiles;
# k and N are the values set by the preceding cell.
CI_low = beta.ppf(alpha / 2, k, N - k + 1)
CI_up = beta.ppf(1 - alpha / 2, k + 1, N - k)

# round to two decimals for display only
low_txt = str(np.round(CI_low, 2))
up_txt = str(np.round(CI_up, 2))
print("with 95% conf., real p lies within: (" + low_txt + ", " + up_txt + ")")
print("observed p: " + str(np.round(k / N, 2)))

with 95% conf., real p lies within: (0.17, 0.28)
observed p: 0.22


#### 2b. Bootstrap

In [11]:
N = 1000
p = .32
# Seed NumPy's global RNG so that a fresh "Restart & Run All" draws the same
# synthetic sample every time (later draws from the global RNG follow from it).
np.random.seed(42)
# N uniform draws on [0, 1); a draw below p counts as a success
rand_data = np.random.rand(N)
idx = np.where(rand_data < p)[0]
# 0/1 indicator array: 1 where the draw was a success
obs = np.zeros(N)
obs[idx] = 1

In [12]:
# let's check our expected probability
# k: number of ones in the simulated sample; p_expected: their share of N
k = obs.sum()
p_expected = k / N

In [13]:
# let's use Clopper-Pearson to create a reference
alpha = 0.05
# bounds from Beta-distribution quantiles, using k and N of the simulated sample
CI_low = beta.ppf(alpha / 2, k, N - k + 1)
CI_up = beta.ppf(1 - alpha / 2, k + 1, N - k)

# round to two decimals for display only
low_txt = str(np.round(CI_low, 2))
up_txt = str(np.round(CI_up, 2))
print("with 95% conf., real p lies within: (" + low_txt + ", " + up_txt + ")")
print("observed p: " + str(np.round(k / N, 2)))

with 95% conf., real p lies within: (0.32, 0.38)
observed p: 0.35


In [25]:
# let's now bootstrap the CI
def bootstrap(data, n=1000, func=np.sum, seed=None):
    """
    Generate `n` bootstrap resamples of `data` and evaluate a statistic on each.

    Every resample has the same length as `data` and is drawn with
    replacement. For each resample, `func(resample) / len(data)` is recorded
    (with the default `np.sum` this is the resample mean).

    Parameters
    ----------
    data : 1-D array-like
        Observations to resample from.
    n : int, optional
        Number of bootstrap resamples (default 1000).
    func : callable, optional
        Statistic applied to each resample before dividing by the sample
        size (default `np.sum`).
    seed : int or None, optional
        If given, resampling uses a private `np.random.RandomState(seed)` and
        is reproducible. If None (default), NumPy's global random state is
        used, exactly as before this parameter existed.

    Returns
    -------
    list
        The `n` simulated statistics, sorted in ascending order. Pass this
        list to `confidence_interval` to obtain interval bounds.
    """
    # the `np.random` module and a RandomState instance both expose `.choice`
    rng = np.random if seed is None else np.random.RandomState(seed)
    sample_size = len(data)
    simulations = []
    for _ in range(n):
        itersample = rng.choice(data, size=sample_size, replace=True)
        simulations.append(func(itersample) / sample_size)
    # sorted so that quantiles can be read off by index
    simulations.sort()
    return simulations

def confidence_interval(sim, alpha):
    """
    Return the two-sided, equal-tailed interval of `sim` at level 1 - alpha.

    The bounds are read off by index at the alpha/2 and 1 - alpha/2
    positions, so `sim` is assumed to be sorted in ascending order (as the
    list returned by `bootstrap` is).

    Parameters
    ----------
    sim : sequence
        Sorted simulated statistics.
    alpha : float
        Total tail probability, in [0, 1] (e.g. 0.05 for a 95% interval).

    Returns
    -------
    tuple
        (lower bound, upper bound) taken from `sim`.

    Raises
    ------
    ValueError
        If `alpha` is outside [0, 1].
    IndexError
        If `sim` is empty.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1], got " + str(alpha))
    n = len(sim)
    u_pval = 1 - alpha/2
    l_pval = alpha / 2
    l_idx = int(np.floor(n*l_pval))
    # for alpha == 0 the raw index equals n; clamp it to the last element
    u_idx = min(int(np.floor(n*u_pval)), n - 1)
    return(sim[l_idx], sim[u_idx])

In [26]:
# run the bootstrap
sim = bootstrap(obs)
bounds = confidence_interval(sim, alpha=0.05)

# compare the bootstrap interval with the Clopper-Pearson reference
boot_low, boot_up = bounds
cp_low_txt = str(np.round(CI_low, 2))
cp_up_txt = str(np.round(CI_up, 2))
print("Bootstrap: real p lies within: (" + str(boot_low) + ", " + str(boot_up) + ")")
print("Clopper-Pearson:, real p lies within: (" + cp_low_txt + ", " + cp_up_txt + ")")
print("Observed p: " + str(np.round(k / N, 2)))

Bootstrap: real p lies within: (0.319, 0.379)
Clopper-Pearson:, real p lies within: (0.32, 0.38)
Observed p: 0.35
