# Доверительные интервалы для двух долей 

In [None]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

  import pandas.util.testing as tm


## Загрузка данных

In [None]:
# The source file is tab-separated and is read without a header row;
# the two columns are named explicitly after loading.
path = 'https://raw.githubusercontent.com/chekhovana/courses/main/machine_learning/4_hypothesis_testing/data/1.4_banner_click_stat.txt'
data = pd.read_csv(path, delimiter='\t', header=None)
data.columns = ['banner_a', 'banner_b']

In [None]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,banner_a,banner_b
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0


In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,banner_a,banner_b
count,1000.0,1000.0
mean,0.037,0.053
std,0.188856,0.224146
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,1.0


## Интервальные оценки долей

$$\frac1{ 1 + \frac{z^2}{n} } \left( \hat{p} + \frac{z^2}{2n} \pm z \sqrt{ \frac{ \hat{p}\left(1-\hat{p}\right)}{n} + \frac{z^2}{4n^2} } \right), \;\; z \equiv z_{1-\frac{\alpha}{2}}$$ 

In [None]:
# Wilson score interval for the share of ones in each banner column.
# No alpha is passed, so proportion_confint uses its own default level.
conf_interval_banner_a = proportion_confint(count=sum(data['banner_a']),
                                            nobs=len(data),
                                            method='wilson')
conf_interval_banner_b = proportion_confint(count=sum(data['banner_b']),
                                            nobs=len(data),
                                            method='wilson')

In [None]:
# Report both intervals; sep='\n' puts each one on its own line.
print('interval for banner a [%f, %f]' % conf_interval_banner_a,
      'interval for banner b [%f, %f]' % conf_interval_banner_b,
      sep='\n')

interval for banner a [0.026961, 0.050582]
interval for banner b [0.040747, 0.068675]


### Как их сравнить?

## Доверительный интервал для разности долей (независимые выборки)

  | | $X_1$ | $X_2$ |
  | ------------- | ------------- | ------------- |
  | 1 | a | b |
  | 0 | c | d |
  | $\sum$ | $n_1$ | $n_2$ |
  
$$ \hat{p}_1 = \frac{a}{n_1}$$

$$ \hat{p}_2 = \frac{b}{n_2}$$


$$\text{Доверительный интервал для }p_1 - p_2\colon \;\; \hat{p}_1 - \hat{p}_2 \pm z_{1-\frac{\alpha}{2}}\sqrt{\frac{\hat{p}_1(1 - \hat{p}_1)}{n_1} + \frac{\hat{p}_2(1 - \hat{p}_2)}{n_2}}$$

In [None]:
def get_confint_by_prob(p1, p2, n1, n2, alpha=0.05):
    """Return the normal-approximation (Wald) interval for ``p1 - p2``.

    The two proportions are treated as estimates from independent samples.

    Args:
        p1: Sample proportion in the first group.
        p2: Sample proportion in the second group.
        n1: Number of observations behind ``p1``.
        n2: Number of observations behind ``p2``.
        alpha: Significance level; the interval has nominal coverage
            ``1 - alpha``.

    Returns:
        Tuple ``(left_boundary, right_boundary)``.
    """
    # Import the submodule explicitly: with only ``import scipy`` at file
    # level, ``scipy.stats`` may be unavailable unless another import has
    # already loaded it (depends on the SciPy version).
    from scipy.stats import norm

    z = norm.ppf(1 - alpha / 2.)
    diff = p1 - p2
    delta = z * np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)

    return (diff - delta, diff + delta)

def proportions_confint_diff_ind(sample1, sample2, alpha = 0.05):
    """Return the confidence interval for the difference of two shares.

    Each share is the sum of a sample divided by its length; the interval
    itself is delegated to ``get_confint_by_prob``.

    Args:
        sample1: First sample (sized, summable).
        sample2: Second sample (sized, summable).
        alpha: Significance level passed through to ``get_confint_by_prob``.

    Returns:
        Whatever ``get_confint_by_prob`` returns for the two shares.
    """
    size1, size2 = len(sample1), len(sample2)
    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2
    return get_confint_by_prob(share1, share2, size1, size2, alpha)
    

In [None]:
# Interval for share(banner_a) - share(banner_b) at the default alpha.
print("confidence interval: [%f, %f]"
      % proportions_confint_diff_ind(sample1=data['banner_a'],
                                     sample2=data['banner_b']))

confidence interval: [-0.034157, 0.002157]


In [None]:
# Same interval with the samples swapped: share(banner_b) - share(banner_a).
print("confidence interval: [%f, %f]"
      % proportions_confint_diff_ind(sample1=data['banner_b'],
                                     sample2=data['banner_a']))

confidence interval: [-0.002157, 0.034157]


In [None]:
# Group 1 is placebo, group 2 is aspirin.
n1, n2 = 11034, 11037
p1, p2 = 189 / n1, 104 / n2
# Same call spelled with literal numbers; it prints the same interval as
# the p1/p2 form two lines below.
print(get_confint_by_prob(189 / 11034, 104 / 11037, 11034, 11037))
print(p1, p2)
# Interval for p1 - p2, then for p2 - p1 (the bounds are negated and swapped).
print(get_confint_by_prob(p1, p2, n1, n2))
print(get_confint_by_prob(p2, p1, n2, n1))

(0.004687750675049439, 0.010724297276960124)
0.017128874388254486 0.009422850412249705
(0.004687750675049439, 0.010724297276960124)
(-0.010724297276960124, -0.004687750675049439)


In [None]:
# Odds p / (1 - p) for each group, followed by their ratio.
odds1 = p1 / (1 - p1)
odds2 = p2 / (1 - p2)
print(odds1, odds2, odds1 / odds2)

0.017427385892116183 0.009512485136741973 1.8320539419087138


## Доверительный интервал для разности долей (связанные выборки)

  | $X_1$ \ $X_2$ | 1 | 0 | $\sum$ |
  | ------------- | ------------- | ------------- | ------------- |
  | 1 | e | f | e + f |
  | 0 | g | h | g + h |
  | $\sum$ | e + g | f + h | n |
  
$$ \hat{p}_1 = \frac{e + f}{n}$$

$$ \hat{p}_2 = \frac{e + g}{n}$$

$$ \hat{p}_1 - \hat{p}_2 = \frac{f - g}{n}$$


$$\text{Доверительный интервал для }p_1 - p_2\colon \;\;  \frac{f - g}{n} \pm z_{1-\frac{\alpha}{2}}\sqrt{\frac{f + g}{n^2} - \frac{(f - g)^2}{n^3}}$$

In [None]:
def proportions_confint_diff_rel(sample1, sample2, alpha=0.05):
    """Return the confidence interval for ``p1 - p2`` on paired samples.

    The samples are matched element by element. Only discordant pairs
    contribute: ``f`` counts pairs ``(1, 0)`` and ``g`` counts pairs
    ``(0, 1)``. The interval is
    ``(f - g) / n +- z * sqrt((f + g) / n**2 - (f - g)**2 / n**3)``.

    Args:
        sample1: First sample of 0/1 values.
        sample2: Second sample of 0/1 values, paired with ``sample1``.
            Pairing uses ``zip``, so extra trailing elements of the longer
            sample are ignored.
        alpha: Significance level; the interval has nominal coverage
            ``1 - alpha``.

    Returns:
        Tuple ``(left_boundary, right_boundary)``.

    Raises:
        ZeroDivisionError: If there are no pairs.
    """
    # Import the submodule explicitly: with only ``import scipy`` at file
    # level, ``scipy.stats`` may be unavailable unless another import has
    # already loaded it (depends on the SciPy version).
    from scipy.stats import norm

    z = norm.ppf(1 - alpha / 2.)
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    # Compute the centre and half-width once and reuse them for both bounds.
    center = (f - g) / n
    margin = z * np.sqrt((f + g) / n**2 - (f - g)**2 / n**3)
    return (center - margin, center + margin)

In [None]:
# Paired-sample interval for share(banner_a) - share(banner_b) at the default alpha.
print("confidence interval: [%f, %f]"
      % proportions_confint_diff_rel(sample1=data['banner_a'],
                                     sample2=data['banner_b']))

confidence interval: [-0.026689, -0.005311]


Среди респондентов General Social Survey 2014 года хотя бы раз в месяц проводят вечер в баре 203 женщины и 239 мужчин; реже, чем раз в месяц, это делают 718 женщин и 515 мужчин.

In [None]:
# Group 1: women (203 at least monthly, 718 less often).
# Group 2: men (239 at least monthly, 515 less often).
n1 = 203 + 718
n2 = 239 + 515
p1 = 203 / n1
p2 = 239 / n2
print(p1, p2)
# Interval for p1 - p2, then the mirrored interval for p2 - p1.
print(get_confint_by_prob(p1, p2, n1, n2))
print(get_confint_by_prob(p2, p1, n2, n1))

0.22041259500542887 0.3169761273209549
(-0.13922183141523897, -0.053905233215813156)
(0.053905233215813156, 0.13922183141523897)


In [None]:
from scipy.stats import norm

# z-statistic for p1 - p2, using the same unpooled standard error as
# get_confint_by_prob.
z = (p1 - p2) / np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)
# Two-sided p-value. Taking the upper tail of |z| is correct for either
# sign of z; doubling norm.cdf(z) is only valid for z <= 0 and exceeds 1
# when z is positive.
2 * norm.sf(abs(z))

9.135870505976037e-06