**Корректность проверена на Python 3.7:**
+ pandas 0.23.0
+ numpy 1.14.5
+ sklearn 0.19.1

# Доверительные интервалы для оценки среднего

In [5]:
from sklearn import model_selection, datasets, linear_model, metrics

import numpy as np
import pandas as pd

In [2]:
import sklearn
print(np.__version__)
print(sklearn.__version__)

1.16.4
0.21.2


In [3]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Загрузка данных

In [14]:
data = pd.read_csv('water.txt', delimiter='\t')
data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


### Оценка среднего

In [27]:
sgd_auc_scores = model_selection.cross_val_score(linear_model.SGDClassifier(max_iter = 1000), 
                                 blobs[0], blobs[1], scoring = 'roc_auc', cv = 20)

In [14]:
ridge_auc_scores = model_selection.cross_val_score(linear_model.RidgeClassifier(), 
                                 blobs[0], blobs[1], scoring = 'roc_auc',  
                                 cv = 20)

### Расчет доверительных интервалов

In [74]:
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic

def calculate_interval(array, n=0):
    mean = array.mean()
    std = array.std(ddof=1)/sqrt(len(array))
    intervals =  _tconfint_generic(mean, std,
                                   len(data['mortality']) - 1,
                                   0.05, 'two-sided')
    print(intervals)
    print(round(intervals[n],4))
    
    
print('Total')
calculate_interval(data.mortality, 0)

print('South')
calculate_interval(data[data.location=='South'].mortality, 1)

print('North')
calculate_interval(data[data.location=='North'].mortality, 0)

Total
(1476.0833413552848, 1572.2117406119285)
1476.0833
South
(1321.7814024572633, 1431.8339821581214)
1431.834
North
(1587.2999812165915, 1679.9000187834083)
1587.3


61


In [40]:
calculate_interval(data.hardness[data.location=='South'])

calculate_interval(data.hardness[data.location=='North'])

69.76923076923077 7.9153808776248935
(53.93611164687147, 85.60234989159008)
85.6023
30.4 4.417535721274871
(21.563612922286836, 39.23638707771316)
39.2364


### Интервальная оценка среднего 

In [16]:
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic

In [17]:
sgd_mean = sgd_auc_scores.mean()
ridge_mean = ridge_auc_scores.mean()

#### z-интервал

Допустим, нам откуда-то известно, что дисперсия auc_scores $\sigma^2=0.25$. Построим доверительные интервалы для средних вида $$\bar{X}_n \pm z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}}$$

In [80]:
print("sgd model mean auc 95%% confidence interval", _zconfint_generic(100, 
                                                                  sqrt(1/385), 
                                                                  0.05, 'two-sided'))

sgd model mean auc 95%% confidence interval (99.90011098599602, 100.09988901400398)


#### t-интервал

Вместо гипотетической теоретической дисперсии $\sigma^2$, которую мы на самом деле в данном случае не знаем, используем выборочные дисперсии, и построим доверительные интервалы вида $$\bar{X}_n \pm t_{1-\frac{\alpha}{2}} \frac{S}{\sqrt{n}}$$

In [19]:
type(sgd_auc_scores)

numpy.ndarray

In [20]:
sgd_mean_std = sgd_auc_scores.std(ddof=1)/sqrt(len(sgd_auc_scores))
ridge_mean_std = ridge_auc_scores.std(ddof=1)/sqrt(len(ridge_auc_scores))

In [21]:
print("sgd model mean auc 95%% confidence interval", _tconfint_generic(sgd_mean, sgd_mean_std,
                                                                       len(sgd_auc_scores) - 1,
                                                                       0.05, 'two-sided'))

print("ridge model mean auc 95%% confidence interval", _tconfint_generic(ridge_mean, ridge_mean_std,
                                                                         len(sgd_auc_scores) - 1,
                                                                         0.05, 'two-sided'))

sgd model mean auc 95%% confidence interval (0.8988783333963276, 0.9652479421138765)
ridge model mean auc 95%% confidence interval (0.9228783259302369, 0.9731038169269057)


In [93]:
import scipy
# q 1
print('1 ', round(scipy.stats.norm.ppf(1-(1-0.997)/2), 4))

# q 5
p1 = 189/11034
p2 = 104/11037
print('5 ', round((p1 - p2), 4))

# q 6
def proportions_confint_diff_ind(success1, number1, success2, number2, alpha = 0.05):    
    z = scipy.stats.norm.ppf(1 - alpha / 2.)   
    p1 = success1 / number1
    p2 = success2 / number2
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ number1 + p2 * (1 - p2)/ number2)
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ number1 + p2 * (1 - p2)/ number2)
    
    return (round(left_boundary, 4), round(right_boundary, 4))

print('6 ', proportions_confint_diff_ind(189, 11034, 104, 11037))

# q 7
odd1 = 189/(11034-189)
odd2 = 104/(11037-104)
print('7 ', round(odd1/odd2, 4))

1  2.9677
5  0.0077
6  (0.0047, 0.0107)
7  1.8321
