In [5]:
import numpy as np
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic
import pandas as pd

In [2]:
!head -n 5 water.txt

location	town	mortality	hardness
South	Bath	1247	105
North	Birkenhead	1668	17
South	Birmingham	1466	5
North	Blackburn	1800	14


In [34]:
data = []
with open("water.txt", "r") as f:
    for line in f:
        line = line.split()
        if len(line) == 5:
            line = [line[0], " ".join([line[1], line[2]]), line[3], line[4]]
        data.append(line)

data = pd.DataFrame(data[1:], columns=data[0])
data.mortality = data.mortality.astype(float)
data.hardness = data.hardness.astype(float)
data.head(3)

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247.0,105.0
1,North,Birkenhead,1668.0,17.0
2,South,Birmingham,1466.0,5.0


# 95% доверительные интервалы для годовой смертности для северных и южных городов

In [55]:
select = data.mortality
std = select.std(ddof=1) / np.sqrt(len(select))
conf_int = _tconfint_generic(select.mean(), std, len(select) - 1, 0.05, 'two-sided')
print("95% conf. int. for mean annual mortality:", conf_int)
print(round(conf_int[0], 4))

95% conf. int. for mean annual mortality: (1476.0833413552848, 1572.2117406119285)
1476.0833


In [64]:
select = data[data.location == 'South'].mortality
std = select.std(ddof=1) / np.sqrt(len(select))
conf_int = _tconfint_generic(select.mean(), std, len(select) - 1, 0.05, 'two-sided')
print("95% conf. int. for mean annual mortality (only south cities):", conf_int)
print(round(conf_int[1], 4))

95% conf. int. for mean annual mortality (only south cities): (1320.1517462936238, 1433.463638321761)
1433.4636


In [58]:
select = data[data.location == 'North'].mortality
std = select.std(ddof=1) / np.sqrt(len(select))
conf_int = _tconfint_generic(select.mean(), std, len(select) - 1, 0.05, 'two-sided')
print("95% conf. int. for mean annual mortality (only North cities):", conf_int)

95% conf. int. for mean annual mortality (only North cities): (1586.5605251961385, 1680.6394748038613)


# 95% доверительные интервалы для жесткости воды для северных и южных городов

In [60]:
select = data[data.location == 'North'].hardness
std = select.std(ddof=1) / np.sqrt(len(select))
conf_int = _tconfint_generic(select.mean(), std, len(select) - 1, 0.05, 'two-sided')
print("95% conf. int. for hardness of water (only North cities):", conf_int)

95% conf. int. for hardness of water (only North cities): (21.42248728572426, 39.37751271427574)


In [61]:
select = data[data.location == 'South'].hardness
std = select.std(ddof=1) / np.sqrt(len(select))
conf_int = _tconfint_generic(select.mean(), std, len(select) - 1, 0.05, 'two-sided')
print("95% conf. int. for hardness of water (only south cities):", conf_int)

95% conf. int. for hardness of water (only south cities): (53.467198692036106, 86.07126284642544)


#### Вспомним формулу доверительного интервала для среднего нормально распределённой случайной величины с дисперсией σ2: При σ=1 какой нужен объём выборки, чтобы на уровне доверия 95% оценить среднее с точностью ±0.1?


In [65]:
from scipy import stats
np.ceil((stats.norm.ppf(1-0.05/2) / 0.01)**2 * 0.02 * 0.98)

753.0

In [63]:
stats.norm.ppf(1-0.05/2)

1.959963984540054