In [1]:
from math import sqrt
from typing import Tuple

from pandas import read_csv
from scipy import stats
from sklearn import linear_model, model_selection
from statsmodels.stats.proportion import proportion_confint

### 1

Версия множителей Лагранжа.

### 2

О количестве несовпадающих пар значений признаков и объеме выборок.

### 3

In [4]:
def proportions_diff_z_stat_ind(s1, n1, s2, n2):
    """Z-statistic for the difference between two sample proportions.

    Args:
        s1: count of positive outcomes in the first sample.
        n1: size of the first sample.
        s2: count of positive outcomes in the second sample.
        n2: size of the second sample.

    Returns:
        ``(p1 - p2)`` divided by the pooled standard error, where
        ``p_i = s_i / n_i``.
    """
    share_1, share_2 = s1 / n1, s2 / n2
    # Pooled proportion over both samples; it drives the standard error.
    pooled = float(share_1 * n1 + share_2 * n2) / (n1 + n2)
    std_error = sqrt(pooled * (1 - pooled) * (1. / n1 + 1. / n2))
    return (share_1 - share_2) / std_error


def proportions_diff_z_test(z_stat, alternative='two-sided'):
    """Convert a Z-statistic into a p-value under the standard normal law.

    Args:
        z_stat: value of the Z-statistic.
        alternative: one of ``'two-sided'``, ``'less'`` or ``'greater'``.

    Returns:
        The p-value for the requested alternative.

    Raises:
        ValueError: if ``alternative`` is not one of the supported names
            (previously the function silently returned ``None``).
    """
    # The survival function is used instead of ``1 - cdf``: the subtraction
    # collapses to exactly 0.0 for large statistics, while ``sf`` stays accurate.
    if alternative == 'two-sided':
        return 2 * stats.norm.sf(abs(z_stat))

    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return stats.norm.sf(z_stat)

    raise ValueError(
        "alternative not recognized: should be 'two-sided', 'less' or 'greater'"
    )


# Left one-sided alternative. Original note: the first two arguments belong to the
# experimental group, and by the design of the experiment its members cannot yawn less.
# NOTE(review): confirm which of the pairs (4, 16) and (10, 34) is the experimental
# group — the direction of the 'less' alternative depends on the argument order.
round(
    proportions_diff_z_test(
        proportions_diff_z_stat_ind(4, 16, 10, 34),
        alternative='less',
    ),
    ndigits=4,
)

0.37290000000000001

### 4

In [19]:
def proportions_diff_z_stat_rel(f: float, g: float, n: int) -> float:
    """Z-statistic for the difference of proportions in paired samples.

    Args:
        f: number of pairs where only the first outcome is positive.
        g: number of pairs where only the second outcome is positive.
        n: total number of pairs.

    Returns:
        ``(f - g) / sqrt(f + g - (f - g)**2 / n)``.
    """
    discordance = f - g
    spread = f + g - discordance ** 2 / n
    return discordance / sqrt(spread)

In [21]:
# Load the tab-separated banknote data and hold out 50 rows for testing.
banknotes = read_csv('banknotes.txt', sep='\t')

X = banknotes[['X1', 'X2', 'X3', 'X4', 'X5', 'X6']]
y = banknotes['real']
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=50, random_state=1
)

# Per-object error indicators of two logistic regressions fitted on disjoint
# feature triples; True marks a misclassified test object.
errors_1 = (
    linear_model.LogisticRegression()
    .fit(X_train[['X1', 'X2', 'X3']], y_train)
    .predict(X_test[['X1', 'X2', 'X3']])
    != y_test
)
errors_2 = (
    linear_model.LogisticRegression()
    .fit(X_train[['X4', 'X5', 'X6']], y_train)
    .predict(X_test[['X4', 'X5', 'X6']])
    != y_test
)

# Discordant pairs: f — only the first model errs, g — only the second one does.
f = (errors_1 & ~errors_2).sum()
g = (~errors_1 & errors_2).sum()

# Two-sided p-value of the paired-proportions Z-test over the 50 test objects.
p_value = proportions_diff_z_test(proportions_diff_z_stat_rel(f, g, 50))
f'{p_value:E}'

'3.296938E-03'

### 5

In [26]:
def proportions_diff_confint_rel(f: float, g: float, n: int, alpha=0.05) -> Tuple[float, float]:
    """Normal-approximation confidence interval for a paired difference of proportions.

    Args:
        f: number of pairs where only the first outcome is positive.
        g: number of pairs where only the second outcome is positive.
        n: total number of pairs.
        alpha: significance level; the interval has confidence ``1 - alpha``.

    Returns:
        ``(lower, upper)`` bounds centred at ``(f - g) / n``.

    Raises:
        ValueError: if ``alpha`` is not strictly between 0 and 1 (the normal
            quantile would otherwise be NaN or infinite and propagate silently).
    """
    if not 0 < alpha < 1:
        raise ValueError('alpha must lie strictly between 0 and 1')

    z = stats.norm.ppf(1 - alpha / 2.)
    mu = (f - g) / n
    # Half-width: normal quantile times the standard error of the difference.
    half_width = z * sqrt((f + g) / n ** 2 - (f - g) ** 2 / (n ** 3))
    return (mu - half_width, mu + half_width)


left, right = proportions_diff_confint_rel(f, g, 50)
print(left, right)
round(left, 4)

0.0599452062796 0.30005479372


0.059900000000000002

### 6

In [27]:
# Plain one-sample Z-test (the variance is known): upper-tail p-value of |z|.
sample_mean, null_mean = 541.4, 525
sigma, sample_size = 100, 100
z = (sample_mean - null_mean) / (sigma / sqrt(sample_size))
round(1.0 - stats.norm.cdf(abs(z)), ndigits=4)

0.050500000000000003

### 7

In [28]:
# Z-statistic of the form (mean - reference) / (scale / sqrt(size)), followed by
# the upper-tail standard-normal probability of |z|, rounded to 4 digits.
# NOTE(review): presumably sigma = 100 and sample size = 100 — confirm against the task.
sample_mean, null_mean = 541.5, 525
sigma, sample_size = 100, 100
z = (sample_mean - null_mean) / (sigma / sqrt(sample_size))
round(1.0 - stats.norm.cdf(abs(z)), ndigits=4)

0.049500000000000002