In [55]:
import numpy as np
import pandas as pd

import scipy
import scipy.stats
from statsmodels.stats.weightstats import *
from math import sqrt

def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference of proportions of two independent samples.

    Each sample's proportion is computed as ``sum(sample) / len(sample)``, so
    the samples are presumably sequences of 0/1 indicators.

    Parameters
    ----------
    sample1, sample2 : sized iterables of numbers
        The two samples; they may have different lengths.

    Returns
    -------
    float
        (p1 - p2) divided by the pooled standard error
        sqrt(P * (1 - P) * (1/n1 + 1/n2)), where P is the pooled proportion.
    """
    size_1, size_2 = len(sample1), len(sample2)

    share_1 = float(sum(sample1)) / size_1
    share_2 = float(sum(sample2)) / size_2
    # Pooled proportion: total of both samples over the combined size.
    pooled = float(share_1 * size_1 + share_2 * size_2) / (size_1 + size_2)

    std_error = np.sqrt(pooled * (1 - pooled) * (1. / size_1 + 1. / size_2))
    return (share_1 - share_2) / std_error

def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of proportions of two related samples.

    The samples are paired element by element (``zip`` stops at the shorter
    one). Only discordant pairs contribute:

    * f -- number of pairs equal to (1, 0)
    * g -- number of pairs equal to (0, 1)

    Parameters
    ----------
    sample1, sample2 : iterables
        Paired observations, compared against the values 1 and 0.

    Returns
    -------
    float
        (f - g) / sqrt(f + g - (f - g)**2 / n), where n is the number of pairs.
    """
    # zip() is a one-shot iterator on Python 3: it has no len() and would be
    # exhausted after the first pass, so materialise the pairs once.
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n)

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """P-value of a z-statistic under the standard normal distribution.

    Parameters
    ----------
    z_stat : float or array-like
        Value of the z-statistic.
    alternative : {'two-sided', 'less', 'greater'}
        Which tail(s) of the distribution to use.

    Returns
    -------
    float
        Two-tailed, lower-tail or upper-tail probability respectively.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    normal = scipy.stats.norm

    # The survival function sf(z) equals 1 - cdf(z) but is evaluated directly,
    # so upper-tail p-values for large z do not collapse to exactly 0.0.
    if alternative == 'two-sided':
        return 2 * normal.sf(np.abs(z_stat))
    elif alternative == 'less':
        return normal.cdf(z_stat)
    else:  # 'greater'
        return normal.sf(z_stat)
    
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Confidence interval for the difference of proportions of related samples.

    The samples are paired element by element (``zip`` stops at the shorter
    one); f counts (1, 0) pairs and g counts (0, 1) pairs.

    Parameters
    ----------
    sample1, sample2 : iterables
        Paired observations, compared against the values 1 and 0.
    alpha : float
        Significance level; the normal quantile at 1 - alpha/2 is used.

    Returns
    -------
    tuple of float
        (left_boundary, right_boundary), i.e.
        (f - g)/n -/+ z * sqrt((f + g)/n**2 - (f - g)**2/n**3).
    """
    z = scipy.stats.norm.ppf(1 - alpha / 2.)

    # zip() is a one-shot iterator on Python 3: it has no len() and would be
    # exhausted after the first pass, so materialise the pairs once.
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    center = float(f - g) / n
    half_width = z * np.sqrt(float(f + g) / n**2 - float((f - g)**2) / n**3)
    return (center - half_width, center + half_width)

$\textbf{№1}$
Lagr.

$\textbf{№2}$ V


$\textbf{№3}$

In [67]:
# exp: 10 ones followed by 24 zeros (34 values); test: 4 ones followed by 12 zeros (16 values).
exp = [1] * 10 + [0] * 24
test = [1] * 4 + [0] * 12
z = proportions_diff_z_stat_ind(exp, test)
# One-sided p-value for the alternative that the proportion in `exp` is greater.
proportions_diff_z_test(z, alternative='greater')

0.37293045872523534

In [61]:
# Same one-sided test with the samples passed in the opposite order
# (4 of 16 first, then 10 of 34).
proportions_diff_z_test(
    proportions_diff_z_stat_ind([1] * 4 + [0] * 12, [1] * 10 + [0] * 24),
    alternative='greater')

0.62706954127476466

$\textbf{№4}$

In [10]:
# Load the tab-separated banknotes table and preview its first rows.
data = pd.read_csv('banknotes.txt', sep='\t')
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [15]:
# sklearn.cross_validation was removed from scikit-learn; the same function
# is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Features are every column except the `real` label; `real` is the target.
X, y = data.drop(columns='real'), data['real']
# An integer test_size holds out that many rows (50) for the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=50)

50


In [30]:
from sklearn.linear_model import LogisticRegression

# Two logistic regressions trained on disjoint feature triples.
# fit() returns the estimator itself, so each model can be built and fitted
# in one expression; ending on an assignment keeps the cell output empty.
linear_model_1 = LogisticRegression().fit(X_train[['X1', 'X2', 'X3']], y_train)
linear_model_2 = LogisticRegression().fit(X_train[['X4', 'X5', 'X6']], y_train)

In [33]:
from sklearn.metrics import accuracy_score

# Each model predicts from the same feature triple it was trained on.
lm1_predict = linear_model_1.predict(X_test[['X1', 'X2', 'X3']])
lm2_predict = linear_model_2.predict(X_test[['X4', 'X5', 'X6']])

# print(...) with a single argument behaves the same on Python 2 and 3.
print(accuracy_score(y_test, lm1_predict))
print(accuracy_score(y_test, lm2_predict))

0.8
0.98


In [40]:
# Per-object indicators for each model on the test set.
# NOTE: despite the `_err` suffix, 1 marks a CORRECT prediction and 0 a wrong one.
lm1_err = [1 if predicted == actual else 0 for predicted, actual in zip(lm1_predict, y_test)]
lm2_err = [1 if predicted == actual else 0 for predicted, actual in zip(lm2_predict, y_test)]

In [49]:
# Two-sided p-value for the related-samples z-test: both indicator vectors
# describe the same test objects.
proportions_diff_z_test(
    proportions_diff_z_stat_rel(lm1_err, lm2_err),
    alternative='two-sided')

0.0032969384555543435

$\textbf{№5}$

In [52]:
# Confidence interval (alpha = 0.05) for the difference between the two
# models' indicator proportions on the same test objects.
proportions_diff_confint_rel(lm1_err, lm2_err, alpha=0.05)

(-0.30005479372038568, -0.059945206279614305)

$\textbf{№6}$

In [57]:
def criteria(mean, teta, n, u):
    """Return the standardised statistic (mean - u) / (teta / sqrt(n)).

    Presumably `mean` is a sample mean, `teta` a standard deviation, `n` the
    sample size and `u` the reference mean -- confirm against the task text.
    """
    return (mean - u) / (teta / sqrt(n))

cr_st = criteria(525, 100, 100, 541.4)
# Lower-tail normal probability of the statistic, rounded to 4 decimals.
# scipy.stats is referenced explicitly (as in the helper functions above)
# instead of the `stats` name leaked by the statsmodels wildcard import.
print(np.around(scipy.stats.norm.cdf(cr_st), 4))

0.0505


$\textbf{№7}$

In [58]:
# `criteria` is already defined in the cell for task 6; the byte-identical
# redefinition that used to live here was redundant and has been dropped.
cr_st = criteria(525, 100, 100, 541.5)
# Lower-tail normal probability of the statistic, rounded to 4 decimals.
print(np.around(scipy.stats.norm.cdf(cr_st), 4))

0.0495
