In [71]:
import pandas as pd
import numpy as np
import scipy

from math import sqrt
from scipy.stats import chi2_contingency

def matthews_corr(a, b, c, d):
    return (a * d - b * c) / sqrt((a + b) * (a + c) * (b + d) * (c + d))


def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):    
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    
    return (left_boundary, right_boundary)

def proportions_diff_z_stat_ind(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - scipy.stats.norm.cdf(z_stat)

In [68]:
data = pd.read_table('illiteracy.txt')
data.head()

Unnamed: 0,Country,Illit,Births
0,Albania,20.5,1.78
1,Algeria,39.1,2.44
2,Bahrain,15.0,2.34
3,Belize,5.9,2.97
4,Benin,73.5,5.6


In [69]:
data.corr()

Unnamed: 0,Illit,Births
Illit,1.0,0.768663
Births,0.768663,1.0


In [5]:
data.corr(method='spearman')

Unnamed: 0,Illit,Births
Illit,1.0,0.752962
Births,0.752962,1.0


## Test

$\textbf{№1}$

In [8]:
data = pd.read_table('water.txt')
data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [9]:
data.corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.654849
hardness,-0.654849,1.0


$\textbf{№2}$

In [10]:
data.corr(method='spearman')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.631665
hardness,-0.631665,1.0


$\textbf{№3}$

In [16]:
data_south = data[data.location == 'South']
data_north = data[data.location == 'North']
print data_south.corr()
print data_north.corr()

           mortality  hardness
mortality   1.000000 -0.602153
hardness   -0.602153  1.000000
           mortality  hardness
mortality   1.000000 -0.368598
hardness   -0.368598  1.000000


In [88]:
np.around(-0.3686, 5)

-0.36858999999999997

$\textbf{№4}$

In [62]:
print np.around(matthews_corr(515., 718, 239, 203), 3)

-0.109


$\textbf{№5}$

In [89]:
chi2_contingency([[515., 718], 
                  [239, 203]])[1]

1.0558987006638725e-05

$\textbf{№6}$

In [96]:
m_part = 239 / 515.
fem_part = 203 / 718.
print m_part - fem_part
man = [1 if i < 239 else 0 for i in range(515 + 239)]
fem = [1 if i < 203 else 0 for i in range(718 + 203)]
proportions_diff_confint_ind(man, fem)

0.181347864889


(0.053905233215813156, 0.13922183141523897)

$\textbf{№7}$

In [97]:
proportions_diff_z_test(proportions_diff_z_stat_ind(man, fem))

8.1534530895766011e-06

$\textbf{№8}$

In [80]:
chi = chi2_contingency([[197, 111, 33], 
                  [382, 685, 331], 
                  [110, 342, 333]])[0]
chi

293.68311039689746

$\textbf{№9}$

In [77]:
chi2_contingency([[197, 111, 33], 
                  [382, 685, 331], 
                  [110, 342, 333]])[1]

2.4964299580093467e-62

$\textbf{№10}$

In [85]:
n = np.sum([[197, 111, 33], 
                  [382, 685, 331], 
                  [110, 342, 333]])
sqrt(chi / (n * 2))

0.2412013934500338