In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import *

import scipy.stats as stats
import statsmodels.stats.weightstats as wstats
%matplotlib inline

In [None]:
!cat experiment_stats.csv | head

In [None]:
# Load the experiment log; later cells rely on its `variant` and `ord_value` columns.
data = pd.read_csv(filepath_or_buffer='experiment_stats.csv')
data.head(n=5)

In [None]:
# Column-wise averages as a first sanity check of the loaded frame.
data.mean(axis=0)

In [None]:
# Histogram of positive order values below 30000.
plt.figure(figsize=(15, 8))
orders_in_range = data.query('30000 > ord_value > 0')
orders_in_range['ord_value'].hist(bins=200)
# Alternative view restricted to a single variant:
# data.query('variant == 0 & 50000 > ord_value > 0').ord_value.hist(bins=100)

In [None]:
# For convenience, add a binary "conversion" column: 1 when the session has a
# positive order value, 0 otherwise (a missing value compares False, i.e. 0).
# A vectorised comparison replaces the per-row Python lambda; the resulting
# values and integer dtype are the same.
data['conversion'] = (data['ord_value'] > 0).astype('int64')

In [None]:
# Column means computed separately for each variant.
data.groupby(by='variant').mean()

In [None]:
# Number of non-null `conversion` values (i.e. observations) in each variant.
data.groupby('variant')['conversion'].count()

In [None]:
# Cumulative (expanding-window) conversion rate per variant, in percent.
# The column is selected *before* `expanding`, so the running mean is computed
# for `conversion` only rather than for every column of the frame.
# With min_periods=1500 the first 1499 points of each variant are NaN;
# only the first 80000 rows of each variant are kept for plotting.
c0 = 100 * data.loc[data['variant'] == 0, 'conversion'].expanding(min_periods=1500).mean().iloc[:80000]
c1 = 100 * data.loc[data['variant'] == 1, 'conversion'].expanding(min_periods=1500).mean().iloc[:80000]

# create dashboard (explicit Axes API; the x axis is the row index of `data`)
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(c0, label='variant 0')
ax.plot(c1, label='variant 1')
ax.set_title('Cumulative conversion rate', fontdict={'size': 16})

ax.set_xlabel('sessions')
ax.set_ylabel('conversion')
ax.grid(True)
ax.set_xlim(0, 150000)
ax.set_ylim(3.7, 4.8)
# Label the two curves; the trailing ';' keeps the return value out of the cell output.
ax.legend();

## Two Samples z-test for Proportions

## $z = \frac{\hat{p_1}-\hat{p_2}}{\sqrt{\hat{p} (1-\hat{p}) (\frac{1}{n_1} + \frac{1}{n_2})}} $
where

### $\hat{p_1} = \frac{x_1}{n_1}, \hat{p_2} = \frac{x_2}{n_2} $
### $\hat{p} = \frac{x_1 + x_2}{n_1 + n_2}$
$x_1, x_2$ — number of successes in groups 1 and 2

$n_1, n_2$ — number of observations in groups 1 and 2

In [None]:
# implementation from scratch
def ztest_proportion_two_samples(x1, n1, x2, n2, one_sided=False):
    """Two-sample z-test for the equality of two proportions (pooled variance).

    Parameters
    ----------
    x1, n1 : number
        Number of successes and number of observations in group 1.
    x2, n2 : number
        Number of successes and number of observations in group 2.
    one_sided : bool, default False
        If False, the two-sided p-value ``2 * P(Z > |z|)`` is returned.
        If True, the single-tail probability ``P(Z > |z|)`` is returned
        (the tail is taken in the direction of the observed difference).

    Returns
    -------
    float
        The p-value. The inputs, z-statistic and p-value are also printed.
    """
    p1 = x1 / n1
    p2 = x2 / n2

    # Pooled proportion under H0: p1 == p2.
    p_pooled = (x1 + x2) / (n1 + n2)
    std_err = np.sqrt(p_pooled * (1 - p_pooled) * (1 / n1 + 1 / n2))

    z = (p1 - p2) / std_err
    # sf(x) == 1 - cdf(x), but without the cancellation error for large |z|.
    p_value = stats.norm.sf(abs(z))
    if not one_sided:
        p_value *= 2

    # Echo the arguments actually passed in, not same-named notebook globals.
    print(x1, n1, x2, n2)
    print('z-stat = {z}'.format(z=z))
    print('p-value = {p}'.format(p=p_value))

    return p_value

In [None]:
# Run the test on the first k rows of the log only.
k = 100000
d = data.iloc[:k]

in_variant_0 = d['variant'] == 0
in_variant_1 = d['variant'] == 1

# x* = number of conversions, n* = number of observations
# (suffix a -> variant 0, suffix b -> variant 1).
xa = d.loc[in_variant_0, 'conversion'].sum()
na = d.loc[in_variant_0, 'conversion'].count()

xb = d.loc[in_variant_1, 'conversion'].sum()
nb = d.loc[in_variant_1, 'conversion'].count()

p = ztest_proportion_two_samples(x1=xa, n1=na, x2=xb, n2=nb, one_sided=False)

In [None]:
# Same test via statsmodels, as a cross-check of the hand-written version.
from statsmodels.stats.proportion import proportions_ztest

count = np.asarray([xa, xb])  # successes per group
nobs = np.asarray([na, nb])   # observations per group
z, p = proportions_ztest(count=count, nobs=nobs, value=0, alternative='two-sided')
print(' z-stat = {z} \n p-value = {p}'.format(z=z, p=p))

## $CI = (\hat{p_2} - \hat{p_1}) \pm z_{critical} \cdot SE $

$SE = \sqrt{ \frac{\hat{p_1}(1-\hat{p_1})}{n_1} + \frac{\hat{p_2}(1-\hat{p_2})}{n_2} }  $

where

$\hat{p_1}, \hat{p_2}$ — sample proportions in groups 1 and 2


$n_1, n_2$ — number of observations in groups 1 and 2

In [None]:
def compute_standard_error_prop_two_samples(x1, n1, x2, n2, alpha=0.05):
    """Unpooled standard error of the difference between two sample proportions.

    x1, n1 -- successes and observations in group 1
    x2, n2 -- successes and observations in group 2
    alpha  -- accepted for signature symmetry; not used in the computation

    Returns sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n2) with p_i = x_i / n_i.
    """
    variance_terms = []
    for successes, size in ((x1, n1), (x2, n2)):
        share = successes / size
        variance_terms.append(share * (1 - share) / size)
    return np.sqrt(variance_terms[0] + variance_terms[1])
    
def zconf_interval_two_samples(x1, n1, x2, n2, alpha=0.05):
    """Normal-approximation confidence interval for the difference p2 - p1.

    Parameters
    ----------
    x1, n1 : number
        Number of successes and number of observations in group 1.
    x2, n2 : number
        Number of successes and number of observations in group 2.
    alpha : float, default 0.05
        Significance level; the interval has confidence level ``1 - alpha``.

    Returns
    -------
    (ci_low, ci_upp) : tuple of float
        Bounds of the interval for ``p2 - p1`` (group 2 minus group 1),
        expressed as proportions. The interval is also printed in percent.
    """
    p1 = x1 / n1
    p2 = x2 / n2
    se = compute_standard_error_prop_two_samples(x1, n1, x2, n2)
    # Two-sided critical value: alpha / 2 in each tail.
    z_critical = stats.norm.ppf(1 - 0.5 * alpha)

    diff = p2 - p1
    ci_low = diff - z_critical * se
    ci_upp = diff + z_critical * se

    # The label is derived from alpha rather than hard-coded as "95%".
    print(' {0:g}% Confidence Interval = ( {1:.2f}% , {2:.2f}% )'.format(
        100 * (1 - alpha), 100 * ci_low, 100 * ci_upp))
    return ci_low, ci_upp

In [None]:
# Confidence interval for the conversion difference (second group minus first group).
ci_low, ci_upp = zconf_interval_two_samples(x1=xa, n1=na, x2=xb, n2=nb, alpha=0.05)

In [None]:
d = data
# Optional: drop extreme orders before averaging.
# d = data[data.ord_value < 100000]

# Cumulative (expanding-window) mean of `ord_value` per variant.
# The column is selected *before* `expanding`, so the running mean is computed
# for `ord_value` only rather than for every column of the frame.
# With min_periods=15000 the first 14999 points of each variant are NaN.
c0 = d.loc[d['variant'] == 0, 'ord_value'].expanding(min_periods=15000).mean().iloc[:1000000]
c1 = d.loc[d['variant'] == 1, 'ord_value'].expanding(min_periods=15000).mean().iloc[:1000000]

# create dashboard (explicit Axes API; the x axis is the row index of `d`)
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(c0, label='variant 0')
ax.plot(c1, label='variant 1')
ax.set_title('AOV', fontdict={'size': 16})

ax.set_xlabel('sessions')
ax.set_ylabel('average ordered value')
ax.grid(True)
# Label the two curves; the trailing ';' keeps the return value out of the cell output.
ax.legend();

In [None]:
# Rows with the largest order values, to inspect the extreme tail.
data.sort_values('ord_value', ascending=False).head(5)

In [None]:
# Mann-Whitney U test from SciPy; the following cell applies it to the
# per-variant order values.
from scipy.stats import mannwhitneyu
# Print the function's docstring for reference.
print(mannwhitneyu.__doc__)

In [None]:
d = data

# Order values per row: variant 1 is treated as the test group, variant 0 as control.
test = d.loc[d['variant'] == 1, 'ord_value']
control = d.loc[d['variant'] == 0, 'ord_value']

# One-sided test (alternative: test values tend to be greater than control); show the p-value.
mannwhitneyu(test, control, alternative='greater').pvalue