# Bayesian ML
- http://fastml.com/bayesian-machine-learning/
- http://www.columbia.edu/~jwp2128/Teaching/E6720/BayesianModelsMachineLearning2016.pdf
- https://towardsdatascience.com/what-is-bayesian-statistics-used-for-37b91c2c257c
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html

In [9]:
import numpy as np
import pandas as pd
from scipy import stats

### T-Test
- generate random data
- get unbiased variance (N-1) with ddof=1
- get pooled std (https://www.statisticshowto.datasciencecentral.com/pooled-standard-deviation/)
- _

In [29]:
def t_test(a,b):
    from scipy.stats import t
    var_a = a.var(ddof=1)
    var_b = b.var(ddof=1)
    std_pooled = np.sqrt((var_a + var_b)/2)
    _t = (a.mean() - b.mean()) / (std_pooled * np.sqrt(2.0/N))
    df = 2*N-2
    p = 1 - t.cdf(_t, df=df)
    return _t,p

In [3]:
N = 10
a = np.random.randn(N) + 2
b = np.random.randn(N)


t,p = t_test(a,b)
print(t)
print("t:{:.2f}\tp:{:.6f}".format(t, 2*p))
#t2, p2 = stats.ttest_ind(a,b)
#print("t:{:.2f}\tp:{:.6f}".format(t2,p2))


2.724793685278926
t:2.72	p:0.013899


In [6]:
ads_df = pd.read_csv('advertisement_clicks.csv')
ads_df.head()

Unnamed: 0,advertisement_id,action
0,B,1
1,B,1
2,A,0
3,B,0
4,A,1


In [8]:
A_df = ads_df[ads_df['advertisement_id'] == 'A']
B_df = ads_df[ads_df['advertisement_id'] == 'B']
A = A_df['action']
B = B_df['action']

In [13]:
a_mu = A.mean()
b_mu = B.mean()

a_var = A.var()
b_var = B.var()

In [14]:
print(a_mu, b_mu)
print(a_var,b_var)

0.304 0.372
0.21179579579579275 0.23384984984985163


In [15]:
t, p = stats.ttest_ind(A,B)
print("T:{}\tp:{}".format(t,p))

T:-3.2211732138019786	p:0.0012971905467125246


In [16]:
t,p = stats.ttest_ind(A,B, equal_var=False)
print("welches t:{}\tp:{}".format(t,p))

welches t:-3.2211732138019786	p:0.0012972410374001632


In [30]:
t,p = t_test(A,B)
print(t,p)

18
-0.3221173213801983 0.7473975996951103


In [18]:
a = A
b = B

In [26]:
N1 = len(a)
s1_sq = a.var()
N2 = len(b)
s2_sq = b.var()
t = (a.mean() - b.mean()) / np.sqrt(s1_sq / N1 + s2_sq / N2)

nu1 = N1 - 1
nu2 = N2 - 1
df = (s1_sq / N1 + s2_sq / N2)**2 / ( (s1_sq*s1_sq) / (N1*N1 * nu1) + (s2_sq*s2_sq) / (N2*N2 * nu2) )
print(df)
p = (1 - stats.t.cdf(np.abs(t), df=df))*2
print("Manual Welch t-test")
print("t:\t", t, "p:\t", p)

1993.1187538343745
Manual Welch t-test
t:	 -3.221173213801983 p:	 0.001297241037400143
