## Extra Credit: [Noise in GMM-EM]
Modify your GMM-EM routine by sampling and **injecting Gaussian noise** into the old faithful data at each iteration. **Scale the noise** to a fraction of the standard deviation in each dimension. And let the **noise standard deviation decay** at each iteration (e.g. inversely proportional to the square of the iteration counter).
Compare **the average convergence time** of the GMM-EM with and without noise. **Plot the average convergence time** for different initial noise standard deviations.


> - 收敛速度并没有加快
- 如何推到迭代公式
- 如何inject noise，限制条件怎么使用
   

 $$
 z_i = y_i + n_i\\
 n_i[n_i - 2(\mu _{j_i} - y_i)] \leq 0\\
 \alpha _j(t+1) = \frac{1}{N}\sum_{i=1}^{N}p_z(j|y_i,\Theta (t))\\
 \mu _j(t+1) = \frac{\sum_{i=1}^{N} p_z(j|y_i, \Theta(t)) z_i}{\sum _{i=1}^{N} p_z (j|y_i, \Theta (t))}\\
 \Sigma _j (t+1) = \frac{\sum _{i=1}^{N} p_z (j|y_i,\Theta (t))(z_i - \mu _j(t)) (z_i - \mu _j (t))^T}{\sum _{i=1}^{N} p_z (j|y_i, \Theta (t))}
 $$

In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
from scipy.stats import multivariate_normal
from sklearn.cluster import KMeans

In [2]:
# import the data from data.txt.
data = read_csv("data.txt", delim_whitespace=True, skipinitialspace=True)
xy = np.array([data['eruptions'],data['waiting']]).T

In [3]:
def noise_gen(var, n, mean1, mean2, xy):
    #nx,ny = np.random.multivariate_normal((0,0), cov , n).T
    nx = np.random.normal(0,var[0],n)
    ny = np.random.normal(0,var[1],n)
    is_ok_x_1 = nx*(nx-2*(mean1[0] - xy[:,0])) > 0
    is_ok_y_1 = ny*(ny-2*(mean1[1] - xy[:,1])) > 0
    is_ok_x_2 = nx*(nx-2*(mean2[0] - xy[:,0])) > 0
    is_ok_y_2 = ny*(ny-2*(mean2[1] - xy[:,1])) > 0
    is_ok_1 = np.logical_or(is_ok_x_1,is_ok_y_1)
    is_ok_2 = np.logical_or(is_ok_x_2,is_ok_y_2)
    is_ok = np.logical_or(is_ok_2,is_ok_2)
    #nx_t,ny_t = np.random.multivariate_normal((0,0), cov , len(xy)).T
    nx_t = np.random.normal(0,var[0],n)
    ny_t = np.random.normal(0,var[1],n)
    #print 'noise useless rate:',sum(is_ok)/float(n)
    nx[is_ok] = 0
    ny[is_ok] = 0
    return nx,ny

In [4]:
model = KMeans(n_clusters=2)
model.fit(xy)
mean1_t = model.cluster_centers_[0]
mean2_t = model.cluster_centers_[1]
cov1_t = np.ma.cov(xy.T)
#var_n = noise_level* cov1_t
var_n = np.array([np.ma.cov(xy[:,0]),np.ma.cov(xy[:,1])])
cov2_t = cov1_t
p_t = np.random.rand()

In [13]:
def my_nem_2d2pgmm(xy,noise_level):
    # Initialize the parameters.
    model = KMeans(n_clusters=2)
    model.fit(xy)
    mean1_t = model.cluster_centers_[0]
    mean2_t = model.cluster_centers_[1]
    cov1_t = np.ma.cov(xy.T)
    #var_n = noise_level* cov1_t
    var_n = np.array([np.ma.cov(xy[:,0]),np.ma.cov(xy[:,1])])
    cov2_t = cov1_t
    p_t = np.random.rand()
    # EM interation
    MAXITERATION = 10000
    tol = 0.001
    theta = np.r_[p_t, mean1_t, mean2_t, cov1_t.reshape(-1),cov2_t.reshape(-1)]
    iternum = 1
    p_t_record = [p_t]
    mean1_t_record = [mean1_t]
    mean2_t_record = [mean2_t]
    cov1_t_record = [cov1_t]
    cov2_t_record = [cov2_t]
    N = len(xy)
    for i in range(MAXITERATION):
            # N-Step:
        nx,ny = noise_gen(var_n/((i+1)*(i+1)), len(xy), mean1_t , mean2_t, xy)
        n = np.array([nx,ny]).T
        z = xy + n
            # E-Step:
        w = np.array([p_t*multivariate_normal.pdf(xy, mean = mean1_t, cov = cov1_t),
                    (1-p_t)*multivariate_normal.pdf(xy, mean = mean2_t, cov = cov2_t)])
        w = w/sum(w,0)
            # M-Step:
        nml = sum(w.T)
        p_t = nml[0]/N
        mean1_t = np.r_[sum(w[0,:]*z[:,0]), sum(w[0,:]*z[:,1])]
        mean1_t = mean1_t/nml[0]
        mean2_t = np.r_[sum(w[1,:]*z[:,0]), sum(w[1,:]*z[:,1])]
        mean2_t = mean2_t/nml[1]
        c1 = np.array([w[0],w[0]]).T*(z-mean1_t)
        c2 = np.array([w[1],w[1]]).T*(z-mean2_t)
        cov1_t = np.dot(c1.T,z-mean1_t)/nml[0]
        cov2_t = np.dot(c2.T,z-mean2_t)/nml[1]
        theta_t = np.r_[p_t, mean1_t, mean2_t, cov1_t.reshape(-1),cov2_t.reshape(-1)]
        #diff = sum(abs(theta_t - theta))
        diff = np.linalg.norm(theta_t - theta,2)
        theta = theta_t
        p_t_record.append(p_t)
        mean1_t_record.append(mean1_t)
        mean2_t_record.append(mean2_t)
        cov1_t_record.append(cov1_t)
        cov2_t_record.append(cov2_t)
        if i<10 or i%10 == 0:
            print i, diff
        if diff < tol:
            iternum = i+1
            break

    return iternum, p_t, mean1_t, mean2_t, cov1_t, cov2_t

In [9]:
def my_em_2d2pgmm(xy):
    # Initialize the parameters.
    model = KMeans(n_clusters=2)
    model.fit(xy)
    mean1_t = model.cluster_centers_[0]
    mean2_t = model.cluster_centers_[1]
    cov1_t = np.ma.cov(xy.T)
    cov2_t = cov1_t
    p_t = np.random.rand()
    # EM interation
    MAXITERATION = 10000
    tol = 0.001
    theta = np.r_[p_t, mean1_t, mean2_t, cov1_t.reshape(-1),cov2_t.reshape(-1)]
    iternum = 1
    p_t_record = [p_t]
    mean1_t_record = [mean1_t]
    mean2_t_record = [mean2_t]
    cov1_t_record = [cov1_t]
    cov2_t_record = [cov2_t]
    N = len(xy)
    for i in range(MAXITERATION):
            # E-Step:
        w = np.array([p_t*multivariate_normal.pdf(xy, mean = mean1_t, cov = cov1_t),
                    (1-p_t)*multivariate_normal.pdf(xy, mean = mean2_t, cov = cov2_t)])
        w = w/sum(w,0)
            # M-Step:
        nml = sum(w.T)
        p_t = nml[0]/N
        mean1_t = np.r_[sum(w[0,:]*xy[:,0]), sum(w[0,:]*xy[:,1])]
        mean1_t = mean1_t/nml[0]
        mean2_t = np.r_[sum(w[1,:]*xy[:,0]), sum(w[1,:]*xy[:,1])]
        mean2_t = mean2_t/nml[1]
        c1 = np.array([w[0],w[0]]).T*(xy-mean1_t)
        c2 = np.array([w[1],w[1]]).T*(xy-mean2_t)
        cov1_t = np.dot(c1.T,xy-mean1_t)/nml[0]
        cov2_t = np.dot(c2.T,xy-mean2_t)/nml[1]
        theta_t = np.r_[p_t, mean1_t, mean2_t, cov1_t.reshape(-1),cov2_t.reshape(-1)]
        #diff = sum(abs(theta_t - theta))
        diff = np.linalg.norm(theta_t - theta,2)
        theta = theta_t
        p_t_record.append(p_t)
        mean1_t_record.append(mean1_t)
        mean2_t_record.append(mean2_t)
        cov1_t_record.append(cov1_t)
        cov2_t_record.append(cov2_t)
        print i, diff
        if diff < tol:
            iternum = i+1
            break

    return iternum, p_t, mean1_t, mean2_t, cov1_t, cov2_t

In [14]:
iternum, p_t, mean1_t, mean2_t, cov1_t, cov2_t = my_nem_2d2pgmm(xy,0.01)
print iternum
print mean1_t
print mean2_t
iternum, p_t, mean1_t, mean2_t, cov1_t, cov2_t = my_em_2d2pgmm(xy)
print iternum
print mean1_t
print mean2_t

0 85.5856509841
1 29.6614932806
2 39.8462674137
3 38.5023890349
4 29.5771876863
5 1.84867787194
6 4.28820902628
7 4.81477116597
8 2.66050176496
9 3.72201559247
10 3.53990955461
20 0.175461338119
30 0.204511486093
40 0.107609023683
50 0.0492580397936
60 0.0341579148389
70 0.01108573028
80 0.00275627405981
90 0.0244883664729
100 0.0101651323272
110 0.0127296469141
120 0.0152242301172
130 0.00557488748446
140 0.00240058878703
150 0.00514983444262
160 0.00636257964179
170 0.0052800924231
180 0.00416109774502
190 0.00591390301922
200 0.00353964472114
210
[  2.03639492  54.47944415]
[  4.2896624   79.96812024]
0 120.758025518
1 54.2645481444
2 36.6208683503
3 3.3693924234
4 0.372064554027
5 0.0821598984835
6 0.0197253319952
7 0.00473814499294
8 0.00113998907021
9 0.000274422201209
10
[  2.03638896  54.47852144]
[  4.28966242  79.96812057]
