# Gaussian Mixture Model examples

In [None]:
import os,sys,time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import torch
from sklearn.metrics import confusion_matrix

sys.path.insert(1, '../../src/')
import madmix
import gibbs
from concrete import *
import aux

# global matplotlib defaults: no open-figure warning, wide figures, large font
plt.rcParams.update({
    'figure.max_open_warning': 0,
    'figure.figsize': (15,7.5),
    'font.size': 40,
})

# figure output directory: the local default is set first and then
# overridden by the manuscript folder, so the second value is the one in effect
fig_path='fig/'
fig_path='../../../madmix-tex/fig/'

## Old Faithful

Example 1 of 3 (not included in manuscript).

In [None]:
####################
####################
#  data wrangling  #
####################
####################
# read the tab-separated Old Faithful table from the pinned gist revision
of_url='https://gist.githubusercontent.com/curran/4b59d1046d9e66f2787780ad51a1cd87/raw/9ec906b78a98cf300947a37b56cfe70d01183200/data.tsv'
of_dat=pd.read_table(of_url)

# ndarray copy of the table, used by the setup and plotting cells
dat=np.array(of_dat)

# number of mixture components for this example
K=2

In [None]:
# quick look at the raw data: eruption duration against waiting time
of_dat.plot.scatter(x='eruptions',y='waiting')

### Gibbs sampler

Now we run a Gibbs sampler for 1,000 steps after burn-in 
(which will consist of 90% of the total steps taken,
i.e., we run the sampler for 10,000 steps and take 
the last 1,000 as the sample).

In [None]:
####################
####################
#      setup       #
####################
####################

# settings
steps=1000       # sample size kept after burn-in (see the description above)
burnin_pct=0.9   # fraction of the total steps spent on burn-in
D=dat.shape[1]   # number of columns of the data array

# initial arrays
# The initial means are written as floats, like the mu0 arrays of the other
# examples in this notebook. An all-integer literal would create an int64
# array, and any in-place update of it would silently truncate to integers.
mu0=np.array([[2.,50.],[5.,80.]])
# one 5*identity covariance per component, stacked into shape (K,D,D)
sigma0=np.zeros((K,D,D))
for k in range(K): sigma0[k,:,:]=5.*np.eye(D)
# equal initial weight for every component
w0=np.ones(K)/K

In [None]:
####################
####################
#   run sampler    #
####################
####################
# fix the seed so that re-running the notebook reproduces the same draws;
# the other gibbs_gmm calls in this notebook pass seed=1 in the same way
seed=1
xs,ws,mus,sigmas=gibbs.gibbs_gmm(y=np.array(of_dat),mu0=mu0,sigma0=sigma0,w0=w0,steps=steps,burnin_pct=burnin_pct,seed=seed)

In [None]:
####################
####################
#     results      #
####################
####################
# average the sampled means and covariances over the draws (axis 0)
mu_hat=np.mean(mus,axis=0)
sigma_hat=np.mean(sigmas,axis=0)

# data colored by the labels of the last draw, averaged means as red stars
plt.scatter(dat[:,0],dat[:,1],c=xs[-1,:])
plt.plot(mu_hat[:,0],mu_hat[:,1],'*r',ms=10)

# Gaussian density contours of each component on a regular grid
xx, yy = np.mgrid[1:6:.1, 35:100:.1]
data = np.dstack((xx, yy))
for k in range(K):
    rv = stats.multivariate_normal(mu_hat[k,:], sigma_hat[k,:,:])
    plt.contour(xx, yy, rv.pdf(data),levels=4,colors='grey')

plt.xlabel('Eruptions')
plt.ylabel('Wait time')

## Palmer penguin data set

In [None]:
from palmerpenguins import load_penguins
penguins = load_penguins().dropna()
# normalize data
# Only the numeric columns are standardized. The frame also carries non-numeric
# columns (e.g. 'species', used for coloring below), and DataFrame.mean()/std()
# raise a TypeError on those in pandas>=2.0, where numeric_only defaults to False.
# Non-numeric columns are carried over unchanged.
penguins_num=penguins.select_dtypes(include='number')
penguins_num_std=(penguins_num-penguins_num.mean())/penguins_num.std()
std_penguins=penguins.assign(**{col: penguins_num_std[col] for col in penguins_num_std.columns})
K=3  # number of mixture components for this example

In [None]:
# start from the array of species names and overwrite each name with its hex color
species_palette={'Adelie':'#7ad151','Gentoo':'#2a788e','Chinstrap':'#440154'}
colors=np.squeeze(np.array(penguins[['species']]))
for species_name,hex_code in species_palette.items():
    colors[colors==species_name]=hex_code

# pairwise scatter plots of the standardized measurements, colored by species
measurement_cols=['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']
pd.plotting.scatter_matrix(std_penguins[measurement_cols],c=colors);

In [None]:
# array holding the four standardized measurements, in this column order
pg_dat=np.array(std_penguins.loc[:,['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']])

### Gibbs sampling

In [None]:
####################
####################
#      setup       #
####################
####################

# settings
steps,burnin_pct=1000,0.9
D=pg_dat.shape[1]  # number of columns of the data array

# initial arrays
# (earlier initial means on the original measurement scale, kept for reference)
#mu0=np.array([[35.,25.,175.,3500.],  # green
#              [55.,17.,200.,4000.],  # purple 
#              [45.,10.,225.,5000.]]) # blue
# initial means on the standardized scale, one row per component
mu0=np.array([
    [-2. , 1. ,-1. ,-1. ],  # green
    [ 1. , 1. ,-0.5,-0.5],  # purple
    [ 1. ,-1.5, 1.5, 2. ],  # blue
])
# K copies of 0.5*identity, stacked into shape (K,D,D)
sigma0=np.tile(0.5*np.eye(D),(K,1,1))
# equal initial weight for every component
w0=np.full(K,1.)/K

In [None]:
####################
####################
#   run sampler    #
####################
####################
# seeded run of the Gibbs sampler on the standardized penguin measurements
seed=1
xs,ws,mus,sigmas=gibbs.gibbs_gmm(
    y=pg_dat,
    mu0=mu0,sigma0=sigma0,w0=w0,
    steps=steps,burnin_pct=burnin_pct,
    seed=seed
)

In [None]:
####################
####################
#    save data     #
####################
####################
# hand each sampler output to aux.pkl_save together with its target name
penguin_outputs=(
    (xs,'sockeye_run/penguin/pred_x'),
    (ws,'sockeye_run/penguin/pred_w'),
    (mus,'sockeye_run/penguin/pred_mu'),
    (sigmas,'sockeye_run/penguin/pred_sigma'),
)
for arr,target in penguin_outputs:
    aux.pkl_save(arr,target)

In [None]:
# recolor every observation by its label in the last draw; `colors` (created in the
# species-coloring cell) is overwritten in place
# NOTE(review): label 0 gets '#440154' and label 1 gets '#7ad151', while the mu0
# comments tag row 0 as green and row 1 as purple — confirm the intended mapping
last_labels=xs[-1,:]
for label,hex_code in ((0,'#440154'),(1,'#7ad151'),(2,'#2a788e')):
    colors[last_labels==label]=hex_code

# pairwise scatter plots on the original measurement scale, colored by cluster label
pd.plotting.scatter_matrix(penguins[['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']],c=colors);

### Concrete 

In [None]:
####################
####################
#    load data     #
####################
####################
path='sockeye/penguin/cache/'
# os.listdir returns entries in arbitrary order, so the listing is sorted before
# it is split into the two name lists; otherwise losses[i] and flows[i] are not
# guaranteed to belong to the same run and the "best" flow could be mismatched.
# (assumes the losses and flows files of a run share the same name suffix — TODO confirm)
files=sorted(os.listdir(path))
losses_files=[os.path.splitext(file)[0] for file in files if 'losses' in file] # only get name without extension
flows_files=[os.path.splitext(file)[0] for file in files if 'flows' in file]
if len(losses_files)!=len(flows_files):
    raise ValueError('Found '+str(len(losses_files))+' losses files but '+str(len(flows_files))+' flows files in '+path)

# one row of losses per run; 10001 is the number of loss values stored per run
losses=np.zeros((len(losses_files),10001))
flows=[None for i in range(len(losses_files))]
# NOTE(review): aux.pkl_load presumably unpickles these files — only load caches you trust
for i,(losses_name,flows_name) in enumerate(zip(losses_files,flows_files)):
    losses[i]=aux.pkl_load(path+losses_name)
    flows[i]=aux.pkl_load(path+flows_name)
# end for

In [None]:
# loss traceplot
threshold=10000
# cap the losses at the threshold (on a copy; `losses` itself is left untouched)
plt_losses=np.where(losses>threshold,threshold,losses)
iteration_axis=np.arange(10001)
for i in range(len(losses_files)):
    plt.plot(iteration_axis,plt_losses[i,:],c='black',alpha=0.8)
plt.xlabel('Iteration #')
plt.ylabel('Loss')

In [None]:
# retrieve best flow
# The run with the smallest final loss is selected. nanargmin skips runs whose
# final loss is NaN (diverged training); plain argmin would return the first NaN
# entry and report that run as optimal. Raises ValueError if every final loss is NaN.
i_star=np.nanargmin(losses[:,-1])
best_flow=flows[i_star]
print('Optimal flow: '+str(i_star))

In [None]:
# generate sample from optimal flow
# seed torch first so the draw is repeatable
flow_seed,flow_sample_size=1,10000
torch.manual_seed(flow_seed)
flow_sample=best_flow.sample(flow_sample_size)

In [None]:
# loss 124 is smallest
# corresponds to: temp: 5, depth: 64, width: 100, lr: 1e-5
# here we train it manually

# problem sizes read off the Gibbs output arrays
N=xs.shape[1]
K,D=mus.shape[1:3]
tau0=0.1

# settings of the selected run
temp,depth,width=5.,64,100

# optimizer settings
lr,max_iters=1e-5,10001

In [None]:
# sample generation
# wrap the four Gibbs output arrays as torch tensors, in the order x, w, mu, sigma
xs_concrete,ws_concrete,mus_concrete,sigmas_concrete=(
    torch.from_numpy(gibbs_out) for gibbs_out in (xs,ws,mus,sigmas)
)

conc_sample=gmm_concrete_sample(xs_concrete,ws_concrete,mus_concrete,sigmas_concrete,temp)

In [None]:
# train a flow on the concrete sample with the settings chosen above
tmp_flow_penguin,tmp_loss_penguin=trainGMMRealNVP(
    temp=temp,
    depth=depth,
    N=N,
    K=K,
    D=D,
    tau0=tau0,
    sample=conc_sample,
    width=width,
    max_iters=max_iters,
    lr=lr,
    seed=2023,
    verbose=True
)

## Waveform data set

In [None]:
from sklearn.decomposition import PCA
waveform_dat=pd.read_table('https://hastie.su.domains/ElemStatLearn/datasets/waveform.train')

# feature columns: everything except 'row.names' and 'y'
wf_feature_cols=waveform_dat.columns.difference(['row.names','y'])

# fit a 4-component PCA on the features
pca = PCA(n_components=4)
pca.fit(waveform_dat[wf_feature_cols])

# project the raw features onto the fitted components
# NOTE(review): the features are multiplied directly, without subtracting the
# fitted mean, so this presumably differs from pca.transform by a constant shift
waveform_pca=np.array(waveform_dat[wf_feature_cols])@pca.components_.T

In [None]:
# first two projected coordinates, colored by the 'y' column
wf_labels=np.squeeze(np.array(waveform_dat[['y']]))
plt.scatter(waveform_pca[:,0],waveform_pca[:,1],c=wf_labels)

In [None]:
# number of mixture components for this example
K=3
# keep only the first two projected coordinates for the mixture model
wf_dat=waveform_pca[:,:2]

### Gibbs sampling

In [None]:
####################
####################
#      setup       #
####################
####################

# settings
steps,burnin_pct=1000,0.9
D=wf_dat.shape[1]  # number of columns of the data array

# initial arrays
# initial means, one row per component
mu0=np.array([
    [-3.,4.],  # blue
    [ 5.,4.],  # purple 
    [ 0.,0.],  # yellow
])
# K copies of 5*identity, stacked into shape (K,D,D)
sigma0=np.tile(5.*np.eye(D),(K,1,1))
# equal initial weight for every component
w0=np.full(K,1.)/K

In [None]:
####################
####################
#   run sampler    #
####################
####################
# seeded run of the Gibbs sampler on the two-dimensional waveform projection
seed=1
xs,ws,mus,sigmas=gibbs.gibbs_gmm(
    y=wf_dat,
    mu0=mu0,sigma0=sigma0,w0=w0,
    steps=steps,burnin_pct=burnin_pct,
    seed=seed
)

In [None]:
####################
####################
#    save data     #
####################
####################
# hand each sampler output to aux.pkl_save together with its target name
waveform_outputs=(
    (xs,'sockeye_run/waveform/pred_x'),
    (ws,'sockeye_run/waveform/pred_w'),
    (mus,'sockeye_run/waveform/pred_mu'),
    (sigmas,'sockeye_run/waveform/pred_sigma'),
)
for arr,target in waveform_outputs:
    aux.pkl_save(arr,target)

In [None]:
# average the sampled means and covariances over the draws (axis 0)
mu_hat=np.mean(mus,axis=0)
sigma_hat=np.mean(sigmas,axis=0)

# data colored by the label averaged over all draws, averaged means as red stars
plt.scatter(waveform_pca[:,0],waveform_pca[:,1],c=np.mean(xs,axis=0))
plt.plot(mu_hat[:,0],mu_hat[:,1],'*r',ms=10)

# Gaussian density contours of each component on a regular grid
xx, yy = np.mgrid[-10:15:.1, -5:10:.1]
data = np.dstack((xx, yy))
for k in range(K):
    rv = stats.multivariate_normal(mu_hat[k,:], sigma_hat[k,:,:])
    plt.contour(xx, yy, rv.pdf(data),levels=4,colors='grey')

### Concrete

In [None]:
####################
####################
#    load data     #
####################
####################
path='sockeye/waveform/cache/'
# os.listdir returns entries in arbitrary order, so the listing is sorted before
# it is split into the two name lists; otherwise losses[i] and flows[i] are not
# guaranteed to belong to the same run and the "best" flow could be mismatched.
# (assumes the losses and flows files of a run share the same name suffix — TODO confirm)
files=sorted(os.listdir(path))
losses_files=[os.path.splitext(file)[0] for file in files if 'losses' in file] # only get name without extension
flows_files=[os.path.splitext(file)[0] for file in files if 'flows' in file]
if len(losses_files)!=len(flows_files):
    raise ValueError('Found '+str(len(losses_files))+' losses files but '+str(len(flows_files))+' flows files in '+path)

# one row of losses per run; 10001 is the number of loss values stored per run
losses=np.zeros((len(losses_files),10001))
flows=[None for i in range(len(losses_files))]
# NOTE(review): aux.pkl_load presumably unpickles these files — only load caches you trust
for i,(losses_name,flows_name) in enumerate(zip(losses_files,flows_files)):
    losses[i]=aux.pkl_load(path+losses_name)
    flows[i]=aux.pkl_load(path+flows_name)
# end for

In [None]:
# loss traceplot
threshold=10000
# cap the losses at the threshold (on a copy; `losses` itself is left untouched)
plt_losses=np.where(losses>threshold,threshold,losses)
iteration_axis=np.arange(10001)
for i in range(len(losses_files)):
    plt.plot(iteration_axis,plt_losses[i,:],c='black',alpha=0.8)
plt.xlabel('Iteration #')
plt.ylabel('Loss')

In [None]:
# retrieve best flow
# The run with the smallest final loss is selected. nanargmin skips runs whose
# final loss is NaN (diverged training); plain argmin would return the first NaN
# entry and report that run as optimal. Raises ValueError if every final loss is NaN.
i_star=np.nanargmin(losses[:,-1])
best_flow=flows[i_star]
print('Optimal flow: '+str(i_star))

In [None]:
# generate sample from optimal flow
# seed torch first so the draw is repeatable
flow_seed,flow_sample_size=1,10000
torch.manual_seed(flow_seed)
flow_sample=best_flow.sample(flow_sample_size)

In [None]:
# loss 122 is smallest
# corresponds to: temp: 5, depth: 64, width: 50, lr: 1e-03
# here we train it manually

# problem sizes read off the Gibbs output arrays
N=xs.shape[1]
K,D=mus.shape[1:3]
tau0=0.1

# settings of the selected run
temp,depth,width=5.,64,50

# optimizer settings
lr,max_iters=1e-3,10001

In [None]:
# sample generation
# wrap the four Gibbs output arrays as torch tensors, in the order x, w, mu, sigma
xs_concrete,ws_concrete,mus_concrete,sigmas_concrete=(
    torch.from_numpy(gibbs_out) for gibbs_out in (xs,ws,mus,sigmas)
)

conc_sample=gmm_concrete_sample(xs_concrete,ws_concrete,mus_concrete,sigmas_concrete,temp)

In [None]:
# train a flow on the concrete sample with the settings chosen above
tmp_flow_waveform,tmp_loss_waveform=trainGMMRealNVP(
    temp=temp,
    depth=depth,
    N=N,
    K=K,
    D=D,
    tau0=tau0,
    sample=conc_sample,
    width=width,
    max_iters=max_iters,
    lr=lr,
    seed=2023,
    verbose=True
)