In [11]:
import numpy as np
np.random.seed(123)
from scipy.stats import beta

def generate_dataset(num_points, true_k, dim=2, verbose=False):
    true_z = [i % true_k for i in range(num_points)]
    if False:
        probs_per_cluster = np.random.rand(dim, true_k)
    else:
        probs_per_cluster = beta.rvs(0.5, 0.5, size=(true_k, dim))    

    probs = probs_per_cluster[true_z, :]
    data = (probs > np.random.rand(num_points, dim)).astype(np.float32)
    return data

In [15]:
dim = 5
true_k = 3
num_points = 100
data = generate_dataset(100, 3, 5)

In [16]:
data.shape

(100, 5)

In [8]:
import numpy as np
import copy


class DPM:
    def __init__(self, initial_k, alpha, prior, data, z):
       
        self.num_clusters = initial_k
        self.num_samples, self.dim = data.shape
        self.alpha = alpha
        self.copy_of_prior = prior  # when initializing a new cluster, we copy this prior to start from
        self.data = data
        self.z = z
        self.N_k = [0]*initial_k  # Number of points in cluster k, like N_k in Murphy eq25.35 pg 888

        self.cluster_distros = []
        # Initialize the priors on the mixture components
        for _ in range(initial_k):
            self.cluster_distros.append(copy.deepcopy(prior))

        self.include_points(data, z)

    def include_points(self, data, z):
        for i, x in enumerate(data):
            k = z[i]
            self.cluster_distros[k].additem(x)
            self.N_k[k] += 1
        self.num_samples = sum(self.N_k)

    def step(self):
        for i, xx in enumerate(self.data):
            # -- 1 --
            k_old = self.z[i]
            self.N_k[k_old] -= 1
            self.cluster_distros[k_old].delitem(xx)
            self.remove_cluster_if_empty(k_old)
            pp = self.N_k.copy()
            pp.append(self.alpha)
            pp = np.log(np.array(pp))
            for k in range(self.num_clusters+1):
                pp[k] += self.logpredictive(k, xx)
            pp = np.exp(pp-np.max(pp))  # Subtract max to avoid numerical errors
            pp /= np.sum(pp)

            uu = np.random.rand()
            k_new = int(np.sum(uu > np.cumsum(pp)))

            # -- 3 --
            self.add_cluster_maybe(k_new)

            self.z[i] = k_new
            self.N_k[k_new] += 1
            self.cluster_distros[k_new].additem(xx)

    def add_cluster_maybe(self, k_new):
        if k_new == self.num_clusters:
            self.num_clusters += 1
            self.N_k.append(0)
            self.cluster_distros.append(copy.deepcopy(self.copy_of_prior))

    def logpredictive(self, k, xx):
        if not k == self.num_clusters:
            q = self.cluster_distros[k]
        else:
            q = copy.deepcopy(self.copy_of_prior)
        return q.logpred(xx)

    def remove_cluster_if_empty(self, k):
        if self.N_k[k] == 0:
            self.num_clusters -= 1
            self.cluster_distros.pop(k)
            self.N_k.pop(k)
            self.z[np.argwhere(self.z > k)] -= 1

    def print_probs(self):
        print('The MAP assignments of the clusters that we sampled')
        for i, k in enumerate(np.argsort(self.N_k)[::-1]):
            q = self.cluster_distros[k]
            map_assignment = q.get_posterior_multinoulli('map')
            print('Cluster %3i with %5i data and MAP %s' %
                  (k, q.num, ' - '.join(['%5.2f' % prob for prob in map_assignment])))


In [9]:
import numpy as np
from scipy.stats import beta


class Multinoulli:
    def __init__(self, dim, beta, gamma):
        self.dim = dim
        self.num = 0  # number of points currently assigned to the cluster
        self.counts = np.zeros(shape=(dim,))
        self.beta = beta
        self.gamma = gamma

    def logpred(self, xx):
        result = 0

        for i, xd in enumerate(xx):
            if xd > 0:
                result += np.log(self.beta + self.counts[i])
            else:
                result += np.log(self.gamma + (self.num - self.counts[i]))

        result -= self.dim*np.log(self.beta + self.gamma + self.num)
        return result

    def delitem(self, xx):
        self.num -= 1
        self.counts -= xx  # assumes xx in {0,1}

    def additem(self, xx):
        self.num += 1
        self.counts += xx

    def get_posterior_multinoulli(self, mode='sample'):
        assert mode in ['map', 'sample'], "expected mode 'map' or 'sample'"
        if mode == 'sample':
            probs = beta.rvs(self.counts + self.beta, self.num - self.counts + self.gamma)
        else:
            probs = (self.beta + self.counts)/(self.beta + self.gamma + self.num)
        return probs


In [18]:
q0 = Multinoulli(dim=dim, beta=1, gamma=1)

alpha = 5
initial_guess_K = 1
z = np.random.randint(0, initial_guess_K, (num_points,))

dpm = DPM(initial_guess_K, alpha, q0, data, z)

numstep = 100

burn_in = 10
sample_every = 3
num_clusters = []

In [21]:
for step in range(1, numstep):
    dpm.step()
    if step > burn_in:
        if step % sample_every == 0:
            num_clusters.append(len(dpm.N_k))

print('Average number of clusters %5.1f \n' % (np.mean(num_clusters)))
print(dpm.print_probs())

Average number of clusters  11.3 

The MAP assignments of the clusters that we sampled
Cluster   0 with    67 data and MAP  0.12 -  0.72 -  0.94 -  0.46 -  0.07
Cluster   1 with    13 data and MAP  0.80 -  0.07 -  0.73 -  0.07 -  0.40
Cluster   2 with     8 data and MAP  0.60 -  0.60 -  0.30 -  0.10 -  0.50
Cluster   5 with     6 data and MAP  0.75 -  0.12 -  0.12 -  0.50 -  0.25
Cluster   7 with     2 data and MAP  0.25 -  0.50 -  0.75 -  0.75 -  0.50
Cluster   4 with     2 data and MAP  0.25 -  0.25 -  0.75 -  0.50 -  0.50
Cluster   6 with     1 data and MAP  0.33 -  0.33 -  0.33 -  0.67 -  0.33
Cluster   3 with     1 data and MAP  0.67 -  0.67 -  0.33 -  0.33 -  0.33
None
