In [17]:
import cython
import numpy as np
from scipy.special import gammaln
import data_preproc
from data_preproc import data_preproc


# Table of Contents

This notebook is organized into the following sections:
- Functions (missing)
- Timeit profiling
- Prun profiling (missing)
- Cython profiling

## Timeit Profiling

In [18]:
# Build `vocab` and `docs` from the test CSV with the project helper
# `data_preproc`. hdp_eduardo() below reads both names as notebook-level
# globals: it uses `vocab.shape[0]` as the vocabulary size and iterates
# `docs` as a sequence of documents whose words index rows of a count matrix.
vocab, docs = data_preproc("tm_test_data.csv")


In [22]:
def hdp_eduardo():
    """Run one table/topic sampling sweep of the HDP topic model.

    Reads the notebook-level globals ``vocab`` (only ``vocab.shape[0]`` is
    used, as the vocabulary size ``V``) and ``docs`` (a sequence of documents;
    every word is used as a row index into a ``(V + 1)``-row count matrix, so
    words are assumed to be integer ids in ``0..V``).

    The sweep has two phases:

    1. Sampling t: every word of every document is seated at an existing
       table or at a new one; a new table also samples its topic.
    2. Sampling k: every table of every document has its topic resampled
       from a log-space posterior over the live topics plus a new topic.

    ``alpha`` and ``gamma`` are drawn from ``Gamma(1, 1)`` and all sampling
    goes through NumPy's global random state, so results differ between calls
    unless the caller seeds it.

    Returns:
        None. All sampler state is local to the call; the function exists so
        that the sweep can be timed.
    """
    #############################
    #### Initialize params ######
    #############################

    # Hyper params
    beta = 0.5  # word concentration (LDA); also the smoothing added to counts
    alpha = np.random.gamma(1, 1)  # weights the "open a new table" option
    gamma = np.random.gamma(1, 1)  # weights the "open a new topic" option

    V = vocab.shape[0]  # length of vocabulary

    D = len(docs)  # number of docs

    #### Storing structures
    #
    # One dict per document j:
    #   'jt_info'   3 x (tables + 1) int array; row 0 = table idx, row 1 =
    #               topic idx of the table, row 2 = word count of the table.
    #               Column 0 is a dummy that stands for "new table".
    #   'w_tbl_idx' table assigned to each word (-1 = not seated yet).
    #   'n_jtw'     one {word: count} dict per table; entry 0 is only a
    #               placeholder that keeps list positions aligned with columns.
    docs_dict = {j: {'jt_info': np.zeros((3, 1), dtype=int),
                     'w_tbl_idx': np.zeros(len(docs[j]), dtype=int) - 1,
                     'n_jtw': [{beta: beta}]} for j in range(D)}

    # (V+1) x K word-topic matrix initialised to beta; column sums give n_k.
    # Column 0 belongs to the dummy topic.
    n_kv = np.ones((V + 1, 1)) * beta

    m_k = np.ones(1, dtype=int)  # tables per topic (index 0 = dummy topic)

    k_idx = [0]  # topic ids currently in use; position 0 is always the dummy

    x_ji = docs  # data

    ############################################
    ######## Hierarchical Dirichlet ############
    ############################################

    for j, x_i in enumerate(x_ji):
        # Fetch the document state before the word loop so an empty document
        # is simply skipped instead of reusing (or lacking) a previous doc_j.
        # doc_j is the dict stored in docs_dict, so updates need no write-back.
        doc_j = docs_dict[j]

        for i, w in enumerate(x_i):
            t_idx = doc_j['w_tbl_idx'][i]

            ### Remove word if assigned to table (i.e. -x_ji)
            # NOTE(review): every word enters this single sweep with
            # t_idx == -1, so this branch does not run here. If the sweep is
            # ever repeated, np.delete below shifts the table columns without
            # re-indexing 'w_tbl_idx' / 'n_jtw' -- rework before relying on it.
            if t_idx > 0:
                k_jt = doc_j['jt_info'][1, t_idx]
                assert k_jt > 0
                doc_j['n_jtw'][t_idx][w] -= 1  # remove from per-table word count
                doc_j['jt_info'][2, t_idx] -= 1  # remove from table count n_jt
                n_kv[w, k_jt] -= 1  # remove from topic count

                # if table is empty, remove table
                if doc_j['jt_info'][2, t_idx] == 0:
                    doc_j['jt_info'] = np.delete(doc_j['jt_info'], t_idx, axis=1)
                    m_k[k_jt] -= 1

                    if m_k[k_jt] == 0:
                        k_idx.remove(k_jt)  # no more tables with topic k

            #### Sampling t ####

            # Existing tables: table size times the likelihood of w under the
            # topic served at *that* table (one topic per column of jt_info).
            fk = n_kv[w, :] / n_kv.sum(axis=0)  # f_k for word w, per topic
            n_jt = doc_j['jt_info'][2, :]  # counts across tables
            k_of_t = doc_j['jt_info'][1, :]  # topic of each table
            t_post = n_jt * fk[k_of_t]

            # New table: marginal likelihood of w over the live topic ids
            # (k_idx holds ids, which need not be 0..len-1) plus a new topic.
            p_xji = np.dot(m_k[k_idx], fk[k_idx]) + (gamma / V)
            t_post[0] = (alpha * p_xji) / (m_k.sum() + gamma)  # slot 0 = new table

            # normalize t-posterior distribution
            t_post /= t_post.sum()

            # One multinomial draw; argmax of the one-hot result is the sampled table
            t_samp_idx = np.random.multinomial(1, t_post).argmax()
            new_t = doc_j['jt_info'][0, t_samp_idx]

            ## New table is selected
            if new_t == 0:

                ### Sampling k when t is NEW ###
                kt_post = (m_k * fk)[k_idx]  # existing topics
                kt_post[0] = gamma / V  # new topic

                # Normalize posterior of k when t is NEW
                kt_post /= kt_post.sum()

                # Draw the topic for the new table
                kt_samp_idx = np.random.multinomial(1, kt_post).argmax()
                new_k = k_idx[kt_samp_idx]

                ## New topic selected
                if new_k == 0:
                    # n_kv and m_k only ever grow, so the next column index is
                    # an id no live or retired topic has used. len(k_idx)
                    # would collide with a live id once a topic was removed.
                    new_k = n_kv.shape[1]

                    # Register topic, add its column to the word-topic matrix,
                    # extend the tables-per-topic vector
                    k_idx.append(new_k)
                    n_kv = np.c_[n_kv, np.ones((V + 1, 1)) * beta]
                    m_k = np.r_[m_k, 0]
                    assert new_k == k_idx[-1]
                    assert new_k < n_kv.shape[1]

                # Add table to tables-per-topic vector
                m_k[new_k] += 1

                # Create new table as the next column of 'jt_info'
                new_t = doc_j['jt_info'].shape[1]
                doc_j['jt_info'] = np.c_[doc_j['jt_info'], np.zeros((3, 1), dtype=int)]
                doc_j['jt_info'][1, new_t] = new_k
                doc_j['n_jtw'].append({w: 0})

            # Get topic of table (either new or old)
            new_k = doc_j['jt_info'][1, new_t]

            # Seat at table, assign corresponding topic, add 1 to table count
            doc_j['jt_info'][:2, new_t] = np.array([new_t, new_k])
            doc_j['jt_info'][2, new_t] += 1

            # Record word-table assignment and per-table word count
            doc_j['w_tbl_idx'][i] = new_t
            doc_j['n_jtw'][new_t][w] = doc_j['n_jtw'][new_t].get(w, 0) + 1

            # Add 1 for word in word-topic count matrix
            n_kv[w, new_k] += 1

    for j in range(D):
        doc_j = docs_dict[j]
        for tbl in doc_j['jt_info'][0, :]:

            #### Sampling k: loop through tables (0 = dummy idx, always skipped) ####
            if tbl != 0:

                # Take the table out of its topic's table count
                k_jt = doc_j['jt_info'][1, tbl]
                m_k[k_jt] -= 1

                if m_k[k_jt] == 0:
                    # no more tables with topic k: retire it, mark table topic as 0
                    k_idx.remove(k_jt)
                    doc_j['jt_info'][1, tbl] = 0

                ###### SAMPLING K ######

                # Topic k of table t (0 if its topic was just retired)
                k_jt = doc_j['jt_info'][1, tbl]

                # Topic totals with this table's words removed from its own topic
                n_k = n_kv.sum(axis=0)
                n_jt = doc_j['jt_info'][2, tbl]
                n_k[k_jt] -= n_jt
                n_k = n_k[k_idx]

                # k posterior in log form (Dirichlet-multinomial terms)
                log_post_k = np.log(m_k[k_idx]) + gammaln(n_k) - gammaln(n_k + n_jt)
                log_post_k_new = np.log(gamma) + gammaln(V * beta) - gammaln((V * beta) + n_jt)

                # Add the per-word contributions of the table's words
                for w_key, w_cnt in doc_j['n_jtw'][tbl].items():

                    assert w_cnt >= 0
                    if w_cnt == 0:  # if word count is 0 skip
                        continue

                    # Work on a copy: n_kv[w_key, :] is a view, and editing it
                    # in place would permanently alter the shared count matrix.
                    w_cnt_k = n_kv[w_key, :].copy()
                    w_cnt_k[w_cnt_k == 0] = beta

                    # For the table's own topic, leave out the table's words
                    w_cnt_k[k_jt] -= w_cnt
                    w_cnt_k = w_cnt_k[k_idx]

                    # Slot 0 is overwritten below; 1 keeps gammaln well-defined
                    w_cnt_k[0] = 1

                    # Add contributions
                    log_post_k += gammaln(w_cnt_k + w_cnt) - gammaln(w_cnt_k)
                    log_post_k_new += gammaln(beta + w_cnt) - gammaln(beta)

                # Slot 0 carries the "new topic" option
                log_post_k[0] = log_post_k_new

                # Back to probability space (max-shifted for stability), normalize
                post_k = np.exp(log_post_k - log_post_k.max())
                post_k /= post_k.sum()

                # Draw the topic for the table
                k_samp_idx = np.random.multinomial(1, post_k).argmax()
                new_k = k_idx[k_samp_idx]

                ## New topic selected
                if new_k == 0:
                    # Fresh, never-used id (see the same step in the t phase)
                    new_k = n_kv.shape[1]

                    k_idx.append(new_k)
                    n_kv = np.c_[n_kv, np.ones((V + 1, 1)) * beta]
                    m_k = np.r_[m_k, 0]
                    assert new_k == k_idx[-1]
                    assert new_k < n_kv.shape[1]

                # Add table to topic k count
                m_k[new_k] += 1

                # If the table changed topic, record it and move its word counts
                k_jt = doc_j['jt_info'][1, tbl]
                if new_k != k_jt:
                    doc_j['jt_info'][1, tbl] = new_k

                    for word, cnt in doc_j['n_jtw'][tbl].items():
                        if k_jt != 0:
                            n_kv[word, k_jt] -= cnt

                        n_kv[word, new_k] += cnt




In [23]:
# Time the full sampler sweep; -r2 limits timeit to two repeats
# (reported as "2 runs, 1 loop each" in the output).
%timeit -r2 hdp_eduardo()

3.51 s ± 1.01 s per loop (mean ± std. dev. of 2 runs, 1 loop each)


## Cython profiling

In [7]:
# Load the Cython IPython extension so the %%cython cell magic is available.
%load_ext cython

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [9]:
%%cython -a

import numpy as np
from scipy.special import gammaln
import data_preproc
from data_preproc import data_preproc

# Flat-script version of the HDP table/topic sampling sweep, compiled with
# %%cython -a for annotation. The cell is self-contained: it imports what it
# needs and loads its own data instead of relying on earlier cells.
vocab, docs = data_preproc("tm_test_data.csv")

#############################
#### Initialize params ######
#############################

# Hyper params
beta = 0.5  # word concentration (LDA); also the smoothing added to counts
alpha = np.random.gamma(1, 1)  # weights the "open a new table" option
gamma = np.random.gamma(1, 1)  # weights the "open a new topic" option

V = vocab.shape[0]  # length of vocabulary

D = len(docs)  # number of docs

#### Storing structures
#
# One dict per document j:
#   'jt_info'   3 x (tables + 1) int array; row 0 = table idx, row 1 =
#               topic idx of the table, row 2 = word count of the table.
#               Column 0 is a dummy that stands for "new table".
#   'w_tbl_idx' table assigned to each word (-1 = not seated yet).
#   'n_jtw'     one {word: count} dict per table; entry 0 is only a
#               placeholder that keeps list positions aligned with columns.
docs_dict = {j: {'jt_info': np.zeros((3, 1), dtype=int),
                 'w_tbl_idx': np.zeros(len(docs[j]), dtype=int) - 1,
                 'n_jtw': [{beta: beta}]} for j in range(D)}

# (V+1) x K word-topic matrix initialised to beta; column sums give n_k.
# Column 0 belongs to the dummy topic.
n_kv = np.ones((V + 1, 1)) * beta

m_k = np.ones(1, dtype=int)  # tables per topic (index 0 = dummy topic)

k_idx = [0]  # topic ids currently in use; position 0 is always the dummy

x_ji = docs  # data

############################################
######## Hierarchical Dirichlet ############
############################################

for j, x_i in enumerate(x_ji):
    # Fetch the document state before the word loop so an empty document
    # is simply skipped instead of reusing (or lacking) a previous doc_j.
    # doc_j is the dict stored in docs_dict, so updates need no write-back.
    doc_j = docs_dict[j]

    for i, w in enumerate(x_i):
        t_idx = doc_j['w_tbl_idx'][i]

        ### Remove word if assigned to table (i.e. -x_ji)
        # NOTE(review): every word enters this single sweep with
        # t_idx == -1, so this branch does not run here. If the sweep is
        # ever repeated, np.delete below shifts the table columns without
        # re-indexing 'w_tbl_idx' / 'n_jtw' -- rework before relying on it.
        if t_idx > 0:
            k_jt = doc_j['jt_info'][1, t_idx]
            assert k_jt > 0
            doc_j['n_jtw'][t_idx][w] -= 1  # remove from per-table word count
            doc_j['jt_info'][2, t_idx] -= 1  # remove from table count n_jt
            n_kv[w, k_jt] -= 1  # remove from topic count

            # if table is empty, remove table
            if doc_j['jt_info'][2, t_idx] == 0:
                doc_j['jt_info'] = np.delete(doc_j['jt_info'], t_idx, axis=1)
                m_k[k_jt] -= 1

                if m_k[k_jt] == 0:
                    k_idx.remove(k_jt)  # no more tables with topic k

        #### Sampling t ####

        # Existing tables: table size times the likelihood of w under the
        # topic served at *that* table (one topic per column of jt_info).
        fk = n_kv[w, :] / n_kv.sum(axis=0)  # f_k for word w, per topic
        n_jt = doc_j['jt_info'][2, :]  # counts across tables
        k_of_t = doc_j['jt_info'][1, :]  # topic of each table
        t_post = n_jt * fk[k_of_t]

        # New table: marginal likelihood of w over the live topic ids
        # (k_idx holds ids, which need not be 0..len-1) plus a new topic.
        p_xji = np.dot(m_k[k_idx], fk[k_idx]) + (gamma / V)
        t_post[0] = (alpha * p_xji) / (m_k.sum() + gamma)  # slot 0 = new table

        # normalize t-posterior distribution
        t_post /= t_post.sum()

        # One multinomial draw; argmax of the one-hot result is the sampled table
        t_samp_idx = np.random.multinomial(1, t_post).argmax()
        new_t = doc_j['jt_info'][0, t_samp_idx]

        ## New table is selected
        if new_t == 0:

            ### Sampling k when t is NEW ###
            kt_post = (m_k * fk)[k_idx]  # existing topics
            kt_post[0] = gamma / V  # new topic

            # Normalize posterior of k when t is NEW
            kt_post /= kt_post.sum()

            # Draw the topic for the new table
            kt_samp_idx = np.random.multinomial(1, kt_post).argmax()
            new_k = k_idx[kt_samp_idx]

            ## New topic selected
            if new_k == 0:
                # n_kv and m_k only ever grow, so the next column index is
                # an id no live or retired topic has used. len(k_idx)
                # would collide with a live id once a topic was removed.
                new_k = n_kv.shape[1]

                # Register topic, add its column to the word-topic matrix,
                # extend the tables-per-topic vector
                k_idx.append(new_k)
                n_kv = np.c_[n_kv, np.ones((V + 1, 1)) * beta]
                m_k = np.r_[m_k, 0]
                assert new_k == k_idx[-1]
                assert new_k < n_kv.shape[1]

            # Add table to tables-per-topic vector
            m_k[new_k] += 1

            # Create new table as the next column of 'jt_info'
            new_t = doc_j['jt_info'].shape[1]
            doc_j['jt_info'] = np.c_[doc_j['jt_info'], np.zeros((3, 1), dtype=int)]
            doc_j['jt_info'][1, new_t] = new_k
            doc_j['n_jtw'].append({w: 0})

        # Get topic of table (either new or old)
        new_k = doc_j['jt_info'][1, new_t]

        # Seat at table, assign corresponding topic, add 1 to table count
        doc_j['jt_info'][:2, new_t] = np.array([new_t, new_k])
        doc_j['jt_info'][2, new_t] += 1

        # Record word-table assignment and per-table word count
        doc_j['w_tbl_idx'][i] = new_t
        doc_j['n_jtw'][new_t][w] = doc_j['n_jtw'][new_t].get(w, 0) + 1

        # Add 1 for word in word-topic count matrix
        n_kv[w, new_k] += 1

for j in range(D):
    doc_j = docs_dict[j]
    for tbl in doc_j['jt_info'][0, :]:

        #### Sampling k: loop through tables (0 = dummy idx, always skipped) ####
        if tbl != 0:

            # Take the table out of its topic's table count
            k_jt = doc_j['jt_info'][1, tbl]
            m_k[k_jt] -= 1

            if m_k[k_jt] == 0:
                # no more tables with topic k: retire it, mark table topic as 0
                k_idx.remove(k_jt)
                doc_j['jt_info'][1, tbl] = 0

            ###### SAMPLING K ######

            # Topic k of table t (0 if its topic was just retired)
            k_jt = doc_j['jt_info'][1, tbl]

            # Topic totals with this table's words removed from its own topic
            n_k = n_kv.sum(axis=0)
            n_jt = doc_j['jt_info'][2, tbl]
            n_k[k_jt] -= n_jt
            n_k = n_k[k_idx]

            # k posterior in log form (Dirichlet-multinomial terms)
            log_post_k = np.log(m_k[k_idx]) + gammaln(n_k) - gammaln(n_k + n_jt)
            log_post_k_new = np.log(gamma) + gammaln(V * beta) - gammaln((V * beta) + n_jt)

            # Add the per-word contributions of the table's words
            for w_key, w_cnt in doc_j['n_jtw'][tbl].items():

                assert w_cnt >= 0
                if w_cnt == 0:  # if word count is 0 skip
                    continue

                # Work on a copy: n_kv[w_key, :] is a view, and editing it
                # in place would permanently alter the shared count matrix.
                w_cnt_k = n_kv[w_key, :].copy()
                w_cnt_k[w_cnt_k == 0] = beta

                # For the table's own topic, leave out the table's words
                w_cnt_k[k_jt] -= w_cnt
                w_cnt_k = w_cnt_k[k_idx]

                # Slot 0 is overwritten below; 1 keeps gammaln well-defined
                w_cnt_k[0] = 1

                # Add contributions
                log_post_k += gammaln(w_cnt_k + w_cnt) - gammaln(w_cnt_k)
                log_post_k_new += gammaln(beta + w_cnt) - gammaln(beta)

            # Slot 0 carries the "new topic" option
            log_post_k[0] = log_post_k_new

            # Back to probability space (max-shifted for stability), normalize
            post_k = np.exp(log_post_k - log_post_k.max())
            post_k /= post_k.sum()

            # Draw the topic for the table
            k_samp_idx = np.random.multinomial(1, post_k).argmax()
            new_k = k_idx[k_samp_idx]

            ## New topic selected
            if new_k == 0:
                # Fresh, never-used id (see the same step in the t phase)
                new_k = n_kv.shape[1]

                k_idx.append(new_k)
                n_kv = np.c_[n_kv, np.ones((V + 1, 1)) * beta]
                m_k = np.r_[m_k, 0]
                assert new_k == k_idx[-1]
                assert new_k < n_kv.shape[1]

            # Add table to topic k count
            m_k[new_k] += 1

            # If the table changed topic, record it and move its word counts
            k_jt = doc_j['jt_info'][1, tbl]
            if new_k != k_jt:
                doc_j['jt_info'][1, tbl] = new_k

                for word, cnt in doc_j['n_jtw'][tbl].items():
                    if k_jt != 0:
                        n_kv[word, k_jt] -= cnt

                    n_kv[word, new_k] += cnt




