In [1]:
import numpy as np
from scipy.special import gammaln, digamma

# Print every array element as a sign-padded, width-7 float with 3 decimals,
# so vectors produced by different code paths line up and can be compared by eye.
np.set_printoptions(formatter={'all': lambda value: '% 7.3f' % value})

In [2]:
import bnpy
# Re-import bnpy so edits made to the library since the kernel started are
# picked up (reload is the Python 2 builtin).
reload(bnpy)

# Short aliases for the two summary-statistic routines used throughout:
#   calcSummaryStats  : summary for a full set of local parameters
#   calcXSummaryStats : the "_expansion" variant, used for the split topics only
calcSummaryStats, calcXSummaryStats = (
    bnpy.allocmodel.topics.HDPTopicModel.calcSummaryStats,
    bnpy.allocmodel.topics.HDPTopicModel.calcSummaryStats_expansion,
)

# Split of single state into a growing set of states. 

We illustrate a growing-split in four parts.

1. First, showing example local parameters for a hypothetical current configuration.
2. Second, we show the proposed local parameters for several batches. *The construction method is left for later work; we only care that the proposed parameters obey special constraints relative to the originals.*
3. Third, we show how to (trivially) obtain the relevant sufficient statistics for the proposal, via direct calculation. This is easy and affordable for small datasets, but in batch-by-batch processing we cannot touch all documents at once.
4. Finally, we show how, by collecting batch-specific statistics and aggregating them across batches, we can manipulate sufficient statistics to create whole-dataset proposal statistics **identical** to those from the direct method in step #3.


### STEP 1: Current local parameters.

First, we consider 3 existing states and 3 total documents. We imagine each document lives in its own batch, with its own value of beta, to simulate the fact that the entire vector beta may change from batch-to-batch.

In [3]:
# Current configuration: one row per document, one column per existing topic.

# Token counts assigned to each of the 3 topics in each of the 3 documents.
curDocTopicCount = np.asarray([
    [10, 20, 10],
    [50, 30,  5],
    [ 5, 10,  0],
])

# Per-document copy of the top-level topic weights beta. Each row differs on
# purpose, to mimic beta changing from one batch to the next.
curDoc_beta = np.asarray([
    [0.2, 0.2, 0.2],
    [0.4, 0.1, 0.1],
    [0.1, 0.1, 0.1],
])

# Scale applied to beta when forming theta = counts + alpha * beta.
alpha = 1.0

In [4]:
# Toy dataset handed to bnpy alongside the current local parameters.
# NOTE(review): doc_range=[0,1,2,3] presumably marks 3 one-token documents
# even though 10 tokens are supplied -- confirm against bnpy.data.WordsData.
Data = bnpy.data.WordsData(
    vocab_size=1,
    word_id=np.zeros(10),
    word_count=np.ones(10),
    doc_range=np.asarray([0, 1, 2, 3]))

# Per-document topic parameters: assigned counts plus the scaled beta weights.
theta = curDocTopicCount + alpha * curDoc_beta

# Placeholder per-token values (uniform random, rows are not normalized).
# Drawn from a locally seeded generator so that Restart & Run All reproduces
# the same numbers without touching numpy's global random state.
resp = np.random.RandomState(0).rand(10, theta.shape[1])

# thetaRem holds, per document, alpha times the beta mass not given to any
# of the current topics.
curLP = dict(
    DocTopicCount=curDocTopicCount,
    theta=theta,
    resp=resp,
    thetaRem=alpha * (1 - curDoc_beta.sum(axis=1)))

In [5]:
# Summarize the current components (before any expansion move is attempted).
# Both flags are switched on; the same flags are used for every later call so
# the resulting statistics are comparable.
curSS = calcSummaryStats(
    Data,
    curLP,
    doPrecompEntropy=1,
    doTrackTruncationGrowth=1)

### STEP 2: Proposed local parameters

Now, consider a split that acts on topic 1. We may assign its mass to two topics, "1a" and "1b", in the first batch, then add an additional topic "1c" in the 2nd batch, and a final topic "1d" in the third batch.

*Again, the important thing is not the specific values here, but the fact that we can illustrate how to create valid parameters theta for all documents that exactly represent a specific beta vector (that sums to one) and a specific set of assignments (for every word in the doc).*

In [6]:
# Proposed configuration, written out at the largest truncation (6 columns).
# Trailing zeros in a row mark topics that do not exist yet for that document.
alpha = 1.0

# Per-document beta weights under the proposal.
Doc_beta = np.asarray([
    [0.20, 0.20, 0.15, 0.05, 0.00, 0.00],
    [0.10, 0.10, 0.10, 0.10, 0.20, 0.00],
    [0.10, 0.10, 0.01, 0.02, 0.03, 0.04],
])

# Per-document token counts under the proposal.
DocTopicCount = np.asarray([
    [20, 10,  5,  5,  0,  0],
    [30,  5, 25,  5, 20,  0],
    [10,  0,  0,  0,  1,  4],
])

In [7]:
# Same toy dataset as for the current configuration.
Data = bnpy.data.WordsData(
    vocab_size=1,
    word_id=np.zeros(10),
    word_count=np.ones(10),
    doc_range=np.asarray([0, 1, 2, 3]))

# Proposed per-document topic parameters. This rebinds the name `theta`,
# which from here on refers to the 6-column proposal.
theta = DocTopicCount + alpha * Doc_beta

# Placeholder per-token values (uniform random, rows are not normalized).
# Drawn from a locally seeded generator so that Restart & Run All reproduces
# the same numbers without touching numpy's global random state.
resp = np.random.RandomState(0).rand(10, theta.shape[1])

# Per document: alpha times the beta mass not given to any proposed topic.
thetaRem = alpha * (1 - Doc_beta.sum(axis=1))

# Truncation level (number of topics in use) for documents 0, 1 and 2.
K_d = np.asarray([4, 5, 6])

In [8]:
# Verify that the proposal preserves, for every document, both the total beta
# mass and the total token count of the current configuration.
print(Doc_beta.sum(axis=1))
print(curDoc_beta.sum(axis=1))

print(DocTopicCount.sum(axis=1))
print(curDocTopicCount.sum(axis=1))

# Enforce the check rather than relying on a visual comparison of the prints.
assert np.allclose(Doc_beta.sum(axis=1), curDoc_beta.sum(axis=1))
assert np.array_equal(DocTopicCount.sum(axis=1), curDocTopicCount.sum(axis=1))

[  0.600   0.600   0.300]
[  0.600   0.600   0.300]
[ 40.000  85.000  15.000]
[ 40.000  85.000  15.000]


### STEP 3: Constructing proposal statistics directly from the local parameters

We can compute all the relevant sufficient statistics about the current and proposed value of theta.

This is done compactly by an HDPTopicModel-specific function in bnpy.

In [9]:
# Loop over all docs
# Direct route: summarize each document's proposed local parameters on its
# own, then accumulate everything into one whole-dataset summary.
for d in range(Data.nDoc):
    # Treat the single document d as its own batch.
    Data_b = Data.select_subset_by_mask([d])

    # Keep only the first K_d[d] topics for this document. Slicing the row as
    # d:d + 1 keeps the leading axis, so each array has exactly one row.
    propLP_b = {
        'DocTopicCount': DocTopicCount[d:d + 1, :K_d[d]],
        'theta': theta[d:d + 1, :K_d[d]],
        'resp': resp[d:d + 1, :K_d[d]],
        'thetaRem': thetaRem[d],
    }
    propSS_b = calcSummaryStats(
        Data_b, propLP_b,
        doPrecompEntropy=1,
        doTrackTruncationGrowth=1)

    if d > 0:
        # Later documents may use more topics than the running total has seen;
        # pad the running total with empty components before adding.
        Kextra = propSS_b.K - propSS.K
        if Kextra > 0:
            propSS.insertEmptyComps(Kextra)
        propSS += propSS_b
    else:
        # First document: start the running total from a copy of its stats.
        propSS = propSS_b.copy()

# Reference answer that the batch-by-batch construction is compared against.
propSS_directFromLP = propSS

In [10]:
# Peek at two aggregated fields of the directly computed proposal statistics.

# sumLogPi: aggregate log probability of each topic
print(propSS_directFromLP.sumLogPi)

# sumLogPiRemVec: aggregate "remainder topic" probabilities
print(propSS_directFromLP.sumLogPiRemVec)

[ -2.260 -17.514 -106.710 -58.391  -4.738  -1.474]
[  0.000   0.000   0.000  -6.263  -7.010  -3.961]


sumLogPiRemVec is the "remainder" mass for topics larger than the assigned truncation level.

When we allow the truncation to grow at each batch, we need to track the fact that we had X leftover mass after K=4 at document 1, then Y leftover mass after K=5 at document 2, etc.

So, we keep a vector of size K, where entry k is the aggregate total (across all documents) of the leftover mass beyond topic k for any document with truncation K=k.

### STEP 4: Constructing proposal statistics via aggregating and manipulating batch-specific stats

Now, instead, we'll imagine that we created the proposal one batch at a time. Instead of tracking all the proposed thetas, at each document we just track summary statistics for the *expanded/split* states, that is, topics $1a$ and $1b$, plus any topics added in later batches ($1c$, $1d$).

In [11]:
# Loop over all docs
# Batch-by-batch route: for each document, summarize ONLY the topics created
# by the split (columns 2 and up), then accumulate across documents.
for d in range(Data.nDoc):
    # Treat the single document d as its own batch.
    Data_b = Data.select_subset_by_mask([d])

    # digamma of the document's total theta mass (all active topics plus the
    # remainder). It is passed in explicitly because the sliced theta below
    # no longer contains every topic.
    digammaSumTheta_b = np.asarray(
        [digamma(theta[d, :K_d[d]].sum() + thetaRem[d])])

    # Local parameters restricted to the expanded topics; d:d + 1 keeps the
    # leading axis so each array has exactly one row.
    propLP_b_newonly = {
        'DocTopicCount': DocTopicCount[d:d + 1, 2:K_d[d]],
        'theta': theta[d:d + 1, 2:K_d[d]],
        'resp': resp[d:d + 1, 2:K_d[d]],
        'thetaRem': thetaRem[d],
        'digammaSumTheta': digammaSumTheta_b,
    }
    # NOTE(review): the slice 2:K_d[d] has K_d[d] - 2 columns, whereas
    # np.arange(3, K_d[d]) has K_d[d] - 3 entries -- confirm that this is what
    # calcSummaryStats_expansion expects for `uids`.
    propSS_b_newonly = calcXSummaryStats(
        Data_b,
        propLP_b_newonly,
        doPrecompEntropy=1,
        doTrackTruncationGrowth=1,
        uids=np.arange(3, K_d[d]))

    if d > 0:
        # Pad the running total when this document introduces extra topics.
        Kextra = propSS_b_newonly.K - propSS_newonly.K
        if Kextra > 0:
            propSS_newonly.insertEmptyComps(Kextra)
        propSS_newonly += propSS_b_newonly
    else:
        # First document: start the running total from a copy of its stats.
        propSS_newonly = propSS_b_newonly.copy()

In [13]:
# Now, we use suff stat manipulation to transform
# from the current stats (K=3)
# into valid stats representing the proposal
# (the compared vectors printed at the end of the notebook have 6 entries,
# i.e. K=6 once all three documents are included).
# Copy first, presumably so that curSS itself is left unmodified -- the
# replacement below is applied to the copy.
propSS_fromXSS = curSS.copy()
# Swap the component with uid=0 for the aggregated expansion statistics.
propSS_fromXSS.replaceCompWithExpansion(uid=0, xSS=propSS_newonly)

In [14]:
# Check that the statistics assembled from tracked expansion stats match the
# ones computed directly from the full proposed local parameters.
for key in ['sumLogPi', 'sumLogPiRemVec', 'gammalnTheta', 'gammalnSumTheta', 'slackTheta', 'slackThetaRem']:
    print(key)
    if hasattr(propSS_fromXSS, key):
        # Stored as a plain attribute
        arr_directFromLP = getattr(propSS_directFromLP, key)
        arr_fromXSS = getattr(propSS_fromXSS, key)

    elif propSS_fromXSS.hasELBOTerm(key):
        # Stored as an ELBO term
        arr_directFromLP = propSS_directFromLP.getELBOTerm(key)
        arr_fromXSS = propSS_fromXSS.getELBOTerm(key)

    else:
        # Without this branch a missing key would silently reuse the arrays
        # from the previous iteration (or hit a NameError on the first one),
        # so the assertion below would pass without checking anything.
        raise KeyError('%s is neither an attribute nor an ELBO term' % (key,))

    print('  %s  direct construction proposal' % (arr_directFromLP,))
    print('  %s  from tracked expansion stats' % (arr_fromXSS,))
    assert np.allclose(arr_fromXSS, arr_directFromLP)

sumLogPi
  [ -2.260 -17.514 -106.710 -58.391  -4.738  -1.474]  direct construction proposal
  [ -2.260 -17.514 -106.710 -58.391  -4.738  -1.474]  from tracked expansion stats
sumLogPiRemVec
  [  0.000   0.000   0.000  -6.263  -7.010  -3.961]  direct construction proposal
  [  0.000   0.000   0.000  -6.263  -7.010  -3.961]  from tracked expansion stats
gammalnTheta
  [ 124.558  18.837  63.111  10.484  39.918   1.842]  direct construction proposal
  [ 124.558  18.837  63.111  10.484  39.918   1.842]  from tracked expansion stats
gammalnSumTheta
  433.986512449  direct construction proposal
  433.986512449  from tracked expansion stats
slackTheta
  [  0.298   1.894   1.482   1.467   0.392   0.059]  direct construction proposal
  [  0.298   1.894   1.482   1.467   0.392   0.059]  from tracked expansion stats
slackThetaRem
  8.08177323293  direct construction proposal
  8.08177323293  from tracked expansion stats
