In [3]:
import numpy as np
from scipy.special import gammaln, digamma

In [4]:
import bnpy
# Re-import bnpy so that edits made to the package on disk are picked up
# without restarting the kernel. NOTE: bare `reload` is the Python 2 builtin.
reload(bnpy)
# Short aliases for the two summary-statistic routines used in this notebook:
#   calcSummaryStats  : called below on complete local-parameter dicts (curLP, propLP)
#   calcXSummaryStats : called below on local parameters restricted to the newly
#                       proposed (expanded) topics only
calcSummaryStats = bnpy.allocmodel.topics.HDPTopicModel.calcSummaryStats
calcXSummaryStats = bnpy.allocmodel.topics.HDPTopicModel.calcSummaryStats_expansion

In [5]:
# Scalar multiplier applied to each document's beta vector when forming
# theta = DocTopicCount + alpha * Doc_beta in the cells below
# (presumably the HDP document-level concentration parameter; TODO confirm).
alpha = 1.0

# Split of a single state into a fixed set of states

We illustrate a split in four parts.

1. First, we show example local parameters for a sample current configuration.
2. Second, we show the proposed local parameters. *The construction method is left for later work; here we only discuss the constraints that the proposed parameters must abide by relative to the originals.*
3. Third, we show how to (trivially) obtain the relevant sufficient statistics for the proposal, via direct calculation. This is easy and affordable for small datasets, but in batch-by-batch processing we cannot touch all documents at once.
4. Finally, we show how, by collecting batch-specific statistics and aggregating them across batches, we can manipulate sufficient statistics to create whole-dataset proposal statistics **identical** to those of the direct method in step #3.


### STEP 1: Current local parameters.

First, we consider 3 existing states and 3 total documents. We imagine each document lives in its own batch, with its own value of beta, to simulate the fact that the entire vector beta may change from batch-to-batch.

In [6]:
# Current configuration (K=3 topics, 3 documents).
# Row d holds document d; column k holds topic k.

# Count of tokens assigned to each topic within each document.
DocTopicCount = np.array(
    [[10, 20, 10],
     [50, 30, 5],
     [5, 10, 0]])

# Per-document (i.e. per-batch) value of the beta vector over the 3 active topics.
# Rows deliberately differ, and each sums to less than one; the leftover
# mass is what the next cell turns into thetaRem.
Doc_beta = np.array(
    [[0.2, 0.2, 0.2],
     [0.4, 0.1, 0.1],
     [0.1, 0.1, 0.1]])

In [7]:
# Toy corpus: a one-word vocabulary and 3 documents of one token each
# (doc_range = [0, 1, 2, 3] delimits the token rows of each document).
Data = bnpy.data.WordsData(
    vocab_size=1,
    word_id=np.zeros(10),
    word_count=np.ones(10),
    doc_range=np.asarray([0, 1, 2, 3]))

# Document-topic parameters for the current K=3 configuration.
theta = DocTopicCount + alpha * Doc_beta

# Placeholder token-level responsibilities, one row per token and one column
# per topic. Drawn from a seeded local generator so that Restart & Run All
# reproduces the same values without touching numpy's global random state.
# NOTE(review): rows are not normalized to sum to one; the fields compared at
# the end of this notebook depend on theta only.
resp = np.random.RandomState(0).rand(10, theta.shape[1])

# Local parameters for the current configuration. thetaRem carries the share
# of alpha not covered by the active entries of each document's beta vector.
curLP = dict(
    DocTopicCount=DocTopicCount,
    theta=theta,
    resp=resp,
    thetaRem=alpha * (1 - Doc_beta.sum(axis=1)))

In [8]:
# Sufficient statistics of the current K=3 model, computed before any
# expansion move is attempted. doPrecompEntropy=1 is passed so the returned
# stats also carry the ELBO terms looked up in the final comparison cell.
curSS = calcSummaryStats(
    Data,
    curLP,
    doPrecompEntropy=1,
)

### STEP 2: Proposed local parameters

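Now, consider a split that divides *topic 1* (the first topic, i.e. column 0 of the Step 1 arrays) into 2 topics, denoted "1a" and "1b", which occupy columns 0 and 1 of the proposed arrays. This yields one possible proposal: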

*Again, the important thing is not the specific values here, but the fact that we can illustrate how to create valid parameters theta for all documents that exactly represent a specific beta vector (that sums to one) and a specific set of assignments (for every word in the doc).*

In [9]:
# Proposed configuration (K=4): the first topic is split into "1a" and "1b",
# stored in columns 0 and 1; the two untouched topics follow in columns 2 and 3.
alpha = 1.0

# In every row, columns 0 and 1 add up to the beta mass the first topic had
# in the current configuration (0.2, 0.4 and 0.1 respectively).
Doc_beta = np.array(
    [[0.1, 0.1, 0.2, 0.2],
     [0.3, 0.1, 0.1, 0.1],
     [0.09, .01, 0.1, 0.1]])

# Likewise, columns 0 and 1 add up to the first topic's current counts
# (10, 50 and 5); the remaining columns are unchanged.
DocTopicCount = np.array(
    [[5, 5, 20, 10],
     [40, 10, 30, 5],
     [0, 5, 10, 0]])

In [10]:
# Same toy corpus as for the current configuration: a one-word vocabulary and
# 3 documents of one token each (doc_range = [0, 1, 2, 3]).
Data = bnpy.data.WordsData(
    vocab_size=1,
    word_id=np.zeros(10),
    word_count=np.ones(10),
    doc_range=np.asarray([0, 1, 2, 3]))

# Document-topic parameters for the proposed K=4 configuration.
theta = DocTopicCount + alpha * Doc_beta

# Placeholder token-level responsibilities, one row per token and one column
# per proposed topic. Drawn from a seeded local generator so that Restart &
# Run All reproduces the same values without touching numpy's global state.
# NOTE(review): rows are not normalized to sum to one; the fields compared at
# the end of this notebook depend on theta only.
resp = np.random.RandomState(1).rand(10, theta.shape[1])

# Local parameters for the proposal. thetaRem carries the share of alpha not
# covered by the active entries of each document's beta vector.
propLP = dict(
    DocTopicCount=DocTopicCount,
    theta=theta,
    resp=resp,
    thetaRem=alpha * (1 - Doc_beta.sum(axis=1)))

### STEP 3: Constructing proposal statistics directly from the local parameters

We can compute all the relevant sufficient statistics about the current and proposed value of theta.

This is done compactly by an HDPTopicModel-specific function in bnpy.

In [11]:
# Reference answer: proposal statistics computed in one shot from the complete
# proposed local parameters (every document visible at once).
propSS_directFromLP = calcSummaryStats(
    Data,
    propLP,
    doPrecompEntropy=1,
)

In [12]:
# Peek at one field to get a feel for what the statistics hold.
#
# sumLogPi : aggregate log probability of each topic (one entry per topic,
#            in the proposal's own order: 1a, 1b, then the two untouched topics)
print(propSS_directFromLP.sumLogPi)

[-17.22825291  -5.59231043  -2.26010056 -17.51404028]


### STEP 4: Constructing proposal statistics via aggregating and manipulating batch-specific stats

Now, instead, we'll imagine that we created the proposal one batch at a time. Instead of tracking all the proposed thetas, for each document we track only the summary statistics for the *expanded/split* states, that is, topics $1a$ and $1b$.

In [13]:
# Build expansion-only statistics one document (= one batch) at a time and
# sum them, so that in the end every document is represented.
for d in range(Data.nDoc):
    # Grab the single document
    Data_b = Data.select_subset_by_mask([d])

    # resp holds one row per token, so it must be sliced by the token range of
    # document d rather than by d itself. (Indexing by d only happened to work
    # because every document in this toy corpus contains exactly one token.)
    tok_start = Data.doc_range[d]
    tok_stop = Data.doc_range[d + 1]

    # Local params for this document, restricted to the two new topics
    # (columns 0 and 1 of the proposal, i.e. "1a" and "1b").
    propLP_newonly_b = dict(
        DocTopicCount=DocTopicCount[d, :2][np.newaxis, :],
        theta=theta[d, :2][np.newaxis, :],
        resp=resp[tok_start:tok_stop, :2],
        thetaRem=0,
        )
    # The normalizing term is taken from the complete proposal, not from the
    # two-column slice above.
    # NOTE(review): propLP is not given a 'digammaSumTheta' entry where it is
    # built in this notebook; presumably the earlier calcSummaryStats call on
    # propLP adds it, so that cell must have been run first - confirm.
    propLP_newonly_b['digammaSumTheta'] = np.asarray([propLP['digammaSumTheta'][d]])

    # Create EXPANSION-ONLY statistics, which describe only the newly proposed
    # states; the new states are tracked under uids 101 and 102.
    propSS_newonly_b = calcXSummaryStats(
        Data_b, propLP_newonly_b, doPrecompEntropy=1, uids=[101, 102])

    # Aggregate across the loop: start from a copy of the first document's
    # stats, then add each later document's stats to it.
    if d == 0:
        propSS_newonly = propSS_newonly_b.copy()
    else:
        propSS_newonly += propSS_newonly_b

In [14]:
# Now, we use suff stat manipulation to transform
# from the current stats (K=3)
# into valid stats representing the proposal (K=4).
# Work on a copy so that the replacement below is not applied to curSS itself.
propSS_fromXSS = curSS.copy()
# Swap the component with uid 0 (the topic being split) for the aggregated
# expansion statistics of the two new topics.
propSS_fromXSS.replaceCompWithExpansion(uid=0, xSS=propSS_newonly)
# This expanded proposal always places the new comps last in order, so let's shuffle our original proposal that way too
# (direct order is [1a, 1b, 2, 3]; after reordering it is [2, 3, 1a, 1b]).
# NOTE(review): reorderComps appears to modify propSS_directFromLP in place, so
# running this cell a second time would permute it again and undo the alignment.
propSS_directFromLP.reorderComps([2,3,0,1])

In [15]:
# Check that every theta-related statistic obtained by manipulating tracked
# expansion stats matches the one computed directly from the full proposal.
for key in ['sumLogPi', 'sumLogPiRem', 'gammalnTheta', 'gammalnSumTheta', 'slackTheta', 'slackThetaRem']:
    print(key)
    if hasattr(propSS_fromXSS, key):
        # Stored as a regular field of the suff-stat object.
        arr_directFromLP = getattr(propSS_directFromLP, key)
        arr_fromXSS = getattr(propSS_fromXSS, key)

    elif propSS_fromXSS.hasELBOTerm(key):
        # Stored as a precomputed ELBO term.
        arr_directFromLP = propSS_directFromLP.getELBOTerm(key)
        arr_fromXSS = propSS_fromXSS.getELBOTerm(key)

    else:
        # Without this branch a missing key would silently re-compare the
        # arrays left over from the previous iteration (or raise a confusing
        # NameError on the first key), letting the check pass vacuously.
        raise KeyError(
            'Statistic %r is neither a field nor an ELBO term of propSS_fromXSS' % (key,))

    print('  %s  direct construction proposal' % (arr_directFromLP))
    print('  %s  from tracked expansion stats' % (arr_fromXSS))
    assert np.allclose(arr_fromXSS, arr_directFromLP)

sumLogPi
  [ -2.26010056 -17.51404028 -17.22825291  -5.59231043]  direct construction proposal
  [ -2.26010056 -17.51404028 -17.22825291  -5.59231043]  from tracked expansion stats
sumLogPiRem
  -17.2336554208  direct construction proposal
  -17.2336554208  from tracked expansion stats
gammalnTheta
  [ 124.55818972   18.83674356  113.42794164   19.55041697]  direct construction proposal
  [ 124.55818972   18.83674356  113.42794164   19.55041697]  from tracked expansion stats
gammalnSumTheta
  433.986512449  direct construction proposal
  433.986512449  from tracked expansion stats
slackTheta
  [ 0.2980702   1.89427998  1.732848    0.44828941]  direct construction proposal
  [ 0.2980702   1.89427998  1.732848    0.44828941]  from tracked expansion stats
slackThetaRem
  8.08177323293  direct construction proposal
  8.08177323293  from tracked expansion stats
