In [2]:
import numpy as np
from scipy.special import gammaln, digamma

In [3]:
# Import bnpy, then re-execute the already-imported top-level package.
# `reload` is the Python 2 builtin; it reloads only the named module object.
import bnpy
reload(bnpy)
# Short aliases for the two summary routines exercised throughout this notebook:
# the regular one, and the "_expansion" variant used for the split proposals in Step 4.
calcSummaryStats = bnpy.allocmodel.topics.HDPTopicModel.calcSummaryStats
calcXSummaryStats = bnpy.allocmodel.topics.HDPTopicModel.calcSummaryStats_expansion

In [4]:
# Scalar multiplier applied to the per-document beta values (and their remainder)
# when theta and thetaRem are formed in the cells below.
alpha = 1.0

# Trying two split moves (one on topic 2, another on topic 3) into a fixed set of states.

We illustrate a split in four parts.

1. First, we show example local parameters for a sample current configuration.
2. Second, we show the proposed local parameters. *The construction method is left for later work; we only discuss the constraints the proposed parameters must satisfy relative to the originals.*
3. Third, we show how to (trivially) obtain the relevant sufficient statistics for the proposal, via direct calculation. This is easy and affordable for small datasets, but in batch-by-batch processing we cannot touch all documents at once.
4. Finally, we show how, by collecting batch-specific statistics and aggregating across batches, we can manipulate sufficient statistics to create whole-dataset proposal statistics **identical** to those from the direct method in step #3.


### STEP 1: Original parameters

In [27]:
# Build a toy corpus of 3 documents. The word content is irrelevant here,
# because we only study upper-level inference of topic probabilities.
# NOTE(review): word_id / word_count hold 10 entries, yet doc_range ends at 3 --
# confirm that leaving the remaining tokens outside every document is intended.
Data = bnpy.data.WordsData(
    vocab_size=1,
    word_id=np.zeros(10),
    word_count=np.ones(10),
    doc_range=np.asarray([0, 1, 2, 3]),
    )

In [28]:
# Current configuration with K=3 topics.
# Rows index documents, columns index topics.

# Per-document beta values; each row sums to less than 1,
# so a positive remainder is left over for every document.
curDoc_beta = np.array(
    [[0.2, 0.2, 0.2],
     [0.4, 0.1, 0.1],
     [0.1, 0.1, 0.1]])

# Per-document token counts assigned to each topic.
curDocTopicCount = np.array(
    [[10, 20, 10],
     [50, 30, 5],
     [5, 10, 0]])

In [29]:
# Leftover beta mass per document (one entry per document).
curDoc_betaRem = 1 - curDoc_beta.sum(axis=1)
curTheta = curDocTopicCount + alpha * curDoc_beta
curThetaRem = alpha * curDoc_betaRem

# Per-document total: row-sum of theta plus THAT document's remainder term.
# Sum over topics first, then add the length-D remainder vector. Adding the vector
# to the D x K matrix before summing would broadcast it along the topic axis,
# giving every document curThetaRem.sum() instead of its own remainder; that
# form only avoids a shape error when D happens to equal K (both are 3 here).
curSumTheta = curTheta.sum(axis=1) + curThetaRem
assert curSumTheta.shape == (curTheta.shape[0],)
curdigammaSumTheta = digamma(curSumTheta)

# Local parameters for the current configuration.
curLP = dict(
    DocTopicCount=curDocTopicCount,
    theta=curTheta,
    thetaRem=curThetaRem,
    # Random responsibilities: one row per token, one column per topic.
    resp=np.random.rand(Data.nUniqueToken, curTheta.shape[1]),
    digammaSumTheta=curdigammaSumTheta,
    )

In [30]:
# Summarize the current (K=3) local parameters over the whole dataset,
# with the doPrecompEntropy and doTrackTruncationGrowth flags switched on.
curSS = calcSummaryStats(Data, curLP, doPrecompEntropy=1, doTrackTruncationGrowth=1)

### STEP 2: Create proposal for expansion of topic 2 and topic 3

In [59]:
# Proposed expansion of topic 2 (column index 1 of the current arrays) into three new topics.
# Rows index documents. Each row must add up to that document's original value
# for the topic being split, which the assertions below verify.
xDoc_beta_2 = np.array(
    [[0.10, 0.05, 0.05],
     [0.07, 0.01, 0.02],
     [0.02, 0.02, 0.06]])
xDocTopicCount_2 = np.array(
    [[10, 5, 5],
     [5, 5, 20],
     [3, 3, 4]])

# The split must conserve both the beta mass and the token count of topic 2.
assert np.allclose(np.sum(xDoc_beta_2, axis=1), curDoc_beta[:, 1])
assert np.allclose(np.sum(xDocTopicCount_2, axis=1), curDocTopicCount[:, 1])

In [76]:
# Proposed expansion of topic 3 (column index 2 of the current arrays) into three new topics.
# Rows index documents. Row totals must reproduce the original topic-3 column.
xDoc_beta_3 = np.array(
    [[0.10, 0.05, 0.05],    # total 0.2
     [0.07, 0.01, 0.02],    # total 0.1
     [0.02, 0.04, 0.04]])   # total 0.1, the beta of topic 3 in the third document
xDocTopicCount_3 = np.array(
    [[10, 0, 0],    # total 10, the count of topic 3 in the first document
     [0, 3, 2],     # total 5
     [0, 0, 0]])    # total 0

# The split must conserve both the beta mass and the token count of topic 3.
assert np.allclose(np.sum(xDoc_beta_3, axis=1), curDoc_beta[:, 2])
assert np.allclose(np.sum(xDocTopicCount_3, axis=1), curDocTopicCount[:, 2])


### Step 3: Create suff stats for the *combined* proposal, expanding topic 2 and topic 3 directly

In [79]:
# Column indices of the current topics that are carried over unchanged.
keepCompIDs = [0]

# Proposal layout along the topic axis:
# [kept topics | expansion of topic 2 | expansion of topic 3]
propDocTopicCount = np.hstack([curDocTopicCount[:, keepCompIDs], xDocTopicCount_2, xDocTopicCount_3])
propDoc_beta = np.hstack([curDoc_beta[:, keepCompIDs], xDoc_beta_2, xDoc_beta_3])

# The proposal only redistributes mass, so per-document totals must be unchanged.
assert np.allclose(propDocTopicCount.sum(axis=1),
                   curDocTopicCount.sum(axis=1))
assert np.allclose(propDoc_beta.sum(axis=1),
                   curDoc_beta.sum(axis=1))

propK = propDoc_beta.shape[1]
propTheta = propDocTopicCount + alpha * propDoc_beta
# Take the number of resp rows from the dataset instead of a hard-coded 10,
# consistent with how curLP['resp'] is sized.
propResp = np.random.rand(Data.nUniqueToken, propK)
# Because per-document totals are conserved, the remainder term and the digamma of
# the per-document theta total are reused unchanged from the current configuration.
propLP = dict(
    DocTopicCount=propDocTopicCount,
    theta=propTheta,
    resp=propResp,
    thetaRem=curThetaRem,
    digammaSumTheta=curLP['digammaSumTheta'])

In [80]:
# Reference answer: summarize the combined proposal directly from its local parameters,
# using all documents at once. Step 4 must reproduce these statistics.
propSS_directFromLP = calcSummaryStats(Data, propLP, doPrecompEntropy=1, doTrackTruncationGrowth=1)

### Step 4: Create suff stats for *combined* proposal, by stitching together stats for each individual move.

In [108]:
def makeExpansionLP(d, xDocTopicCount, xDoc_beta):
    """Build single-document local parameters holding only the expansion topics.

    Args
    ----
    d : int
        Index of the document to extract.
    xDocTopicCount : 2D array, one row per document, one column per new topic
        Token counts proposed for the expansion topics.
    xDoc_beta : 2D array, same shape as xDocTopicCount
        Beta values proposed for the expansion topics.

    Returns
    -------
    dict with fields DocTopicCount, theta, resp, thetaRem, digammaSumTheta.
    thetaRem and digammaSumTheta are taken from the current configuration
    (curThetaRem, curLP) for document d.
    """
    xCount_d = xDocTopicCount[d, :][np.newaxis, :]
    xBeta_d = xDoc_beta[d, :][np.newaxis, :]
    return dict(
        DocTopicCount=xCount_d,
        theta=xCount_d + alpha * xBeta_d,
        resp=np.random.rand(1, xDocTopicCount.shape[1]),  # values do not matter
        thetaRem=curThetaRem[d],
        digammaSumTheta=np.asarray([curLP['digammaSumTheta'][d]]),
        )


# Running totals of the expansion statistics. Reset on every execution of this cell
# so that re-running it never accumulates on top of stale results.
propSS_newonly_2 = None
propSS_newonly_3 = None

# Treat each document as its own batch.
for d in range(Data.nDoc):
    # Grab the single document
    Data_b = Data.select_subset_by_mask([d])

    # Local and summary step using only the expansion terms from topic 2.
    # NOTE(review): three expansion topics but only two uids are passed here
    # (and below for topic 3) -- confirm against calcSummaryStats_expansion.
    propLP_newonly_2b = makeExpansionLP(d, xDocTopicCount_2, xDoc_beta_2)
    propSS_newonly_2b = calcXSummaryStats(Data_b, propLP_newonly_2b, uids=[200,201],
                                          doPrecompEntropy=1, doTrackTruncationGrowth=1)

    # Local and summary step using only the expansion terms from topic 3.
    propLP_newonly_3b = makeExpansionLP(d, xDocTopicCount_3, xDoc_beta_3)
    propSS_newonly_3b = calcXSummaryStats(Data_b, propLP_newonly_3b, uids=[300,301],
                                          doPrecompEntropy=1, doTrackTruncationGrowth=1)

    # Aggregate across batches, so in the end all documents are represented.
    if propSS_newonly_2 is None:
        propSS_newonly_2 = propSS_newonly_2b.copy()
        propSS_newonly_3 = propSS_newonly_3b.copy()
    else:
        propSS_newonly_2 += propSS_newonly_2b
        propSS_newonly_3 += propSS_newonly_3b

In [103]:
# Now, we use suff stat manipulation to transform
# from the current stats (K=3)
# into valid stats representing the proposal
# (K=7: 1 kept topic + 3 topics expanding topic 2 + 3 topics expanding topic 3).
# Work on a copy so curSS itself is left untouched.
propSS_fromXSS = curSS.copy()
# Swap out the component with uid=1 (topic 2), then uid=2 (topic 3),
# for the aggregated expansion statistics gathered batch-by-batch above.
propSS_fromXSS.replaceCompWithExpansion(uid=1, xSS=propSS_newonly_2)
propSS_fromXSS.replaceCompWithExpansion(uid=2, xSS=propSS_newonly_3)

In [104]:
# Compare every tracked statistic of the stitched-together proposal against
# the directly computed reference; any mismatch fails the assertion.
np.set_printoptions(linewidth=100, precision=2)
for key in ['sumLogPi', 'sumLogPiRemVec', 'gammalnTheta', 'gammalnSumTheta', 'slackTheta', 'slackThetaRem']:
    print(key)
    # A statistic is stored either as a plain attribute or as an ELBO term.
    if hasattr(propSS_fromXSS, key):
        arr_directFromLP = getattr(propSS_directFromLP, key)
        arr_fromXSS = getattr(propSS_fromXSS, key)
    elif propSS_fromXSS.hasELBOTerm(key):
        arr_directFromLP = propSS_directFromLP.getELBOTerm(key)
        arr_fromXSS = propSS_fromXSS.getELBOTerm(key)
    else:
        # Without this branch a missing key would silently re-compare the arrays
        # left over from the previous iteration (or raise NameError on the first).
        raise KeyError('Statistic not found in propSS_fromXSS: %s' % (key,))

    # Single-argument print(...) behaves identically as a Python 2 statement
    # and as a Python 3 function call; the 1-tuple keeps %-formatting unambiguous.
    print('  %s  direct construction proposal' % (arr_directFromLP,))
    print('  %s  from tracked expansion stats' % (arr_fromXSS,))
    assert np.allclose(arr_fromXSS, arr_directFromLP)

sumLogPi
  [ -3.18  -6.18  -6.93  -5.13 -73.93 -55.98 -56.47]  direct construction proposal
  [ -3.18  -6.18  -6.93  -5.13 -73.93 -55.98 -56.47]  from tracked expansion stats
sumLogPiRemVec
  [  0.     0.     0.     0.     0.     0.   -17.23]  direct construction proposal
  [  0.     0.     0.     0.     0.     0.   -17.23]  from tracked expansion stats
gammalnTheta
  [ 162.71   17.02    7.16   44.52   19.55    6.87    6.17]  direct construction proposal
  [ 162.71   17.02    7.16   44.52   19.55    6.87    6.17]  from tracked expansion stats
gammalnSumTheta
  433.986512449  direct construction proposal
  433.986512449  from tracked expansion stats
slackTheta
  [ 0.62  0.39  0.17  0.23  2.55  2.38  2.42]  direct construction proposal
  [ 0.62  0.39  0.17  0.23  2.55  2.38  2.42]  from tracked expansion stats
slackThetaRem
  8.08177323293  direct construction proposal
  8.08177323293  from tracked expansion stats
