# Example of GibbsLDA and vbLDA

This example requires three NLTK corpora to be installed: `nltk.corpus.reuters`, `nltk.corpus.words`, and `nltk.corpus.stopwords`.

You can download the corpora via `nltk.download()`.

In [1]:
import logging

import numpy as np
from ptm import GibbsLDA
from ptm import vbLDA
from ptm.nltk_corpus import get_reuters_cnt_ids
from ptm.utils import convert_cnt_to_list, get_top_words

## Loading the Reuters corpus from NLTK

Load 1,000 documents from the NLTK Reuters corpus, with a maximum vocabulary size of 10,000 words.

In [2]:
# Number of Reuters documents to load; also passed to both LDA constructors.
n_doc = 1000

# Fetch the vocabulary together with two per-document structures
# (presumably word ids and their counts -- confirm against ptm.nltk_corpus).
voca, doc_ids, doc_cnt = get_reuters_cnt_ids(
    num_doc=n_doc,
    max_voca=10000,
)

# Convert the id/count representation into the `docs` form that is later
# handed to GibbsLDA.fit; vbLDA.fit takes doc_ids and doc_cnt directly.
docs = convert_cnt_to_list(doc_ids, doc_cnt)

# The realised vocabulary may be smaller than the max_voca cap.
n_voca = len(voca)
print('Vocabulary size:%d' % n_voca)

Vocabulary size:4632


## Inference through Gibbs sampling

In [3]:
# Training settings; both values are reused by the variational Bayes section.
max_iter = 100  # number of iterations passed to fit()
n_topic = 10    # number of topics to infer

# Gibbs sampling is stochastic, so fix the seed to make a fresh
# "Restart & Run All" repeatable.
# NOTE(review): assumes ptm draws from NumPy's global RNG -- confirm.
np.random.seed(0)

# Keep GibbsLDA log records on this logger only; with propagate disabled they
# are not passed on to handlers of ancestor loggers (e.g. the root logger).
logger = logging.getLogger('GibbsLDA')
logger.propagate = False

# n_voca == len(voca); use the named value so this call matches the vbLDA one.
model = GibbsLDA(n_doc, n_voca, n_topic)
model.fit(docs, max_iter=max_iter)

2016-02-10 12:25:36 INFO:GibbsLDA:[ITER] 0,	elapsed time:0.88,	log_-447691.97
2016-02-10 12:25:37 INFO:GibbsLDA:[ITER] 1,	elapsed time:0.87,	log_-422574.80
2016-02-10 12:25:38 INFO:GibbsLDA:[ITER] 2,	elapsed time:0.91,	log_-405916.54
2016-02-10 12:25:38 INFO:GibbsLDA:[ITER] 3,	elapsed time:0.90,	log_-394912.30
2016-02-10 12:25:39 INFO:GibbsLDA:[ITER] 4,	elapsed time:0.87,	log_-387023.43
2016-02-10 12:25:40 INFO:GibbsLDA:[ITER] 5,	elapsed time:0.86,	log_-381367.42
2016-02-10 12:25:41 INFO:GibbsLDA:[ITER] 6,	elapsed time:0.83,	log_-376728.70
2016-02-10 12:25:42 INFO:GibbsLDA:[ITER] 7,	elapsed time:0.86,	log_-373480.43
2016-02-10 12:25:43 INFO:GibbsLDA:[ITER] 8,	elapsed time:0.87,	log_-370704.34
2016-02-10 12:25:44 INFO:GibbsLDA:[ITER] 9,	elapsed time:0.86,	log_-368371.24
2016-02-10 12:25:45 INFO:GibbsLDA:[ITER] 10,	elapsed time:0.87,	log_-366379.43
2016-02-10 12:25:45 INFO:GibbsLDA:[ITER] 11,	elapsed time:0.85,	log_-364986.46
2016-02-10 12:25:46 INFO:GibbsLDA:[ITER] 12,	elapsed time:0.86

### Print the 10 most probable words for each topic

In [4]:
# For every topic index, look up its 10 top words in the Gibbs model's
# TW attribute and print them as one comma-separated line.
for ti in range(n_topic):
    top_words = get_top_words(model.TW, voca, ti, n_words=10)
    joined_words = ','.join(top_words)
    print('Topic', ti, ':\t', joined_words)

Topic 0 :	 loss,quarter,first,profit,note,share,corp,earnings,tax,income
Topic 1 :	 rate,rise,march,week,last,days,fed,sugar,money,federal
Topic 2 :	 market,bank,canadian,investment,chairman,canada,assets,two,day,told
Topic 3 :	 stocks,last,wheat,grain,production,month,department,crop,dome,agriculture
Topic 4 :	 would,trade,could,growth,also,cut,european,west,one,market
Topic 5 :	 share,stock,offer,corp,price,april,common,cash,acquisition,new
Topic 6 :	 dollar,bank,yen,trade,economic,currency,exchange,current,around,would
Topic 7 :	 japan,may,record,meeting,japanese,april,agreement,market,pay,told
Topic 8 :	 one,gold,split,two,international,stock,corp,group,statement,insurance
Topic 9 :	 nil,oil,week,april,coffee,energy,previous,ended,total,demand


## Inference through variational Bayes

In [5]:
# Keep vbLDA log records on this logger only; with propagate disabled they
# are not passed on to handlers of ancestor loggers (e.g. the root logger).
logger = logging.getLogger('vbLDA')
logger.propagate = False

# Same corpus size, vocabulary size and topic count as the Gibbs model.
vbmodel = vbLDA(n_doc, n_voca, n_topic)

# Unlike GibbsLDA.fit, this fit() receives doc_ids and doc_cnt separately.
vbmodel.fit(
    doc_ids,
    doc_cnt,
    max_iter=max_iter,
)

2016-02-10 12:27:02 INFO:vbLDA:[ITER] 0,	elapsed time:0.74,	ELBO:-478442.17
2016-02-10 12:27:03 INFO:vbLDA:[ITER] 1,	elapsed time:0.74,	ELBO:-424328.44
2016-02-10 12:27:03 INFO:vbLDA:[ITER] 2,	elapsed time:0.75,	ELBO:-380822.76
2016-02-10 12:27:04 INFO:vbLDA:[ITER] 3,	elapsed time:0.76,	ELBO:-364495.11
2016-02-10 12:27:05 INFO:vbLDA:[ITER] 4,	elapsed time:0.74,	ELBO:-357748.04
2016-02-10 12:27:06 INFO:vbLDA:[ITER] 5,	elapsed time:0.68,	ELBO:-354538.79
2016-02-10 12:27:06 INFO:vbLDA:[ITER] 6,	elapsed time:0.71,	ELBO:-352846.53
2016-02-10 12:27:07 INFO:vbLDA:[ITER] 7,	elapsed time:0.74,	ELBO:-351740.02
2016-02-10 12:27:08 INFO:vbLDA:[ITER] 8,	elapsed time:0.75,	ELBO:-351030.75
2016-02-10 12:27:09 INFO:vbLDA:[ITER] 9,	elapsed time:0.75,	ELBO:-350547.28
2016-02-10 12:27:09 INFO:vbLDA:[ITER] 10,	elapsed time:0.74,	ELBO:-350240.43
2016-02-10 12:27:10 INFO:vbLDA:[ITER] 11,	elapsed time:0.70,	ELBO:-350014.87
2016-02-10 12:27:11 INFO:vbLDA:[ITER] 12,	elapsed time:0.70,	ELBO:-349820.68
2016-02-1

### Print the 10 most probable words for each topic

In [6]:
# For every topic index, look up its 10 top words in the variational model's
# _lambda attribute and print them as one comma-separated line.
for ti in range(n_topic):
    top_words = get_top_words(vbmodel._lambda, voca, ti, n_words=10)
    joined_words = ','.join(top_words)
    print('Topic', ti, ':\t', joined_words)

Topic 0 :	 dollar,would,japan,market,trade,may,currency,monetary,meeting,bank
Topic 1 :	 week,fed,oil,would,gas,tax,last,one,april,two
Topic 2 :	 first,share,quarter,profit,earnings,corp,per,loss,new,assets
Topic 3 :	 corp,dome,stock,offer,would,debt,share,acquisition,business,borg
Topic 4 :	 bank,quarter,first,march,rate,market,days,money,rose,new
Topic 5 :	 april,one,stock,dividend,share,record,split,may,five,bank
Topic 6 :	 trade,japan,would,also,deficit,japanese,agreement,told,new,official
Topic 7 :	 last,month,crop,economic,surplus,trade,february,forecast,wheat,sugar
Topic 8 :	 nil,loss,stocks,production,total,end,use,start,profit,supply
Topic 9 :	 profit,share,price,two,per,coffee,corp,would,one,oil
