# Import PNAS test data set

In [1]:
import DAL
pnas = DAL.create('pnas')
testid = pnas.test_articles()

# Look up every test article by id and collect the article bodies and
# titles as two parallel lists, in the same order as testid.
body = [pnas.byid(article_id)['body'] for article_id in testid]
title = [pnas.byid(article_id)['title'] for article_id in testid]

# Online Latent Dirichlet Allocation with Wikipedia articles

In [12]:


import cPickle, string, numpy, getopt, sys, random, time, re, pprint

import onlineldavb
import wikirandom

def main(body):
    """
    Fits online VB for LDA on randomly downloaded Wikipedia articles and
    then runs an E step on the supplied documents.

    Args:
        body: the documents to score with the fitted model. It is passed
            unchanged to ``olda.do_e_step`` (presumably a list of raw text
            strings like ``docset`` -- confirm against onlineldavb).

    Returns:
        A ``(gamma, rambda)`` tuple: ``gamma`` is the first value returned
        by ``olda.do_e_step(body)`` and ``rambda`` is ``olda._lambda`` as
        it stands after the last update.

    Side effects:
        Downloads articles through ``wikirandom``, prints one progress line
        per iteration, and every 10th iteration writes
        ``/tmp/lambda-<iteration>.dat`` and ``/tmp/gamma-<iteration>.dat``.
    """

    # The number of documents to analyze each iteration
    batchsize = 32
    # The corpus size handed to OnlineLDA (nominal number of Wikipedia
    # documents)
    D = 3e5
    # The number of topics
    K = 100

    # How many mini-batches (iterations) to run; each one downloads
    # batchsize articles.
    documentstoanalyze = 100

    # Our vocabulary, one entry per line. The context manager closes the
    # file handle as soon as the lines are read.
    # NOTE(review): the lines keep their trailing newline; presumably
    # OnlineLDA normalizes the words itself -- confirm.
    with open('/tmp/pnasvocab', 'r') as vocabfile:
        vocab = vocabfile.readlines()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run a fixed number of mini-batches (not until D documents are seen).
    for iteration in range(0, documentstoanalyze):
        # Download some articles; the article names are not needed here.
        (docset, _) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print('%d:  rho_t = %f,  held-out perplexity estimate = %f' %
              (iteration, olda._rhot, numpy.exp(-perwordbound)))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('/tmp/lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('/tmp/gamma-%d.dat' % iteration, gamma)

    # Score the caller's documents with the fitted model; only the first
    # returned value is used.
    (gamma, _) = olda.do_e_step(body)
    rambda = olda._lambda

    return (gamma, rambda)
    
    
    
# Fit the model and keep both results at notebook scope: later cells read
# gamma (normalization) and rambda (topic display).
gamma, rambda = main(body)

downloaded 0/32 articles...
downloaded The_Milt_Grant_Show. parsing...
downloaded WMTB-FM. parsing...
downloaded Moribund. parsing...
downloaded Port_Albert,_New_Zealand. parsing...
downloaded Tetrallantos. parsing...
downloaded Carla_Guelfenbein. parsing...
downloaded 2008_Fed_Cup_Americas_Zone_Group_II_%E2%80%93_Pool_A. parsing...
downloaded 2010_TD54. parsing...
downloaded 8/32 articles...
downloaded Universal_Master_Control_Station. parsing...
downloaded Casso. parsing...
downloaded Holiest_sites_in_Islam. parsing...
downloaded Dobrzynka,_Warmian-Masurian_Voivodeship. parsing...
downloaded New_Chengdu_Airport. parsing...
downloaded Australia_at_the_1964_Summer_Paralympics. parsing...
downloaded Nils_Carls%C3%A9n. parsing...
downloaded Michael_Farenas. parsing...
downloaded 16/32 articles...
downloaded SMF. parsing...
downloaded Diego_Ferr%C3%ADn. parsing...
downloaded Edabiya. parsing...
downloaded 1970_San_Francisco_Giants_season. parsing...
downloaded Surf_Life_Saving_Australia. 

# Normalize gamma to compute the posterior mean

In [14]:
import numpy as np
# Divide each row of gamma by that row's total so every row sums to 1.
row_totals = gamma.sum(axis=1)
posterior = gamma / row_totals[:, np.newaxis]

# Print the 10 most similar pairs of articles

In [15]:
import numpy as np
import operator 
n = len(body)
# Topic weights below this value are ignored when comparing two articles.
min_weight = 1e-05
# How many of the best-ranked pairs to print.
top_pairs = 10

# Maps "<title i> & <title j>" to the pair's dissimilarity score.
# NOTE: setdefault keeps the first score seen for a key, so two pairs whose
# titles produce the same key are only recorded once.
sim_dict = {}
for i in range(n):
    x = posterior[i, :]
    for j in range(i + 1, n):
        y = posterior[j, :]
        # Compare only the topics that carry weight in BOTH articles; the
        # mask covers however many topics posterior has, rather than
        # assuming exactly 100.
        shared = (x >= min_weight) & (y >= min_weight)
        # Sum of absolute log-ratio differences; smaller means more similar.
        # NOTE(review): a pair sharing few topics sums few terms and so
        # scores low (0 when none are shared) -- confirm this is intended.
        sim_value = np.abs(np.log(x[shared]) - np.log(y[shared])).sum()
        sim_key = title[i] + " " + "&" + " " + title[j]
        sim_dict.setdefault(sim_key, sim_value)


# Ascending by score, so the most similar pairs come first.
sortsim = sorted(sim_dict.items(), key=operator.itemgetter(1))
# temp_dict[0] holds the ranked keys, temp_dict[1] the ranked scores
# (temp_dict is an empty list when there are no pairs).
temp_dict = [list(t) for t in zip(*sortsim)]

print("I print out the first 10 most similar pair of articles:")
# Cap at the number of available pairs so small inputs do not raise
# IndexError.
for i in range(min(top_pairs, len(sortsim))):
    print("The %s rank similar pair of articles' title would be "
          % (i + 1) + temp_dict[0][i])
    


I print out the first 10 most similar pair of articles:
The 1 rank similar pair of articles' title would be Retinotopic mapping of lateral geniculate nucleus in humans using functional magnetic resonance imaging & Molecular evidence for multiple origins of Insectivora and for a new order of endemic African insectivore mammals
The 2 rank similar pair of articles' title would be Retinotopic mapping of lateral geniculate nucleus in humans using functional magnetic resonance imaging & Memory&apos;s echo: Vivid remembering reactivates sensory-specific cortex
The 3 rank similar pair of articles' title would be Memory&apos;s echo: Vivid remembering reactivates sensory-specific cortex & Reactivation of encoding-related brain activity during memory retrieval
The 4 rank similar pair of articles' title would be Generation of cDNA expression libraries enriched for&thinsp;in-frame&thinsp;sequences & Reactivation of encoding-related brain activity during memory retrieval
The 5 rank similar pair of a

# From the results above, it is clear that the most topically similar articles are in the field of biology.

# Most prominent words in each topic

In [17]:
import sys, os, re, random, math, urllib2, time, cPickle
import numpy

import onlineldavb

def main(testlambda=None, vocab_path='/tmp/pnasvocab', top_n=10):
    """
    Displays topics fit by onlineldavb.py. The first column gives the
    (expected) most prominent words in the topics, the second column
    gives their (expected) relative prominence.

    NOTE: this definition reuses the name ``main`` and therefore replaces
    the earlier ``main(body)`` once this cell has run.

    Args:
        testlambda: 2-D array indexed as ``testlambda[k, :]``, one row per
            topic. Defaults to the notebook-level ``rambda``.
        vocab_path: path of the vocabulary file, one word per line.
        top_n: maximum number of words printed per topic.

    Returns:
        None. The topics are printed to stdout.
    """
    if testlambda is None:
        # Original behaviour: read the lambda produced by the fitting cell.
        testlambda = rambda

    # Strip each line so the trailing newline does not split a word and its
    # weight across two output lines; the context manager closes the file.
    # NOTE(review): assumes column i of testlambda corresponds to line i of
    # the vocabulary file -- confirm against how OnlineLDA indexes its vocab.
    with open(vocab_path, 'r') as vocabfile:
        vocab = [line.strip() for line in vocabfile]

    for k in range(0, len(testlambda)):
        lambdak = testlambda[k, :]
        # Normalize the row so the weights of one topic sum to 1.
        lambdak = lambdak / lambdak.sum()
        # Pair each weight with its column index, largest weight first.
        temp = zip(lambdak, range(0, len(lambdak)))
        temp = sorted(temp, key=lambda x: x[0], reverse=True)
        print('topic %d:' % (k))
        # Never index past the number of available words.
        for i in range(0, min(top_n, len(temp))):
            print('%20s  \t---\t  %.4f' % (vocab[temp[i][1]], temp[i][0]))
        print('')

# Run the topic display when this code executes as the top-level module.
# The call resolves to the main() defined just above, which has replaced the
# earlier main(body) under the same name.
if __name__ == '__main__':
    main()

topic 0:
            femora 
  	---	  0.0031
          ClaphamD 
  	---	  0.0014
          CroteauR 
  	---	  0.0013
          upstream 
  	---	  0.0012
          suckling 
  	---	  0.0012
              CFSE 
  	---	  0.0011
        underwater 
  	---	  0.0010
            ChangB 
  	---	  0.0010
  chemosensitivity 
  	---	  0.0008
             ShenY 
  	---	  0.0008

topic 1:
        Biozentrum 
  	---	  0.0303
            femora 
  	---	  0.0282
       lymphotoxin 
  	---	  0.0235
          neoplasm 
  	---	  0.0157
              moon 
  	---	  0.0142
            silico 
  	---	  0.0141
             DEIRA 
  	---	  0.0129
         Disulfide 
  	---	  0.0128
              fire 
  	---	  0.0115
             ShenY 
  	---	  0.0103

topic 2:
            femora 
  	---	  0.1209
          suckling 
  	---	  0.0583
        underwater 
  	---	  0.0545
          neoplasm 
  	---	  0.0446
             ShenY 
  	---	  0.0361
         Disulfide 
  	---	  0.0185
         Predicted 
  	---	  0.0179

# From the outcomes above, the proportions of the 10 most prominent words in each topic are quite small. Hence, the topics learned from these randomly selected Wikipedia articles are only weakly related to the PNAS dataset.