In [1]:
import logging
from collections import defaultdict
from pprint import pprint

import numpy as np
from gensim import corpora, models, similarities

#This is recommended when using gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

---
Latent Dirichlet Allocation
=====
***

[The gensim tutorial](https://radimrehurek.com/gensim/tut1.html)

####From Wikipedia, Latent Dirichlet Allocation

1. Tell the algorithm how many topics you think there are
 - intuitively
 - statistically

2. Assign every word to a topic in a semi-random manner (a Dirichlet distribution)
 - a word can appear in more than one topic

3. Iterate: Loop through every word in each topic and update its topic assignment, according to:

a. how prevalent is a word across topics, 

b. how prevalent are topics in the document

Looking at each topic, what proportion of the topic is down to each word? Certain words will favor certain topics.

Looking at each document, how prevalent are the topics? Divide up the document into the topics.

- I eat fish and vegetables
- Fish are pets
- My kitten eats fish

#####Ask for 2 topics:

Topic A: eat fish, eats fish, vegetables
    
Topic B: Fish, pets, kitten

#####Infer the content spread of each sentence by word count

- Sentence 1: 100% Topic A

- Sentence 2: 100% Topic B

- Sentence 3: 33% Topic B and 66% Topic A

#####Can derive the proportions that each word constitutes in given topics

- Topic A might comprise words in the following proportions: 40% eat, 40% fish, 20% vegetables

---
##Documents represented as strings
---

In [2]:
# Toy corpus: nine short texts, each treated as one document.
documents = [
    "Human machine computer interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

---
##Tokenize the documents, remove stop words and words that only appear once in the corpus
---

#####Firstly let's tokenize the documents, and remove stop words using a 'toy' stop-word list

In [3]:
# Toy stop-word list: tokens found in this set are dropped when the
# documents are tokenized.
stop_list = {'for', 'a', 'of', 'the', 'and', 'to', 'in'}
# Single-argument print(...) is valid as a statement on Python 2 and as a
# function call on Python 3; the bare `print x` form only parses on Python 2.
print(stop_list)

set(['a', 'and', 'for', 'of', 'to', 'in', 'the'])


In [4]:
# Lower-case every document, split it on whitespace, and keep only the
# tokens that are not in the stop-word list. Document order is preserved.
documents_without_stops = [
    [token for token in document.lower().split() if token not in stop_list]
    for document in documents
]

In [5]:
# Show the tokenized, stop-word-free documents.
# Single-argument print(...) parses on both Python 2 and Python 3.
print(documents_without_stops)

[['human', 'machine', 'computer', 'interface', 'lab', 'abc', 'computer', 'applications'], ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'management', 'system'], ['system', 'human', 'system', 'engineering', 'testing', 'eps'], ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'], ['generation', 'random', 'binary', 'unordered', 'trees'], ['intersection', 'graph', 'paths', 'trees'], ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'], ['graph', 'minors', 'survey']]


In [6]:
# Token -> occurrence count. defaultdict(int) makes every unseen token start at 0.
frequency = defaultdict(int)

In [7]:
# Count how often each token occurs across the whole corpus.
# The counter is (re)created here so that re-running this cell starts from
# zero. Accumulating into a dict created in another cell would add a second
# round of counts on every re-run, pushing every token past the "> 1"
# threshold that is used to drop tokens appearing only once.
frequency = defaultdict(int)
for text in documents_without_stops:
    for token in text:
        frequency[token] += 1

In [8]:
# Drop every token that occurs only once in the corpus; what remains of each
# document stays in its original position.
texts = [
    [token for token in tokens if frequency[token] > 1]
    for tokens in documents_without_stops
]

In [9]:
# Pretty-print the filtered token lists (one document per output line).
pprint(texts)

[['human', 'computer', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


#####Now convert documents to vectors, this is a bag-of-words representation

In [10]:
# Build the token <-> integer id mapping from the filtered documents.
dictionary = corpora.Dictionary(texts)
# One pre-formatted argument keeps print(...) valid on Python 2 and 3.
# The two spaces after "is" reproduce what the Python 2 statement
# `print "dictionary is ", dictionary` emitted.
print("dictionary is  {}".format(dictionary))

dictionary is  Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)


#####There are 12 distinct words, so each document will be represented by a 12-D vector

#####It is also possible to display the token ids that the words have been mapped to

In [11]:
# Show the word -> token id mapping held by the dictionary.
# One pre-formatted argument keeps print(...) valid on Python 2 and 3.
print("Dictionary to token {}".format(dictionary.token2id))

Dictionary to token {u'minors': 11, u'graph': 10, u'system': 5, u'trees': 9, u'eps': 8, u'computer': 0, u'survey': 4, u'user': 7, u'human': 1, u'time': 6, u'interface': 2, u'response': 3}


#####The function doc2bow is like scikit-learn's CountVectorizer. It counts the frequency of occurrence of words in each document and returns a sparse vector of (token id, count) pairs

In [12]:
# Build the bag-of-words corpus once: every document becomes a list of
# (token id, count) pairs. The result is reused below instead of calling
# doc2bow a second time for each document.
corpus = [dictionary.doc2bow(text) for text in texts]

# Show each token list followed by its bag-of-words encoding.
# Single-argument print(...) parses on both Python 2 and Python 3.
for text, bow in zip(texts, corpus):
    print(text)
    print(bow)

print("\n\n")

# Remember there are 12 tokens, and you need the dictionary-to-token
# information (dictionary.token2id) to work out the coding.
for bow in corpus:
    print(bow)

['human', 'computer', 'interface', 'computer']
[(0, 2), (1, 1), (2, 1)]
['survey', 'user', 'computer', 'system', 'response', 'time']
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
['eps', 'user', 'interface', 'system']
[(2, 1), (5, 1), (7, 1), (8, 1)]
['system', 'human', 'system', 'eps']
[(1, 1), (5, 2), (8, 1)]
['user', 'response', 'time']
[(3, 1), (6, 1), (7, 1)]
['trees']
[(9, 1)]
['graph', 'trees']
[(9, 1), (10, 1)]
['graph', 'minors', 'trees']
[(9, 1), (10, 1), (11, 1)]
['graph', 'minors', 'survey']
[(4, 1), (10, 1), (11, 1)]



[(0, 2), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


#####The LDA model converts the bag-of-words representation into a topic-space of lower dimensionality
#####LDA's topics are probability distributions over words
#####The distributions are inferred automatically from the corpus
#####Documents are then interpreted as a mixture of these topics

In [13]:
# Words are assigned to topics in a semi-random manner, so without a fixed
# seed the topics and probabilities change on every run. Seed NumPy's global
# generator immediately before training to make Restart & Run All repeatable.
# NOTE(review): this relies on gensim drawing from NumPy's global RNG when no
# explicit random state is given; newer gensim releases also accept
# LdaModel(random_state=...) -- confirm against the installed version.
LDA_SEED = 42
np.random.seed(LDA_SEED)

# Train a 2-topic model over the bag-of-words corpus.
lda_model = models.ldamodel.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=2,
    passes=10,
    iterations=1000,
)

In [14]:
# Print a short summary (2 words) for each of the 2 topics.
topic_summaries = lda_model.print_topics(num_topics=2, num_words=2)
for i, topic in enumerate(topic_summaries):
    # One pre-formatted argument keeps print(...) valid on Python 2 and 3.
    # The Python 2 statement `print header, topic` emitted no separator here
    # because the header ends in a newline, so the two are joined directly.
    print("\n\ntopic {:d}:\n{}".format(i, topic))



topic 0:
0.173*graph + 0.172*trees


topic 1:
0.161*computer + 0.159*user


---
##Classification to topic, with accompanying probability
---

In [15]:
# Three unseen documents to classify against the trained topics.
new_doc, new_doc1, new_doc2 = (
    'the grass is greener',
    'Human Computer Interaction',
    'Graphs are excellent data structures and are related to trees',
)

# Lower-case and whitespace-split each one, then encode it as a bag-of-words
# vector with the existing dictionary.
new_vec, new_vec1, new_vec2 = [
    dictionary.doc2bow(doc.lower().split())
    for doc in (new_doc, new_doc1, new_doc2)
]

#####Divides the documents up into topics

In [16]:
# Print the topic distribution of each new document, in the same order as
# before, as a list of (topic id, probability) pairs.
# Single-argument print(...) parses on both Python 2 and Python 3.
for bow in (new_vec, new_vec1, new_vec2):
    print(lda_model[bow])

[(0, 0.5), (1, 0.5)]
[(0, 0.20654388799196441), (1, 0.79345611200803567)]
[(0, 0.74678800888043906), (1, 0.25321199111956094)]


In [17]:
# Print the built-in documentation for the trained model's class (LdaModel).
# NOTE(review): this produces a very long output; consider clearing it before sharing.
help(lda_model)

Help on LdaModel in module gensim.models.ldamodel object:

class LdaModel(gensim.interfaces.TransformationABC)
 |  The constructor estimates Latent Dirichlet Allocation model parameters based
 |  on a training corpus:
 |  
 |  >>> lda = LdaModel(corpus, num_topics=10)
 |  
 |  You can then infer topic distributions on new, unseen documents, with
 |  
 |  >>> doc_lda = lda[doc_bow]
 |  
 |  The model can be updated (trained) with new documents via
 |  
 |  >>> lda.update(other_corpus)
 |  
 |  Model persistency is achieved through its `load`/`save` methods.
 |  
 |  Method resolution order:
 |      LdaModel
 |      gensim.interfaces.TransformationABC
 |      gensim.utils.SaveLoad
 |      __builtin__.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, bow, eps=0.01)
 |      Return topic distribution for the given document `bow`, as a list of
 |      (topic_id, topic_probability) 2-tuples.
 |      
 |      Ignore topics with very low probability (below `eps`).
 |  
 |  __init