## TOPIC MODELLING

LDA (Latent Dirichlet Allocation) = a statistical model for discovering the abstract topics present in a collection of documents
* Docs with similar topics use similar groups of words
* Latent topics are discovered from groups of words that frequently occur together in docs

* LDA assumes that each document is a probability distribution over topics
* Each topic is a probability distribution over words

Doc1 = 0.4 Topic1 + ....

Doc2 = 0.3 Topic2 + ....


In [1]:
import gensim
import os
import collections
import smart_open
import random


## READ LEE CORPUS

In [2]:
# Set file names for train and test data.
# Both corpus files live under <gensim package dir>/test/test_data; os.path.join
# builds the paths with the platform separator instead of manual string joins.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [18]:
# Display the gensim test-data directory, built from the package location.
os.path.join(gensim.__path__[0], 'test', 'test_data')

'/Users/deepak/anaconda/lib/python3.6/site-packages/gensim/test/test_data'

In [3]:
# Display the first entry of gensim's package search path; the test-data
# directory is derived from this location.
gensim.__path__[0]

'/Users/deepak/anaconda/lib/python3.6/site-packages/gensim'

In [4]:
def read_corpus(fname, tokens_only=False):
    """Yield one preprocessed document per line of ``fname``.

    Each line is decoded as ISO-8859-1 and tokenized with
    ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file, one document per line.
        tokens_only: If True, yield the bare token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            zero-based line index.

    Yields:
        A token list, or ``TaggedDocument(tokens, [line_index])``.
    """
    # ``smart_open.smart_open`` is the deprecated name of ``smart_open.open``
    # and is not available in recent smart_open releases; use the old name
    # only when the new one does not exist.
    opener = getattr(smart_open, 'open', None) or smart_open.smart_open
    with opener(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [5]:
# Materialize both corpora in memory: tagged documents for training,
# bare token lists for testing.
train_corpus = [tagged_doc for tagged_doc in read_corpus(lee_train_file)]
test_corpus = [tokens for tokens in read_corpus(lee_test_file, tokens_only=True)]


In [20]:
# Peek at the first training document: a TaggedDocument whose tag is the
# document's line index in the corpus file (0, 1, 2, ...).
train_corpus[:1]

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [7]:
# Peek at the first two test documents: plain token lists with no tags,
# because the test corpus was read with tokens_only=True.
print(test_corpus[:2])

[['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist'], ['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to'

In [21]:
## TRAINING THE DOC2VEC MODEL ON TRAIN CORPUS.

In [22]:
# Doc2Vec configuration: 50-dimensional document vectors, min_count=2,
# 55 training epochs.
# `vector_size` / `epochs` are the current names of the old `size` / `iter`
# keywords, which gensim deprecated and then removed in 4.0.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=55)

In [23]:
# Build the model's vocabulary from the tagged training documents.
model.build_vocab(train_corpus)

In [24]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 5.74 s, sys: 233 ms, total: 5.97 s
Wall time: 3.42 s


2348196

In [25]:
# Infer a document vector for a new, already-tokenized sentence.
# NOTE(review): 'forrest' looks like a typo for 'forest'; left unchanged
# because editing the tokens would change the inferred vector.
model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])

array([ 0.07922315, -0.06632973,  0.09186766, -0.07410817, -0.02458601,
       -0.01648764,  0.01475526,  0.00538769, -0.0551978 , -0.07020958,
        0.1381208 , -0.02129775,  0.00366005, -0.0720093 ,  0.03157051,
        0.02103111,  0.08733623,  0.01193139, -0.05479198,  0.0523716 ,
       -0.07367259,  0.10764427,  0.01560509,  0.04638229,  0.03157345,
        0.02183392,  0.04597912, -0.02500262, -0.05768962, -0.05647735,
        0.01852136,  0.00293964, -0.0056356 ,  0.12171499, -0.10636989,
       -0.10650833,  0.09087385,  0.04298062,  0.00601824, -0.06252088,
       -0.04628289,  0.02344048,  0.12014269, -0.04814504, -0.03732086,
        0.03937462, -0.01960691,  0.00214821,  0.05465705,  0.02731813], dtype=float32)

In [12]:
# Sanity check: re-infer a vector for every training document and find where
# the document itself lands in the similarity ranking over all trained docs.
# `doc_id` and `sims` keep their last-iteration values after the loop and are
# read by the training-data comparison cell further down.
ranks, second_ranks = [], []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    # Every trained document, ordered from most to least similar
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    ranked_ids = [docid for docid, _ in sims]
    # Position of this document in its own ranking (0 = most similar to itself)
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)
    # Keep the runner-up (id, similarity) pair as well
    second_ranks.append(sims[1])

In [29]:
# Only the last expression of a cell is displayed, so the rank histogram is
# printed explicitly; as a bare expression its value was silently discarded.
print(collections.Counter(ranks))  # Results vary due to random seeding and very small corpus
# Self-similarity rank of the first 10 training documents (0 = the document
# is its own nearest neighbour).
ranks[0:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

### MOST, MEDIAN AND LEAST SIMILAR DOCS TO A GIVEN DOC

## ON TRAINING DATA

In [15]:
# Uses `doc_id` and `sims` as left by the ranking loop (its last iteration):
# print the query document, then its most, median and least similar
# training documents.
query_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): «{}»\n'.format(doc_id, query_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
checkpoints = [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in checkpoints:
    neighbour = sims[index]  # (train doc id, similarity)
    print(u'%s %s: «%s»\n' % (label, neighbour, ' '.join(train_corpus[neighbour[0]].words)))


Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

## On test Set

In [17]:
# Pick a random document from the test corpus and infer a vector from the model.
# randrange excludes its upper bound, so doc_id is always a valid index;
# random.randint(0, len(test_corpus)) includes the upper bound and could
# return len(test_corpus), raising IndexError on the lookup below.
doc_id = random.randrange(len(test_corpus))
inferred_vector = model.infer_vector(test_corpus[doc_id])
# Rank every trained document by similarity to the inferred vector
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (3): «radical armed islamist group with ties to tehran and baghdad has helped al qaida establish an international terrorist training camp in northern iraq kurdish officials say intelligence officers in the autonomous kurdish region of iraq told the guardian that the ansar al islam supporters of islam group is harbouring up to al qaida members in string of villages it controls along the iraq iran border most of them fled afghanistan after the us led offensive but officials from the patriotic union of kurdistan puk which controls part of north east iraq claim an abnormal number of recruits are making their way to the area from jordan syria and egypt»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (35, 0.6969833374023438): «spokesman for afghanistan defence ministry claims osama bin laden has fled to pakistan defence ministry spokesman mohamad habeel says the saudi born dissident is in hiding under the protection of supporters of radical isla