In [3]:
# import dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
from nltk.stem.porter import PorterStemmer
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.io.json import json_normalize #package for flattening json in pandas df
#from fastai.imports import *
#from pandas_summary import DataFrameSummary
from IPython.display import display
from sklearn import metrics
import os
from gensim.models.coherencemodel import CoherenceModel
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_auc_score
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sns.set_style("darkgrid")



In [4]:
TRAINDATAPATH = "PAN14/pan14_train_english-essays/"
TESTDATAPATH = "PAN14/pan14_test01_english-essays/"
FNAMES = ['known01','known02','known03','known04','known05', 'unknown']

def train_lda(corpus, dictionary, num_topics=20, passes=2, iterations=100, alpha=1e-2, eta=0.5e-2):
    """
    This function trains the lda model
    We setup parameters like number of topics, the chunksize to use in Hoffman method
    We also do 2 passes of the data since this is a small dataset, so we want the distributions to stabilize
    """
    #num_topics = 100
    #chunksize = 300
    
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, 
                   alpha=alpha, eta=eta, minimum_probability=0.0, passes=passes, iterations=iterations, random_state=1033, dtype=np.float64)
    t2 = time.time()
    print("Time to train LDA model on ", len(docs), "articles: ", (t2-t1)/60, "min")
    return lda

def data_to_topics(data, dictionary, cols):
    df = data.drop(columns=cols)
    for col in cols:
        df[col]=data[col].apply(lambda x: lda.get_document_topics(dictionary.doc2bow(x)) if x is not None else None)
    return df

In [5]:
def read_dataset(path):
    ds=pd.read_json(path+'/truth.json')
    ds=json_normalize(ds['problems'])
    ds['known01']=None
    ds['known02']=None
    ds['known03']=None
    ds['known04']=None
    ds['known05']=None
    ds['unknown']=None
    ds.set_index('name', drop=True, inplace=True)
    ds=ds[['known01','known02','known03','known04','known05', 'unknown', 'answer']]
    dirs = []
    docs = []

    for i, x in enumerate(os.walk(path)):
        if i:
            for fname in x[2]:
                with open(path+dirs[i-1]+'/'+fname, 'r') as f:
                    text = f.read()
                    doc = nltk.word_tokenize(text.strip())
                    docs.append(doc)
                    ds.loc[dirs[i-1],fname[:-4]]=doc
        else:
            dirs = x[1]

    return ds, docs

train, docs = read_dataset(TRAINDATAPATH)


In [8]:
train['unknown'].apply(lambda' '.join)

name
EE001    [﻿, In, the, name, of, Religion, ,, you, can, ...
EE004    [﻿, The, Decline, of, the, Birth-rate, in, Swe...
EE005    [﻿, ``, Taboo, or, Not, Taboo, '', In, the, ar...
EE008    [﻿, Granting, Homosexual, Couples, the, Right,...
EE013    [﻿, Politics, and, Education, On, numerous, oc...
EE014    [﻿, GIRLS, BEGIN, TO, DIET, AT, THE, AGE, OF, ...
EE018    [﻿, THE, RIGHT, SIDE, THERE, SHOULD, BE, MORE,...
EE021    [﻿, Stop, women, 's, right, to, an, abortion-,...
EE022    [﻿, Evaluation, -, My, English, I, think, that...
EE025    [﻿, THE, CAPITAL, PUNISHMENT, IS, NEVER, RIGHT...
EE027    [﻿, NEW, AGE, a, Trend, in, Our, Workplaces, ....
EE035    [﻿, THE, GROWTH, OF, NAZISM, IN, SOCIETY, Duri...
EE038    [﻿, Democracy, ,, a, delusion, ?, A, short, cr...
EE040    [﻿, When, Harriet, and, David, meet, they, kno...
EE043    [﻿Introduction, Geoffrey, Chaucer, is, one, of...
EE044    [﻿, DISCO, 'S, OUT, ,, MURDER, 'S, IN, ., I, h...
EE045    [﻿, Would, a, Spelling, Reform, of, the, E

In [4]:

test, _ = read_dataset(TESTDATAPATH)
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

2018-12-26 04:06:16,050 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-12-26 04:06:16,584 : INFO : built Dictionary(21051 unique tokens: ['\x03', '%', '&', "''", '(']...) from 729 documents (total 680137 corpus positions)


In [5]:
lda = train_lda(corpus, dictionary, num_topics=20, passes=2, iterations=100, alpha='auto', eta='auto')


2018-12-26 04:06:16,998 : INFO : using autotuned alpha, starting with [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]
2018-12-26 04:06:17,005 : INFO : using serial LDA version on this node
2018-12-26 04:06:17,067 : INFO : running online (multi-pass) LDA training, 20 topics, 2 passes over the supplied corpus of 729 documents, updating model once every 729 documents, evaluating perplexity every 729 documents, iterating 100x with a convergence threshold of 0.001000
2018-12-26 04:06:22,768 : INFO : -11.521 per-word bound, 2938.7 perplexity estimate based on a held-out corpus of 729 documents with 680137 words
2018-12-26 04:06:22,768 : INFO : PROGRESS: pass 0, at document #729/729
2018-12-26 04:06:26,288 : INFO : optimized alpha [0.07924855819519203, 0.04150188029031236, 0.045717044971089686, 0.04001537330797816, 0.03320479442188855, 0.037591621771795085, 0.03258451971932087, 0.04787308347495848, 0.09883181446604969, 0

Time to train LDA model on  729 articles:  0.3049228628476461 min


In [6]:
topicsdf=data_to_topics(train, dictionary, FNAMES)
testtopicsdf=data_to_topics(test, dictionary, FNAMES)

In [15]:
def make_topics(topicdata, cols):
    df = topicdata.drop(columns=cols)
    ptopics=[]
    for col in cols:
        df[col]=topicdata[col].apply(lambda x: [pair[1] for pair in x] if x is not None else None)
        distributions=[t for t in df[col].tolist() if t is not None]
        ptopics.extend(distributions)
    return df, np.mean([item for sublist in ptopics for item in sublist])

realtopics_df, avg_topic_proba = make_topics(topicsdf, FNAMES)
testrealtopics_df,avg_topic_proba_test = make_topics(testtopicsdf, FNAMES)


0.05

In [21]:
def binarize_topics(realtopicdata, cols, avg):
    df = realtopicdata.drop(columns=cols)
    for col in cols:
        df[col]=realtopicdata[col].apply(lambda x: [1 if p > avg else 0 for p in x] if x is not None else None)
    return df

In [22]:
bintopics_df=binarize_topics(realtopics_df, FNAMES, avg_topic_proba)
testbintopics_df=binarize_topics(testrealtopics_df, FNAMES, avg_topic_proba_test)

In [28]:
bintopics_df.head()

Unnamed: 0_level_0,answer,known01,known02,known03,known04,known05,unknown
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
EE001,Y,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",,,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
EE004,Y,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
EE005,N,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",,,,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
EE008,Y,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
EE013,N,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [24]:
def subtract_vectors(binarized, knowncols):
    df = binarized.drop(columns=FNAMES)
    for i, col in enumerate(knowncols):
        df['diff' + str(i)] = binarized.apply(lambda row: np.abs(np.subtract(row[col], row.unknown)) if row[col] is not None else None, axis=1)
    return df

train_df = subtract_vectors(bintopics_df, ['known01','known02','known03','known04','known05'])
test_df = subtract_vectors(testbintopics_df, ['known01','known02','known03','known04','known05'])

In [25]:
def make_dataset(data, xcols):
    X = []
    y = []
    for col in xcols:
        xtemp=data[col].tolist()
        ytemp=data["answer"].tolist()
        for xx, yy in zip(xtemp, ytemp):
            if xx is not None:
                X.append(xx)
                if yy == "Y":
                    y.append(1)
                elif yy == "N":
                    y.append(0)
                else:
                    raise AttributeError
    assert(len(X) == len(y) and len(X) != 0)
    return X, y


X_train, y_train = make_dataset(train_df, ['diff0', 'diff1', 'diff2', 'diff3', 'diff4'])
X_test, y_test = make_dataset(test_df, ['diff0', 'diff1', 'diff2', 'diff3', 'diff4'])

In [26]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train, y_train)


In [27]:
c=clf.score(X_test, y_test)
probas=clf.predict_proba(X_test)
auc=roc_auc_score(y_test, probas[:,1])
fs = c*auc
print('Results:\n c@1: ', c, '\n auc: ', auc, ' \n score(c@1*auc): ', fs)

Results:
 c@1:  0.6563706563706564 
 auc:  0.6638928656645191  
 score(c@1*auc):  0.43575979599601644


   
Results:
 c@1:  0.6563706563706564 
 auc:  0.6638928656645191  
 score(c@1*auc):  0.43575979599601644

# LDA

Latent Dirichlet Allocation, is an unsupervised generative model that assigns topic distributions to documents.

At a high level, the model assumes that each document will contain several topics, so that there is topic overlap within a document. The words in each document contribute to these topics. The topics may not be known a priori, and needn't even be specified, but the **number** of topics must be specified a priori. Finally, there can be words overlap between topics, so several topics may share the same words.

The model generates to **latent** (hidden) variables
1) A distribution over topics for each document
2) A distribution over words for each topics

After training, each document will have a discrete distribution over all topics, and each topic will have a discrete distribution over all words.

It is best to demonstrate this with an example. Let's say a document about the presidential elections may have a high contribution from the topics "presidential elections", "america", "voting" but have very low contributions from topics "himalayan mountain range", "video games", "machine learning" (assuming the corpus is varied enough to contain such articles); the topics "presidential elections" may have top contributing words ["vote","election","people","usa","clinton","trump",...] whereas the top contributing words in the topic "himalayan mountain range" may be ["nepal","everest","china","altitude","river","snow",....]. This very rough example should give you an idea of what LDA aims to do.

An important point to note: although I have named some topics in the example above, the model itself does not actually do any "naming" or classifying of topics. But by visually inspecting the top contributing words of a topic i.e. the discrete distribution over words for a topic, one can name the topics if necessary after training. We will show this more later.

There a several ways to implement LDA, however I will speak about collapsed gibbs sampling as I usually find this to be the easiest way to understand it.

The model initialises by assigning every word in every document to a **random** topic. Then, we iterate through each word, unassign it's current topic, decrement the topic count corpus wide and reassign the word to a new topic based on the local probability of topic assignemnts to the current document, and the global (corpus wide) probability of the word assignments to the current topic. This may be hard to understand in words, so the equations are below.

### The mathematics of collapsed gibbs sampling (cut back version)

Recall that when we iterate through each word in each document, we unassign its current topic assignment and reassign the word to a new topic. The topic we reassign the word to is based on the probabilities below.

$$
P\left(\text{document "likes" the topic}\right) \times P\left(\text{topic "likes" the word } w'\right)
$$

$$
\Rightarrow \frac{n_{i,k}+\alpha}{N_i-1+K\alpha} \times \frac{m_{w',k}+\gamma}{\sum_{w\in V}m_{w,k} + V\gamma}
$$

where

$n_{i,k}$ - number of word assignments to topic $k$ in document $i$

$n_{i,k}$ - number of assignments to topic $k$ in document $i$

$\alpha$ - smoothing parameter (hyper parameter - make sure probability is never 0)

$N_i$ - number of words in document $i$

$-1$ - don't count the current word you're on

$K$ - total number of topics


$m_{w',k}$ - number of assignments, corpus wide, of word $w'$ to topic $k$

$m_{w',k}$ - number of assignments, corpus wide, of word $w'$ to topic $k$

$\gamma$ - smoothing parameter (hyper parameter - make sure probability is never 0)

$\sum_{w\in V}m_{w,k}$ - sum over all words in vocabulary currently assigned to topic $k$

$V$ size of vocabulary i.e. number of distinct words corpus wide

### Notes and Uses of LDA

LDA has many uses; understanding the different varieties topics in a corpus (obviously), getting a better insight into the type of documents in a corpus (whether they are about news, wikipedia articles, business documents), quantifying the most used / most important words in a corpus, and even document similarity and recommendation.

LDA does not work well with very short documents, like twitter feeds, as explained here [[1]](https://pdfs.semanticscholar.org/f499/5dc2a4eb901594578e3780a6f33dee02dad1.pdf) [[2]](https://stackoverflow.com/questions/29786985/whats-the-disadvantage-of-lda-for-short-texts), which is why we dropped articles under 40 tokens previously. Very briefly, this is because the model infers parameters from observations and if there are not enough observations (words) in a document, the model performs poorly. For short texts, although yet to be rigoursly tested, it may be best to use a [biterm model](https://pdfs.semanticscholar.org/f499/5dc2a4eb901594578e3780a6f33dee02dad1.pdf).

Unlike the word2vec algorithm, which performs extremely well with full structured sentences, LDA is a bag of words model, meaning word order in a document doesnt count. This also means that stopwords and rare words should be excluded, so that the model doesnt overcompensate for very frequent words and very rare words, both of which do not contribute to general topics.

#### Hyperparameters

LDA has 2 hyperparameters: $\alpha$ and $\eta$

$\alpha$ - A low value for $\alpha$ means that documents have only a low number of topics contributing to them. A high value of $\alpha$ yields the inverse, meaning the documents appear more alike within a corpus.

$\eta$ - A low value for $\eta$ means the topics have a low number of contributing words. A high value of $\eta$ yields the inverse, meaning topics will have word overlap and appear more alike.

The values of $\alpha$ and $\eta$ really depend on the application, and may need to be tweaked several times before the desired results are found... even then, LDA is non-deterministic since parameters are randomly initialised, so the outcome of any run of the model can never be known in advance.

In [None]:
dictionary,corpus,lda = train_lda(train_df)

### Let's inspect some topics!

Bear in mind, when we see the words they may seem shortened. Recall this is because of our stemming function we previously implemented.

In [None]:
# show_topics method shows the the top num_words contributing to num_topics number of random topics
lda.show_topics(num_topics=10, num_words=20)

#### We can inspect individual topics as such

Note that if you re run the model again, as it is non-deterministic, word contributions to topics and topic ID's will change.

#### This topic is about court cases

In [None]:
lda.show_topic(topicid=4, topn=20)

#### This topic is about (supposedly) Illegal Immigration

In [None]:
lda.show_topic(topicid=85, topn=20)

#### This topic is about Religion

In [None]:
lda.show_topic(topicid=75, topn=20)

#### This topic is about Climate Change

In [None]:
lda.show_topic(topicid=39, topn=20)

What the about above means, is that topic 4 has top contributing words ["judge","case","court",...], which indicates the topic is about court cases. Topic 75 has top contributing words ["god","christian","love",...], which indicates the topic is about religion.

Now, not only can we see the word contribution for each topic, but we can also visualise the topic contribution for each article.

In [None]:
# select and article at random from train_df
random_article_index = np.random.randint(len(train_df))
bow = dictionary.doc2bow(train_df.iloc[random_article_index,7])
print(random_article_index)

In [None]:
print(train_df.iloc[random_article_index,3])

In [None]:
# get the topic contributions for the document chosen at random above
doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=bow)])

In [None]:
# bar plot of topic distribution for this document
fig, ax = plt.subplots(figsize=(12,6));
# the histogram of the data
patches = ax.bar(np.arange(len(doc_distribution)), doc_distribution)
ax.set_xlabel('Topic ID', fontsize=15)
ax.set_ylabel('Topic Contribution', fontsize=15)
ax.set_title("Topic Distribution for Article " + str(random_article_index), fontsize=20)
ax.set_xticks(np.linspace(10,100,10))
fig.tight_layout()
plt.show()

Ok, so clearly this document has various contributions from different topics. But what are these topics? Lets find out!

In [None]:
# print the top 5 contributing topics and their words
for i in doc_distribution.argsort()[-5:][::-1]:
    print(i, lda.show_topic(topicid=i, topn=10), "\n")

Let's interpret this.

Topic 9  - Protests

Topic 72 - Middl Eastern Countries

Topic 36 - Islam

Topic 55 - Power (socio political sense)

Topic 38 - Peoples actions

These are rough interpretations for these topics, most of which make sense. Reading the article we see the it is about riots in the Middle East. So the model seems to have worked well, at least in this one case.

# Similarity Queries and Unseen Data

We will now turn our attention to the test set of data which the model has not yet seen. Although the articles in *test_df* have been unseen by the model, gensim has a way of infering their topic distributions given the trained model. Of course, the correct approach to yield accurate results would be to retrain the model with these new articles part of the corpus, but this can be timely and infeasable in a real case scenario where results are needed quickly.

First, lets show how we can infer document topics for a new unseen article.

In [None]:
# select and article at random from test_df
random_article_index = np.random.randint(len(test_df))
print(random_article_index)

Here's the important bit. In obtaining the BOW representation for this unseen article, gensim cleverly only considers words in the existing dictionary we used to train the model. So if there are new words in this article, they will not be considered when infering the topic distribution. This is good in that no errors arise for unseen words, but bad in that some words may be cut out, and therefore we could miss out on an accurate topic distribution for this article.

However, we mitigate this risk because the training set is very much representative of the entire corpus; 99.9% of the observations are in the training set, with only 0.01% of observations in the test set. So most, if not all, words from the test set should be in the training set's dictionary.

In [None]:
new_bow = dictionary.doc2bow(test_df.iloc[random_article_index,7])

In [None]:
print(test_df.iloc[random_article_index,3])

In [None]:
new_doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=new_bow)])

Let's do the same visual analysis as before on this new unseen document

In [None]:
# bar plot of topic distribution for this document
fig, ax = plt.subplots(figsize=(12,6));
# the histogram of the data
patches = ax.bar(np.arange(len(new_doc_distribution)), new_doc_distribution)
ax.set_xlabel('Topic ID', fontsize=15)
ax.set_ylabel('Topic Contribution', fontsize=15)
ax.set_title("Topic Distribution for an Unseen Article", fontsize=20)
ax.set_xticks(np.linspace(10,100,10))
fig.tight_layout()
plt.show()

In [None]:
# print the top 8 contributing topics and their words
for i in new_doc_distribution.argsort()[-5:][::-1]:
    print(i, lda.show_topic(topicid=i, topn=10), "\n")

And there we have it! An accurate topic distribution for an unseen document.

### Similarity query

Ok, now that we have a topic distribution for a new unseen document, let's say we wanted to find the most similar documents in the corpus. We can do this by comparing the topic distribution of the new document to all the topic distributions of the documents in the corpus. We use the [Jensen-Shannon distance](https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence) metric to find the most similar documents.

What the Jensen-Shannon distance tells us, is which documents are statisically "closer" (and therefore more similar), by comparing the divergence of their distributions. Jensen-Shannon is symmetric, unlike Kullback-Leibler on which the formula is based. This is good, because we want the similarity between documents A and B to be the same as the similarity between B and A.

The formula is described below.

For discrete distirbutions $P$ and $Q$, the Jensen-Shannon divergence, $JSD$ is defined as

$$JSD\left(P||Q\right) = \frac{1}{2}D\left(P||M\right)+\frac{1}{2}D\left(Q||M\right)$$

where $M = \frac{1}{2}\left(P+Q\right)$

and $D$ is the Kullback-Leibler divergence

$$D\left(P||Q\right) = \sum_iP(i)\log\left(\frac{P(i)}{Q(i)}\right)$$

$$\Rightarrow JSD\left(P||Q\right) = \frac{1}{2}\sum_i
\left[
P(i)\log\left(\frac{P(i)}{\frac{1}{2}\left(P(i)+Q(i)\right)}\right)
+
Q(i)\log\left(\frac{Q(i)}{\frac{1}{2}\left(P(i)+Q(i)\right)}\right)
\right]$$

The square root of the Jensen-Shannon divergence is the Jensen-Shannon Distance: $\sqrt{JSD\left ( P||Q\right )}$

**The smaller the Jensen-Shannon Distance, the more similar two distributions are (and in our case, the more similar any 2 documents are)**

We can use the scipy implementation of entropy to do this. Entropy calculates the KL divergence.

But first, we need to get all our LDA topic distributions into a dense matrix. This will enable fast and efficient computation.

We will create a dense matrix, **doc_topic_dist**, of size $M\times K$ where $M$ is the number of documents and $K$ is the number of topics.

In [None]:
# we need to use nested list comprehension here
# this may take 1-2 minutes...
doc_topic_dist = np.array([[tup[1] for tup in lst] for lst in lda[corpus]])
doc_topic_dist.shape

In [None]:
def jensen_shannon(query, matrix):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    It returns an array of length M where M is the number of documents in the corpus
    """
    # lets keep with the p,q notation above
    p = query[None,:].T # take transpose
    q = matrix.T # transpose matrix
    m = 0.5*(p + q)
    return np.sqrt(0.5*(entropy(p,m) + entropy(q,m)))

Let's compare the new unseen document, to the corpus, and see which articles are most similar.

In [None]:
def get_most_similar_documents(query,matrix,k=10):
    """
    This function implements the Jensen-Shannon distance above
    and retruns the top k indices of the smallest jensen shannon distances
    """
    sims = jensen_shannon(query,matrix) # list of jensen shannon distances
    return sims.argsort()[:k] # the top k positional index of the smallest Jensen Shannon distances

#### Query time + most similar documents... at last!

Ok, let's be 100% clear about what we are doing here.

We are comparing the new unseen document above to the entire corpus of ~10k documents to find which one is most similar to the new document.

How are we doing that? Well, we have the new documents LDA topic distribution in stored as varibale **new_doc_distribution**, and we have the entire corpus of documents topic distributions stored in the dense matrix **doc_topic_dist**. So now, we pass each row of **doc_topic_dist** through the Jensen-Shannon function above as the Q distribution, while the P distribution remains static as **new_doc_distribution**. Then we get the smallest distances and their corresponding index in the array, which we can pass to the **train_df** dataframe to print out the most similar documents.

In [None]:
# this is surprisingly fast
most_sim_ids = get_most_similar_documents(new_doc_distribution,doc_topic_dist)

In [None]:
most_similar_df = train_df[train_df.index.isin(most_sim_ids)]
most_similar_df['title']

I think we can see, the top most similar articles are quite similar indeed to the query article ;)

Our query article is about Trump, Huffington Post and the election. The top 10 most similar documents in the corpus also contain these topics, as their title show above. The reader can print out the full articles, or visualise the topic distributions for the most similar document and compare them to the query document to check the overlap.

## Conclusion

- After cleaning the corpus and keeping only the top 15,000 words, we reduced the unique words in the corpus by 84%
- The average document length is halved to 345 tokens after cleaning, compared to the raw version we saw in our explore notebook using word2vec
- The LDA algorithm was explained in detail
- The LDA model was able to accurately identify different topics in the fake news corpus. We visually inspected these topics to see that the top words were related
- We were able to infer a topic distribution from a new unseen document
- We quickly retrieved the most similar documents in the trained corpus when comparing to the new unseen document. These most similar documents were in fact closely related to the query document