In [29]:
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.models import LsiModel
from gensim.models import TfidfModel
import numpy
%matplotlib inline

In [52]:
# setting up corpus and documents we will be comparing
texts = [['banco','mar','curtar','agua'],
        ['mar','surfear','cortar','agua'],
        ['mar','agua','cangrejo','lento','arboles'],
        ['banco','agua','invierno','suelo'],
        ['banco','banca','agua','llover','mar'],
        ['mar','agua','nubes','arboles'],
        ['dinero','transacciones','banco','finanzas','prestamos'],
        ['banco','prestar','dinero'],
        ['banco','financiero'],
        ["interfaz","humanos","laboratorios","Madrid"],
        ["final", "real madrid","fracaso"],
        ["humanos","tecnologia","Toledo"],
        ["politica","presupuestos","cancelacion","acuerdos"],
        ['financiacion','dinero','venta','banco'],
        ['marron','vender'],
        ['banco','prestar','vender']]

In [53]:
# Create Dictionary
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

### Creating TF-IDF and LDA models for the following corpus will help us illustrate our distance metrics

In [54]:
tfidf = TfidfModel(corpus)
model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=3)
#model = LsiModel(corpus, id2word=dictionary, num_topics=5)

### Representation of TF-IDF would have as many features as the size of the vocabulary, and an LDA model representation would have as many features as the number of topics. We will be using both these models later to compare distances

In [55]:
model.show_topics()

[(0,
  '0.094*"humanos" + 0.061*"banco" + 0.057*"dinero" + 0.054*"venta" + 0.054*"Toledo" + 0.054*"laboratorios" + 0.054*"interfaz" + 0.054*"financiacion" + 0.054*"Madrid" + 0.054*"tecnologia"'),
 (1,
  '0.133*"agua" + 0.110*"mar" + 0.090*"banco" + 0.052*"arboles" + 0.051*"vender" + 0.030*"prestar" + 0.030*"suelo" + 0.030*"lento" + 0.029*"real madrid" + 0.029*"fracaso"'),
 (2,
  '0.127*"banco" + 0.081*"dinero" + 0.047*"prestar" + 0.047*"transacciones" + 0.047*"finanzas" + 0.046*"cancelacion" + 0.046*"politica" + 0.046*"presupuestos" + 0.046*"prestamos" + 0.046*"acuerdos"')]

In [56]:
doc_lab =  ["interfaz","humanos","laboratorios","Madrid"]
doc_poli = ["presupuestos", "pedro sanchez", "cancelacion","acuerdos"]

### Once we have our documents, we quickly convert these into a bag of words, TF-IDF, and LdaModel representations

In [57]:
bow_lab = model.id2word.doc2bow(doc_lab)
bow_poli = model.id2word.doc2bow(doc_poli)

lda_bow_lab = model[bow_lab]
lda_bow_poli = model[bow_poli]

tfidf_bow_lab = tfidf[lda_bow_lab]
tfidf_bow_poli = tfidf[lda_bow_poli]

##### Let's have a look at lda_bow_poli and see what it looks like:

In [58]:
lda_bow_poli

[(0, 0.08424669), (1, 0.08379577), (2, 0.8319576)]

#### Similar Query

###### This is a well-balanced document with respect to the topics (as expected).

In [59]:
from gensim import similarities

In [60]:
# Create index
index = similarities.MatrixSimilarity(model[corpus])

We created our index based on the similarities created by the LDA transformation of our corpus. We can create the same index using TF-IDF, or even bag of words, but we can expect better performance when using topics. We should also keep in mind that our queries should be in the same input space as the representation in which we created our index.

In [61]:
# Let's use the same lda_bow_finance document and find which articles are most
# similar.
#sims = index[lda_bow_poli]
sims = index[tfidf_bow_poli]

In [62]:
# a list with each document and the corresponding similarity
# values.
print(sims)

[0.11023868 0.10541195 0.09040789 0.11018307 0.09719157 0.10522823
 0.9988203  0.9939853  0.9856416  0.11446519 0.12916419 0.13761894
 0.9977799  0.1469763  0.17191184 0.16252382]


In [63]:
# Let's look at which documents were actually picked up, and sort them according to how
# similar they are.
print("search: ",doc_poli)
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for doc_id, similarity in sims:
    print(texts[doc_id], similarity)

search:  ['presupuestos', 'pedro sanchez', 'cancelacion', 'acuerdos']
['dinero', 'transacciones', 'banco', 'finanzas', 'prestamos'] 0.9988203
['politica', 'presupuestos', 'cancelacion', 'acuerdos'] 0.9977799
['banco', 'prestar', 'dinero'] 0.9939853
['banco', 'financiero'] 0.9856416
['marron', 'vender'] 0.17191184
['banco', 'prestar', 'vender'] 0.16252382
['financiacion', 'dinero', 'venta', 'banco'] 0.1469763
['humanos', 'tecnologia', 'Toledo'] 0.13761894
['final', 'real madrid', 'fracaso'] 0.12916419
['interfaz', 'humanos', 'laboratorios', 'Madrid'] 0.11446519
['banco', 'mar', 'curtar', 'agua'] 0.11023868
['banco', 'agua', 'invierno', 'suelo'] 0.11018307
['mar', 'surfear', 'cortar', 'agua'] 0.105411954
['mar', 'agua', 'nubes', 'arboles'] 0.10522823
['banco', 'banca', 'agua', 'llover', 'mar'] 0.09719157
['mar', 'agua', 'cangrejo', 'lento', 'arboles'] 0.09040789
