# Modelando assuntos

In [1]:
from gensim import corpora, models, similarities
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from string import punctuation
from pprint import pprint

Using gpu device 0: GeForce GT 640 (CNMeM is disabled, cuDNN 5004)


## Definindo um problema

A epidemia de Zika gerou uma epidemia de publicações científicas sobre o assunto. Como é uma doença pouco conhecida, pesquisas em diversas áreas do conhecimento precisam ser conduzidas para poder preencher a lacuna da nossa ignorância.

Neste exercício vamos analizar um corpus de resumos de artigos publicados sobre o assunto e tentar modelar os assuntos existentes.

In [4]:
dicionario = corpora.Dictionary.load('Dicionario_zika.dict')
corpus = corpora.MmCorpus('corpus_zika')

In [8]:
print(dicionario)
print(corpus)
498*5886

Dictionary(5886 unique tokens: ['fattening', 'x894', 'Mercer', 'hosts', 'artery']...)
MmCorpus(498 documents, 5886 features, 24027 non-zero entries)


2931228

In [11]:
print(dicionario[0])
for doc in corpus:
    print(doc)
    break

virus
[(0, 2.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 3.0), (6, 1.0), (7, 2.0), (8, 3.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 2.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 2.0), (17, 1.0), (18, 2.0), (19, 1.0), (20, 1.0), (21, 1.0), (22, 1.0), (23, 1.0), (24, 1.0), (25, 2.0), (26, 1.0), (27, 1.0), (28, 1.0), (29, 1.0), (30, 2.0), (31, 1.0), (32, 2.0), (33, 1.0), (34, 1.0), (35, 2.0), (36, 1.0), (37, 3.0), (38, 3.0), (39, 1.0), (40, 3.0), (41, 1.0), (42, 1.0), (43, 2.0), (44, 1.0), (45, 2.0), (46, 3.0), (47, 3.0), (48, 5.0), (49, 1.0), (50, 1.0), (51, 2.0), (52, 2.0), (53, 1.0), (54, 1.0), (55, 1.0), (56, 1.0), (57, 1.0), (58, 1.0), (59, 1.0), (60, 1.0), (61, 1.0), (62, 2.0), (63, 1.0), (64, 1.0), (65, 1.0), (66, 1.0), (67, 3.0), (68, 3.0), (69, 1.0), (70, 2.0), (71, 1.0), (72, 1.0), (73, 1.0), (74, 1.0), (75, 1.0), (76, 2.0), (77, 1.0), (78, 5.0), (79, 1.0), (80, 1.0), (81, 1.0), (82, 1.0), (83, 12.0), (84, 1.0), (85, 1.0), (86, 1.0), (87, 1.0), (88, 1.0), (89, 2.0), (90, 2.0), (

## Latent Semantic Indexing - LSI

In [12]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [13]:
lsi = models.LsiModel(corpus_tfidf, id2word=dicionario, num_topics=30)
corpus_lsi = lsi[corpus_tfidf]

Depois de estimar o modelo, podemos olhar os 30 assuntos, listando apenas as 4 palavras mais importantes.

In [14]:
lsi.show_topics(30,4)

[(0, '0.342*"ZIKV" + 0.259*"virus" + 0.192*"Zika" + 0.151*"infection"'),
 (1, '-0.516*"ZIKV" + 0.242*"women" + 0.188*"virus" + 0.179*"pregnant"'),
 (2, '-0.337*"ZIKV" + 0.251*"YF" + 0.200*"antibodies" + 0.194*"antibody"'),
 (3, '-0.275*"ZIKV" + -0.171*"women" + 0.135*"spread" + -0.116*"pregnant"'),
 (4,
  '0.373*"Ae" + 0.177*"transmission" + -0.156*"microcephaly" + 0.141*"aegypti"'),
 (5,
  '-0.281*"YF" + 0.202*"patients" + -0.165*"microcephaly" + -0.149*"pregnancy"'),
 (6, '-0.250*"Ae" + 0.199*"ZIKV" + -0.172*"Brazil" + -0.157*"microcephaly"'),
 (7, '-0.396*"YF" + 0.129*"viruses" + 0.119*"blood" + 0.119*"public"'),
 (8, '-0.157*"sequence" + 0.146*"Ae" + -0.143*"sequences" + 0.136*"ZIKV"'),
 (9, '-0.387*"YF" + -0.165*"Ae" + 0.141*"viruses" + -0.130*"saliva"'),
 (10, '0.200*"Guillain" + 0.198*"BarrÃ" + 0.194*"syndrome" + 0.186*"©"'),
 (11, '0.204*"blood" + -0.180*"Ae" + -0.119*"infection" + -0.116*"returning"'),
 (12,
  '-0.249*"antibody" + -0.187*"enhancement" + -0.186*"heterologous" +

Podemos também olhar para os documentos do nosso corpus como uma combinação linear dos assuntos

In [15]:
for doc  in corpus_lsi:
    print(doc)
    break

[(0, 0.29341359575061515), (1, -0.044658797891151791), (2, -0.056700593373754361), (3, -0.3351010693844646), (4, 0.015495065028631893), (5, -0.083301342437953452), (6, -0.023842338412340961), (7, 0.081378555864377089), (8, 0.05638766110989972), (9, -0.031680886986972183), (10, 0.072017301610918294), (11, 0.054245633991349809), (12, 0.076329205826558846), (13, -0.0070387571250865536), (14, -0.11017238855926909), (15, 0.030608301182679713), (16, -0.053418795588331985), (17, -0.1130915871885457), (18, 0.097459120095664917), (19, 0.018477422670524206), (20, -0.045325825065361286), (21, -0.013672952175308169), (22, -0.033956342373825485), (23, -0.0053491432979650005), (24, 0.083119597914713225), (25, 0.021412510661699521), (26, 0.063817988915913892), (27, -0.066603861567659711), (28, 0.022462657935527139), (29, -0.024702260455466089)]


Podemos calcular a similaridade por assunto de um documento com todos os demais documentos do corpus.

In [16]:
index = similarities.MatrixSimilarity(corpus_lsi)

Vamos escolher o primeiro documento do corpus para ser a referência

In [17]:
sims = index[doc]
#pprint(list(enumerate(sims)))
pprint(sorted(list(enumerate(sims)), key=lambda x:x[1], reverse=True)[:10])

[(0, 1.0),
 (430, 0.71284831),
 (12, 0.61172044),
 (494, 0.59754914),
 (11, 0.58494759),
 (6, 0.57779515),
 (401, 0.57517916),
 (477, 0.54245442),
 (27, 0.53723323),
 (14, 0.52353585)]


## Latent Dirichlet Allocation - LDA

O LDA é uma técnica um pouco mais sofisticada que o LSI, que envolve uma interpretação probabilística do que é uma assunto. Para saber mais, veja este artigo: http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf

In [18]:
lda_model = models.ldamodel.LdaModel(corpus, id2word=dicionario, num_topics=30, passes=10)

In [19]:
lda_model.show_topics(10)

[(8,
  "0.019*virus + 0.013*Zika + 0.011*The + 0.010*antibody + 0.009*cases + 0.009*viruses + 0.008*sera + 0.008*infection + 0.008*.'] + 0.008*['"),
 (19,
  '0.045*virus + 0.036*Zika + 0.020*transmission + 0.013*infection + 0.012*women + 0.010*pregnant + 0.009*health + 0.008*disease + 0.008*pregnancy + 0.006*ZIKV'),
 (17,
  '0.039*ZIKV + 0.013*brain + 0.012*), + 0.010*fetal + 0.010*microcephaly + 0.009*malaria + 0.009*RNA + 0.008*infections + 0.008*The + 0.008*gestation'),
 (28,
  '0.015*infection + 0.011*enhancement + 0.011*antibody + 0.009*showed + 0.008*dengue + 0.008*Africa + 0.008*The + 0.008*virus + 0.007*produced + 0.007*enhancing'),
 (5,
  '0.036*rabbit + 0.011*gene + 0.011*blood + 0.010*weight + 0.008*05 + 0.008*P + 0.008*genotype + 0.008*Zika + 0.007*0 + 0.007*ZIKAV'),
 (4,
  '0.017*virus + 0.013*viral + 0.013*nyong + 0.011*fever + 0.011*The + 0.008*area + 0.007*transmission + 0.006*areas + 0.006*exanthems + 0.006*infections'),
 (15,
  "0.032*virus + 0.017*Zika + 0.008*.'] + 

In [16]:
lsi.save?