# Prática 3 - Modelando assuntos

In [11]:
from gensim import corpora, models, similarities
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from string import punctuation
from pprint import pprint

## Definindo um problema

A epidemia de Zika gerou uma epidemia de publicações científicas sobre o assunto. Como é uma doença pouco conhecida, pesquisas em diversas áreas do conhecimento precisam ser conduzidas para poder preencher a lacuna da nossa ignorância.

Neste exercício vamos analisar um corpus de resumos de artigos publicados sobre o assunto e tentar modelar os assuntos existentes.

In [2]:
# Load the token dictionary previously saved to 'Dicionario_zika.dict'.
dicionario = corpora.Dictionary.load('Dicionario_zika.dict')
# Open the corpus stored at 'corpus_zika' with gensim's MmCorpus reader.
corpus = corpora.MmCorpus('corpus_zika')

In [4]:
# Summaries of the dictionary and of the corpus.
print(dicionario)
print(corpus)
# Number of cells in the dense document-term matrix (documents x features).
# Derived from the corpus itself instead of hard-coding 498*5886, so the value
# stays correct if the underlying data changes.
corpus.num_docs * corpus.num_terms

Dictionary(5886 unique tokens: ['fold', 'occurrences', 'Ug', 'responsible', 'lumbar']...)
MmCorpus(498 documents, 5886 features, 24027 non-zero entries)


2931228

In [16]:
# Token that the dictionary maps to id 0.
print(dicionario[0])
# First three documents of the corpus, one per line. The loop variable is not
# called `doc`, which a later cell uses for the reference document.
for bow in corpus[:3]:
    print(bow)

virus
[(0, 2.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 3.0), (6, 1.0), (7, 2.0), (8, 3.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 2.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 2.0), (17, 1.0), (18, 2.0), (19, 1.0), (20, 1.0), (21, 1.0), (22, 1.0), (23, 1.0), (24, 1.0), (25, 2.0), (26, 1.0), (27, 1.0), (28, 1.0), (29, 1.0), (30, 2.0), (31, 1.0), (32, 2.0), (33, 1.0), (34, 1.0), (35, 2.0), (36, 1.0), (37, 3.0), (38, 3.0), (39, 1.0), (40, 3.0), (41, 1.0), (42, 1.0), (43, 2.0), (44, 1.0), (45, 2.0), (46, 3.0), (47, 3.0), (48, 5.0), (49, 1.0), (50, 1.0), (51, 2.0), (52, 2.0), (53, 1.0), (54, 1.0), (55, 1.0), (56, 1.0), (57, 1.0), (58, 1.0), (59, 1.0), (60, 1.0), (61, 1.0), (62, 2.0), (63, 1.0), (64, 1.0), (65, 1.0), (66, 1.0), (67, 3.0), (68, 3.0), (69, 1.0), (70, 2.0), (71, 1.0), (72, 1.0), (73, 1.0), (74, 1.0), (75, 1.0), (76, 2.0), (77, 1.0), (78, 5.0), (79, 1.0), (80, 1.0), (81, 1.0), (82, 1.0), (83, 12.0), (84, 1.0), (85, 1.0), (86, 1.0), (87, 1.0), (88, 1.0), (89, 2.0), (90, 2.0), (

## Latent Semantic Indexing - LSI

In [18]:
# Fit the TF-IDF weighting on the count corpus.
tfidf = models.TfidfModel(corpus=corpus)
# Wrap the corpus so that its documents come out TF-IDF weighted.
corpus_tfidf = tfidf[corpus]

In [55]:
# Fit a 30-topic LSI model on the TF-IDF corpus; the dictionary is passed as
# id2word, the mapping between token ids and tokens.
lsi = models.LsiModel(
    corpus=corpus_tfidf,
    num_topics=30,
    id2word=dicionario,
)
# Apply the fitted model to the TF-IDF corpus to obtain topic-space vectors.
corpus_lsi = lsi[corpus_tfidf]

Depois de estimar o modelo, podemos olhar os 30 assuntos, listando apenas as 4 palavras mais importantes.

In [56]:
# List every topic of the fitted model with its 4 highest-weighted words.
# The topic count is read from the model instead of repeating the literal 30,
# so this cell cannot drift from the value used when the model was fitted.
lsi.show_topics(num_topics=lsi.num_topics, num_words=4)

['0.342*"ZIKV" + 0.259*"virus" + 0.192*"Zika" + 0.151*"infection"',
 '-0.516*"ZIKV" + 0.241*"women" + 0.189*"virus" + 0.179*"pregnant"',
 '0.338*"ZIKV" + -0.251*"YF" + -0.200*"antibodies" + -0.194*"antibody"',
 '-0.278*"ZIKV" + -0.172*"women" + 0.136*"spread" + -0.117*"pregnant"',
 '-0.370*"Ae" + -0.178*"transmission" + 0.158*"microcephaly" + -0.141*"aegypti"',
 '0.279*"YF" + -0.205*"patients" + 0.168*"microcephaly" + 0.145*"pregnancy"',
 '0.247*"Ae" + -0.200*"ZIKV" + 0.174*"Brazil" + 0.161*"microcephaly"',
 '-0.406*"YF" + 0.124*"viruses" + 0.117*"public" + 0.112*"blood"',
 '0.161*"sequence" + 0.150*"sequences" + -0.143*"Ae" + -0.135*"ZIKV"',
 '-0.395*"YF" + -0.172*"Ae" + 0.149*"viruses" + -0.128*"©"',
 '-0.207*"Guillain" + -0.205*"BarrÃ" + -0.199*"syndrome" + -0.195*"©"',
 '0.211*"blood" + -0.166*"Ae" + 0.117*"donors" + -0.117*"returning"',
 '0.255*"antibody" + 0.198*"enhancement" + 0.193*"heterologous" + -0.150*"yellow"',
 '-0.221*"YF" + 0.145*"antibody" + -0.128*"abnormalities" + 0.

Podemos também olhar para os documentos do nosso corpus como uma combinação linear dos assuntos.

In [57]:
# Take the first document of the LSI-transformed corpus explicitly.
# `doc` is reused by the similarity query further down, so it is bound with a
# plain assignment instead of being leaked from a for/break loop. On an empty
# corpus the loop would silently leave a stale `doc` from an earlier cell in
# place; this form raises StopIteration instead.
doc = next(iter(corpus_lsi))
print(doc)

[(0, 0.29346796889527155), (1, -0.046389179323431673), (2, 0.059619411478518555), (3, -0.33877360175144577), (4, -0.0017954990630196757), (5, 0.087521432463480722), (6, 0.015937793679183316), (7, 0.081981770852204192), (8, -0.051373652046324991), (9, -0.043190033272362288), (10, -0.082051923922451933), (11, 0.043541847929578811), (12, -0.06915202302904816), (13, -0.0168587450133801), (14, -0.11638642658052298), (15, 0.048552055176955322), (16, -0.064320124527728437), (17, -0.1037577452718921), (18, 0.10788954481007815), (19, -0.023079798657641648), (20, -0.080320770078909753), (21, -0.021998957134702582), (22, -0.0068806149580824261), (23, 0.033295097113836158), (24, -0.092836130281484117), (25, -0.031766843726336404), (26, 0.056310741182633697), (27, 0.052524222898878196), (28, -0.017710845703593502), (29, 0.015903933190004976)]


Podemos calcular a similaridade por assunto de um documento com todos os demais documentos do corpus.

In [58]:
# Build the similarity index over the documents of the LSI corpus.
index = similarities.MatrixSimilarity(corpus=corpus_lsi)

Vamos escolher o primeiro documento do corpus para ser a referência.

In [59]:
# Query the index with the reference document.
sims = index[doc]
# Pair each similarity with its position in the index and print the pairs
# ordered from most to least similar.
pprint(sorted(enumerate(sims), key=lambda par: par[1], reverse=True))

[(0, 0.99999982),
 (430, 0.72549403),
 (401, 0.60872328),
 (6, 0.59549123),
 (12, 0.58171809),
 (27, 0.57662165),
 (494, 0.55630046),
 (11, 0.52500963),
 (118, 0.51750726),
 (2, 0.51049447),
 (10, 0.50143945),
 (181, 0.49604842),
 (69, 0.4917191),
 (477, 0.48722857),
 (14, 0.48322293),
 (136, 0.48118821),
 (400, 0.4778831),
 (464, 0.47529486),
 (476, 0.47422501),
 (19, 0.46904218),
 (28, 0.46404377),
 (32, 0.46105316),
 (146, 0.45609295),
 (7, 0.45096427),
 (16, 0.44940519),
 (42, 0.44707882),
 (327, 0.4418284),
 (331, 0.43761984),
 (381, 0.43488216),
 (391, 0.43355265),
 (438, 0.43186057),
 (39, 0.42472613),
 (402, 0.41902143),
 (443, 0.41847616),
 (428, 0.41725233),
 (4, 0.40815341),
 (423, 0.39857677),
 (31, 0.39586726),
 (174, 0.3942855),
 (343, 0.38758308),
 (15, 0.38606316),
 (44, 0.38249546),
 (127, 0.38134465),
 (436, 0.3808943),
 (24, 0.37842748),
 (46, 0.37477848),
 (390, 0.36867413),
 (287, 0.36813515),
 (79, 0.36283866),
 (173, 0.36114985),
 (386, 0.35697776),
 (29, 0.35541

## Latent Dirichlet Allocation - LDA

O LDA é uma técnica um pouco mais sofisticada que o LSI, que envolve uma interpretação probabilística do que é um assunto. Para saber mais, veja este artigo: http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf

In [61]:
# Fit a 30-topic LDA model on the count corpus (`corpus`, not the TF-IDF one).
# LDA training starts from a random initialisation, so the seed is fixed to
# make the topics reproducible under "Restart Kernel -> Run All".
lda_model = models.ldamodel.LdaModel(
    corpus,
    id2word=dicionario,
    num_topics=30,
    random_state=42,
)

  expElogbetad = self.expElogbeta[:, ids]
  score += numpy.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, id]) for id, cnt in doc)
  sstats[:, ids] += numpy.outer(expElogthetad.T, cts / phinorm)


In [62]:
# Show all topics of the fitted LDA model. The count comes from the model
# itself, so this cell cannot drift from the value used at training time.
lda_model.show_topics(num_topics=lda_model.num_topics)

["0.023*virus + 0.012*Zika + 0.011*The + 0.009*fever + 0.008*), + 0.008*.'] + 0.007*[' + 0.007*viruses + 0.006*ZIKV + 0.005*1",
 "0.023*virus + 0.017*Zika + 0.011*antibody + 0.010*infection + 0.009*The + 0.007*fever + 0.006*YF + 0.006*dengue + 0.005*antibodies + 0.005*.']",
 "0.021*virus + 0.018*Zika + 0.013*ZIKV + 0.008*The + 0.007*.', + 0.006*patients + 0.006*1 + 0.006*transmission + 0.006*outbreak + 0.006*mosquito",
 "0.014*YF + 0.009*virus + 0.009*.', + 0.008*infection + 0.007*Aedes + 0.007*Zika + 0.006*aegypti + 0.006*ZIKV + 0.006*The + 0.006*vector",
 "0.014*ZIKV + 0.010*Zika + 0.008*virus + 0.008*The + 0.007*cases + 0.006*[' + 0.006*microcephaly + 0.005*infection + 0.005*), + 0.005*.']",
 "0.031*virus + 0.022*Zika + 0.017*ZIKV + 0.013*infection + 0.010*The + 0.008*transmission + 0.007*.'] + 0.007*dengue + 0.007*microcephaly + 0.007*cases",
 '0.017*virus + 0.010*Zika + 0.008*fever + 0.008*The + 0.007*rabbits + 0.006*aegypti + 0.006*rabbit + 0.005*yellow + 0.005*viruses + 0.005*Ae