[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

# Пример использования библиотеки gensim для тематического моделирования

In [4]:
import requests

# Base URL of the repository folder that hosts the two xkcd data files.
url = 'https://raw.githubusercontent.com/chekhovana/courses/main/machine_learning/3_unsupervised_learning/4_topic_modeling/data/'
docword_fname = '4.3.1_docword.xkcd.txt'
vocab_fname = '4.3.1_vocab.xkcd.txt'

# Download both files into the current working directory under their original names.
for fname in (docword_fname, vocab_fname):
    r = requests.get(url + fname, timeout=60)
    # Fail loudly on an HTTP error instead of silently saving an error page as data.
    r.raise_for_status()
    # The context manager guarantees the file is flushed and closed before it is read later.
    with open(fname, 'wb') as f:
        f.write(r.content)

In [6]:
from gensim import corpora, models

In [7]:
# Load the data stored in UCI Bag of Words format (document-word file plus vocabulary file)
data = corpora.UciCorpus(docword_fname, vocab_fname)
dictionary = data.create_dictionary()
print(len(data))

1265


In [None]:
# обучение модель
%time ldamodel = models.ldamodel.LdaModel(data, id2word=dictionary, num_topics=5, passes=20, alpha=1.25, eta=1.25)

CPU times: user 30.4 s, sys: 437 ms, total: 30.8 s
Wall time: 30.9 s


In [None]:
# Save the trained model to disk
ldamodel.save("ldamodel_xkcd")

In [None]:
# Load the model back from disk
ldamodel = models.ldamodel.LdaModel.load("ldamodel_xkcd")

In [None]:
# Print the 10 highest-weighted words of every topic.
# NOTE(review): num_topics=10 exceeds the 5 topics the model was trained with;
# presumably gensim caps the request at the number of available topics - confirm.
for topic_id, word_weights in ldamodel.print_topics(num_topics=10, num_words=10):
    print(f"Topic {topic_id} : {word_weights}")

Topic 0 : 0.002*"b'scientist'" + 0.001*"b'wikipedia'" + 0.001*"b'island'" + 0.001*"b'girl'" + 0.001*"b'text'" + 0.001*"b'list'" + 0.001*"b'reporter'" + 0.001*"b'han'" + 0.001*"b'title'" + 0.001*"b'map'"
Topic 1 : 0.036*"b'person'" + 0.001*"b'error'" + 0.001*"b'people'" + 0.001*"b'title'" + 0.001*"b'page'" + 0.001*"b'text'" + 0.001*"b'two'" + 0.001*"b'bag'" + 0.001*"b'team'" + 0.001*"b'one'"
Topic 2 : 0.024*"b'man'" + 0.012*"b'text'" + 0.011*"b'title'" + 0.010*"b'woman'" + 0.008*"b'guy'" + 0.007*"b'one'" + 0.005*"b'girl'" + 0.005*"b'just'" + 0.005*"b'hat'" + 0.005*"b'two'"
Topic 3 : 0.002*"b'paul'" + 0.002*"b'ron'" + 0.001*"b'chart'" + 0.001*"b'degree'" + 0.001*"b'day'" + 0.001*"b'mark'" + 0.001*"b'map'" + 0.001*"b'planet'" + 0.001*"b'gliese'" + 0.001*"b'easy'"
Topic 4 : 0.002*"b'wait'" + 0.002*"b'goggles'" + 0.001*"b'sagal'" + 0.001*"b'peter'" + 0.001*"b'link'" + 0.001*"b'jelly'" + 0.001*"b'bean'" + 0.001*"b'found'" + 0.001*"b'acne'" + 0.001*"b'005'"


In [None]:
# Compute the log perplexity of the corpus under the model.
# To bring it to the conventional perplexity form, take 2 ** (-value).
ldamodel.log_perplexity(list(data))

-8.459189566500669

In [None]:
# Perplexity computed by hand from the bound: 2 ** (-bound / N), where N is the
# total number of word occurrences in the corpus. N is derived from the data
# rather than hard-coded so the cell stays correct if the corpus changes.
# NOTE(review): the previously hard-coded 87409 is presumed to be this count - confirm.
total_words = sum(count for document in data for _, count in document)
perp = ldamodel.bound(data)
2**(-perp/float(total_words))

351.94095225724567

In [None]:
# Get the topic distribution for one specific document.
# Only the first document is needed, so read just that one instead of
# materializing the whole corpus in memory.
doc = next(iter(data))
ldamodel.get_document_topics(doc)

[(0, 0.06704308),
 (1, 0.07055924),
 (2, 0.74792206),
 (3, 0.056530956),
 (4, 0.057944655)]