# Пример использования Gensim

In [27]:
from gensim import corpora, models
import os
from collections import defaultdict
import nltk

In [29]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dasha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [26]:
with open('text.txt', 'r') as file: 
    documents = [doc.strip() for doc in file.readlines()]

In [30]:
stop_words = nltk.corpus.stopwords.words('english')

In [33]:
stop_words[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [34]:
texts = [[word for word in document.lower().split() if len(word) >= 3 and word not in stop_words] 
         for document in documents]

In [35]:
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [36]:
dictionary = defaultdict(int)

In [37]:
for text in texts: 
    for word in text: 
        dictionary[word] += 1

In [38]:
tokens = [[token for token in text if dictionary[token] > 1] for text in texts]

In [39]:
tokens

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [40]:
token_dictionary = corpora.Dictionary(tokens)

In [41]:
corpus = [token_dictionary.doc2bow(text) for text in tokens]

In [42]:
lsi_model = models.LsiModel(corpus, id2word=token_dictionary, num_topics=2)

In [43]:
corpus_lsi = lsi_model[corpus]

In [44]:
for doc, as_text in zip(documents, corpus_lsi): 
    print(as_text, doc)

[(0, 0.6594664059797392), (1, -0.14211544403729742)] Human machine interface for lab abc computer applications
[(0, 2.0245430433828773), (1, 0.4208875824630156)] A survey of user opinion of computer system response time
[(0, 1.5465535813286557), (1, -0.3235891942571172)] The EPS user interface management system
[(0, 1.8111412473028845), (1, -0.5890524969932394)] System and human system engineering testing of EPS
[(0, 0.9336738035634355), (1, 0.271389404993745)] Relation of user perceived response time to error measurement
[(0, 0.012746183038294686), (1, 0.49016179245310576)] The generation of random binary unordered trees
[(0, 0.048882032060470815), (1, 1.1129470269929582)] The intersection graph of paths in trees
[(0, 0.08063836099410687), (1, 1.5634559463442699)] Graph minors IV Widths of trees and well quasi ordering
[(0, 0.2738100392127577), (1, 1.3469415849537718)] Graph minors A survey


In [45]:
lda = models.LdaModel(corpus, id2word=token_dictionary, num_topics=2)

In [46]:
corpus_lda = lda[corpus]

In [47]:
lda.print_topics()

[(0,
  '0.148*"system" + 0.100*"graph" + 0.090*"human" + 0.088*"computer" + 0.087*"survey" + 0.084*"minors" + 0.083*"interface" + 0.073*"trees" + 0.071*"user" + 0.068*"eps"'),
 (1,
  '0.134*"user" + 0.131*"trees" + 0.099*"time" + 0.099*"response" + 0.095*"graph" + 0.086*"system" + 0.080*"eps" + 0.060*"interface" + 0.059*"minors" + 0.054*"survey"')]

In [48]:
for doc, as_text in zip(documents, corpus_lda): 
    print(as_text, doc)

[(0, 0.85168225), (1, 0.14831774)] Human machine interface for lab abc computer applications
[(0, 0.5488048), (1, 0.45119512)] A survey of user opinion of computer system response time
[(0, 0.70600754), (1, 0.2939924)] The EPS user interface management system
[(0, 0.8691444), (1, 0.13085559)] System and human system engineering testing of EPS
[(0, 0.15082721), (1, 0.84917283)] Relation of user perceived response time to error measurement
[(0, 0.30285984), (1, 0.69714016)] The generation of random binary unordered trees
[(0, 0.2724569), (1, 0.7275431)] The intersection graph of paths in trees
[(0, 0.56811506), (1, 0.4318849)] Graph minors IV Widths of trees and well quasi ordering
[(0, 0.83587486), (1, 0.16412519)] Graph minors A survey


In [49]:
data = corpora.UciCorpus('docword.xkcd.txt', 'vocab.xkcd.txt')

In [50]:
dictionary = data.create_dictionary()

In [51]:
lda = models.ldamodel.LdaModel(data, id2word=dictionary, passes=10, num_topics=5, alpha=1.25, eta=1.25)

In [52]:
lda.save('lda_xkcd')

In [53]:
for t, top_words in lda.print_topics(): 
    print(t, ':', top_words)

0 : 0.002*"b'text'" + 0.002*"b'title'" + 0.002*"b'paul'" + 0.001*"b'ron'" + 0.001*"b'labeled'" + 0.001*"b'line'" + 0.001*"b'red'" + 0.001*"b'leopard'" + 0.001*"b'hit'" + 0.001*"b'human'"
1 : 0.019*"b'person'" + 0.008*"b'text'" + 0.008*"b'girl'" + 0.008*"b'title'" + 0.006*"b'one'" + 0.005*"b'guy'" + 0.003*"b'right'" + 0.003*"b'boy'" + 0.003*"b'people'" + 0.003*"b'just'"
2 : 0.005*"b'person'" + 0.003*"b'guy'" + 0.002*"b'text'" + 0.001*"b'title'" + 0.001*"b'page'" + 0.001*"b'goggles'" + 0.001*"b'link'" + 0.001*"b'one'" + 0.001*"b'error'" + 0.001*"b'people'"
3 : 0.032*"b'man'" + 0.013*"b'woman'" + 0.010*"b'text'" + 0.008*"b'title'" + 0.005*"b'one'" + 0.005*"b'just'" + 0.004*"b'computer'" + 0.004*"b'two'" + 0.004*"b'figure'" + 0.003*"b'panel'"
4 : 0.012*"b'guy'" + 0.010*"b'hat'" + 0.004*"b'black'" + 0.003*"b'text'" + 0.002*"b'title'" + 0.001*"b'woman'" + 0.001*"b'man'" + 0.001*"b'artist'" + 0.001*"b'elaine'" + 0.001*"b'two'"
