In [52]:
import gensim
import pandas as pd

In [3]:
print(gensim.__version__)

3.5.0


In [22]:
doc = 1

In [23]:
import os

# Folder containing all NIPS papers.
data_dir = './TXTS'

# Read all texts into a list.
docs = []
files = os.listdir(data_dir)
for filen in files:
    with open(data_dir + '/' + filen, errors='ignore') as fid:
        txt = fid.read().replace('\n', '')
    docs.append(txt)

In [54]:
pd.DataFrame({"text": docs})

Unnamed: 0,text
0,"[bud1, dsdb]"
1,"[burlington, primitive, non, european, art, in..."
2,"[correction, charles, of, lorraine, audience, ..."
3,"[more, and, more, important, work, roger, fry,..."
4,"[holmes, fry, jaccaci, and, the, art, in, amer..."
5,"[the, second, number, of, the, burlington, mag..."
6,"[retrospect, of, 1909source, the, burlington, ..."
7,"[the, burlington, magazine, and, the, wax, bus..."
8,"[dr, bode, and, the, burlington, magazine, aut..."
9,"[signor, virzi, pictureauthor, dan, fellow, pl..."


In [25]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [31]:
# Lemmatize the documents.

from nltk.stem.wordnet import WordNetLemmatizer

# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [32]:
docs

[['bud1', 'dsdb'],
 ['burlington',
  'primitive',
  'non',
  'european',
  'art',
  'in',
  'the',
  'burlington',
  'magazine',
  'before',
  '1930author',
  'colin',
  'rhodessource',
  'the',
  'burlington',
  'magazine',
  'vol',
  'no',
  'feb',
  'pp',
  '104published',
  'by',
  'burlington',
  'magazine',
  'publication',
  'ltd',
  'stable',
  'url',
  'http',
  'www',
  'jstor',
  'org',
  'stable',
  '20073399accessed',
  'utc',
  'jstor',
  'is',
  'not',
  'for',
  'profit',
  'service',
  'that',
  'help',
  'scholar',
  'researcher',
  'and',
  'student',
  'discover',
  'use',
  'and',
  'build',
  'upon',
  'widerange',
  'of',
  'content',
  'in',
  'trusted',
  'digital',
  'archive',
  'we',
  'use',
  'information',
  'technology',
  'and',
  'tool',
  'to',
  'increase',
  'productivity',
  'andfacilitate',
  'new',
  'form',
  'of',
  'scholarship',
  'for',
  'more',
  'information',
  'about',
  'jstor',
  'please',
  'contact',
  'support',
  'jstor',
  'org',

In [34]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_above=0.5)

2018-08-15 15:30:29,694 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-15 15:30:29,734 : INFO : built Dictionary(8305 unique tokens: ['bud1', 'dsdb', '101continuous', '103continuous', '104continuous']...) from 21 documents (total 55467 corpus positions)
2018-08-15 15:30:29,744 : INFO : discarding 7687 tokens: [('bud1', 1), ('dsdb', 1), ('101continuous', 1), ('103continuous', 1), ('104continuous', 1), ('104published', 1), ('16th', 3), ('1930author', 1), ('19th', 3), ('20073399accessed', 1)]...
2018-08-15 15:30:29,744 : INFO : keeping 618 tokens which were in no less than 5 and no more than 10 (=50.0%) documents
2018-08-15 15:30:29,747 : INFO : resulting dictionary: Dictionary(618 unique tokens: ['able', 'academy', 'accept', 'account', 'acquisition']...)


In [35]:
# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [36]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 618
Number of documents: 21


In [49]:
# Train LDA model.

from gensim.models import LdaModel

# Set training parameters.
num_topics = 6
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

2018-08-15 15:35:02,063 : INFO : using autotuned alpha, starting with [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667]
2018-08-15 15:35:02,064 : INFO : using serial LDA version on this node
2018-08-15 15:35:02,065 : INFO : running online (multi-pass) LDA training, 6 topics, 20 passes over the supplied corpus of 21 documents, updating model once every 21 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2018-08-15 15:35:02,066 : INFO : PROGRESS: pass 0, at document #21/21
2018-08-15 15:35:02,200 : INFO : optimized alpha [0.11511856, 0.13522185, 0.066689305, 0.1458907, 0.0484263, 0.13374706]
2018-08-15 15:35:02,202 : INFO : topic #4 (0.048): 0.019*"american" + 0.018*"collection" + 0.016*"fry" + 0.012*"section" + 0.011*"holmes" + 0.010*"first_number" + 0.009*"america" + 0.008*"new_york" + 0.008*"drawing" + 0.008*"master"
2018-08-15 15:35:02,202 : INFO : topic #2 (0.067): 0.039*"drawing" + 0.017*"etching" + 0

2018-08-15 15:35:02,413 : INFO : topic #5 (0.063): 0.026*"drawing" + 0.022*"poussin" + 0.018*"exhibition" + 0.016*"september" + 0.015*"anthony" + 0.014*"collection" + 0.012*"architecture" + 0.010*"master" + 0.010*"august" + 0.009*"october"
2018-08-15 15:35:02,413 : INFO : topic #1 (0.074): 0.035*"fry" + 0.014*"european" + 0.008*"read" + 0.007*"op" + 0.007*"culture" + 0.007*"roger" + 0.007*"national" + 0.007*"ii" + 0.007*"drawing" + 0.007*"der"
2018-08-15 15:35:02,414 : INFO : topic diff=0.064028, rho=0.377964
2018-08-15 15:35:02,416 : INFO : PROGRESS: pass 6, at document #21/21
2018-08-15 15:35:02,444 : INFO : optimized alpha [0.050915107, 0.0699259, 0.030456554, 0.052823924, 0.026252437, 0.05940056]
2018-08-15 15:35:02,446 : INFO : topic #4 (0.026): 0.020*"first_number" + 0.014*"illustration" + 0.013*"will_be" + 0.013*"we_have" + 0.013*"periodical" + 0.011*"plate" + 0.011*"paper" + 0.011*"half" + 0.011*"gazette" + 0.010*"high"
2018-08-15 15:35:02,447 : INFO : topic #2 (0.030): 0.034*"

2018-08-15 15:35:02,609 : INFO : topic #5 (0.050): 0.025*"drawing" + 0.022*"poussin" + 0.018*"exhibition" + 0.016*"september" + 0.015*"anthony" + 0.014*"collection" + 0.012*"architecture" + 0.010*"master" + 0.010*"august" + 0.008*"october"
2018-08-15 15:35:02,610 : INFO : topic #1 (0.059): 0.031*"fry" + 0.015*"european" + 0.009*"read" + 0.008*"culture" + 0.007*"drawing" + 0.007*"der" + 0.007*"ii" + 0.007*"dr" + 0.007*"national" + 0.006*"non"
2018-08-15 15:35:02,611 : INFO : topic diff=0.022562, rho=0.277350
2018-08-15 15:35:02,615 : INFO : PROGRESS: pass 12, at document #21/21
2018-08-15 15:35:02,636 : INFO : optimized alpha [0.0386014, 0.057201535, 0.022871781, 0.039969973, 0.020772632, 0.048485104]
2018-08-15 15:35:02,638 : INFO : topic #4 (0.021): 0.018*"first_number" + 0.013*"illustration" + 0.012*"will_be" + 0.012*"we_have" + 0.012*"periodical" + 0.011*"plate" + 0.010*"paper" + 0.010*"half" + 0.010*"high" + 0.010*"gazette"
2018-08-15 15:35:02,638 : INFO : topic #2 (0.023): 0.030*"

2018-08-15 15:35:02,798 : INFO : topic #5 (0.044): 0.025*"drawing" + 0.021*"poussin" + 0.017*"exhibition" + 0.016*"september" + 0.014*"anthony" + 0.014*"collection" + 0.012*"architecture" + 0.010*"master" + 0.009*"august" + 0.008*"october"
2018-08-15 15:35:02,799 : INFO : topic #1 (0.051): 0.029*"fry" + 0.015*"european" + 0.009*"read" + 0.008*"culture" + 0.008*"drawing" + 0.007*"der" + 0.007*"dr" + 0.006*"non" + 0.006*"ii" + 0.006*"national"
2018-08-15 15:35:02,799 : INFO : topic diff=0.013777, rho=0.229416
2018-08-15 15:35:02,802 : INFO : PROGRESS: pass 18, at document #21/21
2018-08-15 15:35:02,825 : INFO : optimized alpha [0.033238247, 0.04953695, 0.019196494, 0.03431687, 0.017864032, 0.04313638]
2018-08-15 15:35:02,827 : INFO : topic #4 (0.018): 0.017*"first_number" + 0.012*"illustration" + 0.012*"will_be" + 0.011*"we_have" + 0.011*"periodical" + 0.010*"plate" + 0.010*"paper" + 0.009*"half" + 0.009*"high" + 0.009*"gazette"
2018-08-15 15:35:02,828 : INFO : topic #2 (0.019): 0.029*"d

CPU times: user 770 ms, sys: 137 ms, total: 907 ms
Wall time: 799 ms


In [50]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -0.6128.
[([(0.04982263, 'fry'),
   (0.020442614, 'holmes'),
   (0.01992519, 'american'),
   (0.019341514, 'collection'),
   (0.012865943, 'section'),
   (0.011722474, 'america'),
   (0.011219491, 'march'),
   (0.010747045, 'ibid'),
   (0.00972268, 'roger'),
   (0.009482493, 'roger_fry'),
   (0.008330205, 'cit'),
   (0.008269081, 'op'),
   (0.008257888, 'op_cit'),
   (0.007351685, 'york'),
   (0.0073387674, 'new_york'),
   (0.0072999257, 'national'),
   (0.0072744447, 'august'),
   (0.0069008004, 'home'),
   (0.006844556, 'tone'),
   (0.0067402516, 'fig')],
  -0.5617223634635353),
 ([(0.024802752, 'drawing'),
   (0.020630341, 'poussin'),
   (0.017181966, 'exhibition'),
   (0.016096015, 'september'),
   (0.014167651, 'anthony'),
   (0.014108071, 'collection'),
   (0.012348512, 'architecture'),
   (0.009522924, 'master'),
   (0.009318345, 'august'),
   (0.008036637, 'october'),
   (0.0073349415, 'then'),
   (0.0068901526, 'la'),
   (0.006368306, 'catalogue'),
   

In [51]:
import pyLDAvis
import pyLDAvis.gensim
lda_vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.display(lda_vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
