In [1]:
# imports
from collections import defaultdict
import wget
from gensim import corpora, models
import pandas as pd
import pyLDAvis.gensim
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# load the Dickens novels CSV from the sibling Project_Gutenberg directory
# NOTE(review): the file seems to carry a saved index (it surfaces as an
# 'Unnamed: 0' column) — consider index_col=0 if nothing relies on that column
path = '../Project_Gutenberg/'
file_name = 'dickens_novels.csv'
df = pd.read_csv(f'{path}{file_name}')

In [4]:
# preview the first rows to confirm the author/title/text/lemmas columns loaded
df.head()

Unnamed: 0,author,title,text,lemmas
0,Dickens,Tale,"It was the best of times, it was the worst of ...",good time bad time age wisdom age foolishness ...
1,Dickens,Tale,There were a king with a large jaw and a queen...,king large jaw queen plain face throne england...
2,Dickens,Tale,It was the year of Our Lord one thousand seven...,year lord thousand seven seventy spiritual rev...
3,Dickens,Tale,"France, less favoured on the whole as to matte...",france favour matter spiritual sister shield t...
4,Dickens,Tale,"In England, there was scarcely an amount of or...",england scarcely order protection justify nati...


In [5]:
# prepare data
# extract the pre-lemmatized text, one string per row. A row whose 'lemmas'
# cell is empty is read by pandas as NaN (a float), which has no .lower();
# map it to '' so it becomes an empty token list instead of raising
# AttributeError, while keeping documents aligned with the rows of df.
documents = df['lemmas'].fillna('').to_list()
# tokenize: lowercase, then split on whitespace
texts = [document.lower().split() for document in documents]

In [12]:
# tally how many times each token occurs across every document
frequency = defaultdict(int)
for token in (tok for doc_tokens in texts for tok in doc_tokens):
    frequency[token] = frequency[token] + 1

In [13]:
# sanity check: confirm that 'the' does not occur in the lemmatized tokens
# (expected count: 0). Use .get() instead of frequency['the'] because indexing
# a defaultdict inserts the missing key, which would silently add a zero-count
# 'the' entry to the table just by looking at it.
frequency.get('the', 0)

0

In [14]:
# keep only tokens seen at least twice in the whole corpus; a token that
# occurs a single time is dropped from its document
texts = [
    [tok for tok in doc_tokens if frequency[tok] >= 2]
    for doc_tokens in texts
]

In [15]:
# build the gensim token <-> integer id mapping from the filtered documents
dictionary = corpora.Dictionary(documents=texts)

In [16]:
# convert every tokenized document to its bag-of-words representation
corpus = list(map(dictionary.doc2bow, texts))

In [17]:
# Train a 20-topic LDA model with 60 passes over the corpus.
# random_state pins gensim's random number generator: without it every run
# produces different topics, so the topic listings and per-document topic
# mixtures shown below could not be reproduced after a kernel restart.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    passes=60,
    random_state=42,
)

In [18]:
# list the learned topics as (topic id, weighted top-words string) pairs
lda_model.print_topics()

[(0,
  '0.053*"say" + 0.045*"defarge" + 0.033*"madame" + 0.013*"monsieur" + 0.013*"remember" + 0.012*"carton" + 0.012*"sydney" + 0.012*"good" + 0.011*"lady" + 0.011*"spy"'),
 (1,
  '0.059*"say" + 0.052*"know" + 0.031*"dear" + 0.025*"tell" + 0.023*"think" + 0.022*"boy" + 0.019*"father" + 0.017*"shall" + 0.017*"miss" + 0.016*"child"'),
 (2,
  '0.116*"joe" + 0.052*"say" + 0.041*"come" + 0.030*"go" + 0.026*"sister" + 0.020*"get" + 0.019*"home" + 0.017*"day" + 0.016*"night" + 0.015*"good"'),
 (3,
  '0.015*"wine" + 0.013*"street" + 0.012*"shop" + 0.012*"man" + 0.009*"jacques" + 0.009*"drink" + 0.008*"saint" + 0.007*"antoine" + 0.007*"high" + 0.006*"people"'),
 (4,
  '0.063*"say" + 0.038*"gentleman" + 0.023*"laugh" + 0.022*"good" + 0.017*"know" + 0.015*"uncle" + 0.014*"think" + 0.014*"wo" + 0.012*"nephew" + 0.012*"look"'),
 (5,
  '0.027*"say" + 0.024*"man" + 0.021*"know" + 0.019*"mind" + 0.016*"suppose" + 0.015*"good" + 0.013*"see" + 0.013*"day" + 0.013*"young" + 0.013*"miss"'),
 (6,
  '0.017

In [19]:
# topic mixture of the first document, as (topic id, probability) pairs
lda_model.get_document_topics(corpus[0])

[(4, 0.05497697),
 (5, 0.22241381),
 (6, 0.35723075),
 (7, 0.09704776),
 (11, 0.2490883)]

In [20]:
# render the interactive pyLDAvis view of the fitted model inline
# NOTE(review): topic numbering in the pyLDAvis panel may not match the ids
# shown by lda_model.print_topics() — confirm before cross-referencing them
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
vis