In [1]:
# import concatenated reviews data from file and store in a dataframe
import pandas as pd
df = pd.read_csv('concatenated_reviews.csv')

In [2]:
df['title_body'] = df['title'] + ' ' + df['body']

In [3]:
docs = df['title_body'].astype(str).tolist()

source: https://radimrehurek.com/gensim_3.8.3/parsing/preprocessing.html

In [4]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_non_alphanum

In [5]:
docs = [strip_tags(doc) for doc in docs]
docs = [strip_non_alphanum(doc) for doc in docs]
docs = [strip_multiple_whitespaces(doc) for doc in docs]
docs = [strip_short(doc, minsize=3) for doc in docs]
docs = [strip_numeric(doc) for doc in docs]
docs = [remove_stopwords(doc) for doc in docs]
docs = [strip_punctuation(doc) for doc in docs]
docs = [preprocess_string(doc) for doc in docs]

In [6]:
# create a dictionary from the documents
from gensim import corpora
dictionary = corpora.Dictionary(docs)

In [7]:
# create a corpus from the documents
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [8]:
from gensim.models import LdaModel
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=15)

In [9]:
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.021*"book" + 0.015*"feyr" + 0.015*"tamlin" + 0.014*"like" + 0.010*"law"')
(1, '0.062*"dad" + 0.040*"gift" + 0.037*"joke" + 0.025*"book" + 0.020*"good"')
(2, '0.056*"book" + 0.035*"read" + 0.015*"love" + 0.013*"charact" + 0.012*"like"')
(3, '0.029*"book" + 0.017*"love" + 0.015*"mile" + 0.012*"like" + 0.012*"tate"')
(4, '0.021*"book" + 0.008*"great" + 0.007*"que" + 0.006*"time" + 0.006*"read"')
(5, '0.026*"book" + 0.015*"feyr" + 0.013*"love" + 0.012*"court" + 0.010*"charact"')
(6, '0.040*"love" + 0.030*"book" + 0.020*"read" + 0.018*"feyr" + 0.015*"charact"')
(7, '0.061*"book" + 0.031*"great" + 0.018*"read" + 0.012*"help" + 0.011*"parent"')
(8, '0.020*"book" + 0.016*"love" + 0.008*"bluei" + 0.008*"feyr" + 0.008*"charact"')
(9, '0.029*"book" + 0.013*"milli" + 0.013*"nina" + 0.011*"read" + 0.008*"stori"')


In [10]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print(coherence_score)

0.31242447248070115
