In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('concatenated_reviews.csv')

In [3]:
# check for duplicates in id column
df['id'].duplicated().sum()

13

In [4]:
# drop duplicates
df.drop_duplicates(subset='id', inplace=True)

In [5]:
# check for null values in body column
df['body'].isnull().sum()

1

In [6]:
#check for null values in title column
df['title'].isnull().sum()

0

In [7]:
df['title_body'] = df['title'] + ' ' + df['body']

In [8]:
# string
docs = df['title_body'].astype(str)

In [9]:
# merge all documents into one string
all_docs = ' '.join(docs)

# data type all_docs
type(all_docs)

str

Preprocessing with SPACY

In [10]:
import spacy
# the Spacy model
nlp = spacy.load('en_core_web_sm')

In [11]:
# tokenize the text
doc = nlp(all_docs)

In [12]:
# remove stop words
tokens = [token for token in doc if not token.is_stop]

In [13]:
# lemmatize the text
lemmas = [token.lemma_ for token in tokens]

In [14]:
# punctuation and non-alphabetic characters
words = [lemma for lemma in lemmas if lemma.isalpha()]

In [15]:
# lower case
words = [word.lower() for word in words]

In [16]:
# remove words that are not in the Spacy vocabulary
words = [word for word in words if nlp.vocab[word].is_stop == False]

In [17]:
# remove everthing that is not a word
words = [word for word in words if nlp.vocab[word].is_punct == False]

In [18]:
words = [word for word in words if nlp(word)[0].pos_ == 'NOUN']

In [19]:
# remove high frequency words
words = [word for word in words if word not in nlp.Defaults.stop_words]

In [20]:
# remove the word 'review' and 'book'
words = [word for word in words if word not in ['review', 'book', 'love']]

In [21]:
from gensim import corpora, models
from gensim.models import CoherenceModel

In [22]:
# dictionary 
dictionary = corpora.Dictionary([words])

In [23]:
# corpus
corpus = [dictionary.doc2bow(words) for word in [words]]

In [24]:
# Calculate coherence scores for different numbers of topics
coherence_scores = []
for num_topics in range(2, 50, 2):
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    coherence_model = CoherenceModel(model=lda_model, texts=[words], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)

# Find the number of topics with the highest coherence score
optimal_num_topics = range(2, 50, 2)[coherence_scores.index(max(coherence_scores))]

# Create the LDA model with the optimal number of topics
lda_model = models.LdaModel(corpus, num_topics=optimal_num_topics, id2word=dictionary, passes=30)

In [25]:
# visualize the topics
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


Evaluation

In [26]:
# coherence_scores of the optimal number of topics
coherence_model = CoherenceModel(model=lda_model, texts=[words], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(coherence_score)


0.29364156044194245
