In [1]:
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 Newsgroups corpus (subset='all') and keep only the raw
# message texts stored under its `data` field.
data = fetch_20newsgroups(subset='all').data

In [3]:
# Wrap the raw texts in a single-column DataFrame (the column is labelled 0).
df = pd.DataFrame(data=data)

In [4]:
# Peek at the first five rows to confirm the texts loaded as expected.
df.head(n=5)

Unnamed: 0,0
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...
3,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...
4,From: Alexander Samuel McDiarmid <am2o+@andrew...


In [5]:
# Pull out the only column (label 0) as a Series and make sure every entry is a str.
docs = df.loc[:, 0].astype(str)

In [6]:
# Display the raw documents; pandas abbreviates the long Series in its repr.
docs

0        From: Mamatha Devineni Ratnam <mr47+@andrew.cm...
1        From: mblawson@midway.ecn.uoknor.edu (Matthew ...
2        From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...
3        From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...
4        From: Alexander Samuel McDiarmid <am2o+@andrew...
                               ...                        
18841    From: jim.zisfein@factory.com (Jim Zisfein) \n...
18842    From: rdell@cbnewsf.cb.att.com (richard.b.dell...
18843    From: westes@netcom.com (Will Estes)\nSubject:...
18844    From: steve@hcrlgw (Steven Collins)\nSubject: ...
18845    From: chriss@netcom.com (Chris Silvester)\nSub...
Name: 0, Length: 18846, dtype: object

Preprocessing with spaCy

In [7]:
import spacy
# Load spaCy's small English pipeline; it is reused for all the preprocessing below.
nlp = spacy.load(name='en_core_web_sm')

In [8]:
# tokenize
# Stream the texts through spaCy with `nlp.pipe`, which processes them in
# batches, instead of invoking `nlp` once per row via `Series.apply`.
# The parser and NER components are skipped because the following cells only
# read `token.is_stop` and `token.lemma_`, neither of which needs them.
# The original index and name are kept so the result lines up with `docs`.
docs = pd.Series(
    list(nlp.pipe(docs, disable=['parser', 'ner'])),
    index=docs.index,
    name=docs.name,
)

In [9]:
# stop words
from spacy.lang.en.stop_words import STOP_WORDS

# Drop stop words: keep only the tokens whose `is_stop` flag is false.
docs = docs.apply(
    lambda doc: list(filter(lambda tok: not tok.is_stop, doc))
)

In [10]:
# Replace each remaining token with its lemma string (`token.lemma_`),
# so every document becomes a plain list of str from here on.
docs = docs.apply(
    lambda tokens: list(map(lambda tok: tok.lemma_, tokens))
)

In [11]:
# Display the documents after stop-word removal and lemmatization
# (each entry is now a list of lemma strings).
docs

0        [:, Mamatha, Devineni, Ratnam, <, mr47+@andrew...
1        [:, mblawson@midway.ecn.uoknor.edu, (, Matthew...
2        [:, hilmi-er@dsv.su.se, (, Hilmi, Eren, ), \n,...
3        [:, guyd@austin.ibm.com, (, Guy, Dawson, ), \n...
4        [:, Alexander, Samuel, McDiarmid, <, am2o+@and...
                               ...                        
18841    [:, jim.zisfein@factory.com, (, Jim, Zisfein, ...
18842    [:, rdell@cbnewsf.cb.att.com, (, richard.b.del...
18843    [:, westes@netcom.com, (, Estes, ), \n, subjec...
18844    [:, steve@hcrlgw, (, Steven, Collins, ), \n, s...
18845    [:, chriss@netcom.com, (, Chris, Silvester, ),...
Name: 0, Length: 18846, dtype: object

In [12]:
# Discard punctuation, numbers, e-mail addresses and any other lemma that is
# not purely alphabetic.
docs = docs.apply(lambda lemmas: list(filter(str.isalpha, lemmas)))

In [13]:
# Normalise case so that differently-cased spellings share one vocabulary entry.
docs = docs.apply(lambda lemmas: list(map(str.lower, lemmas)))

In [14]:
from gensim import corpora, models
from gensim.models import CoherenceModel

In [15]:
# Build the gensim vocabulary (token <-> integer id mapping) from the cleaned documents.
dictionary = corpora.Dictionary(documents=docs)

In [16]:
# Convert every document to its bag-of-words representation using the dictionary.
corpus = list(map(dictionary.doc2bow, docs))

In [17]:
# Calculate coherence scores for different numbers of topics
# LDA training is stochastic, so a fixed seed is passed to every model; without
# it the selected topic count (and the final model) change from run to run.
RANDOM_SEED = 42
# Candidate topic counts, defined once so the sweep and the lookup below
# can never drift apart.
topic_counts = range(2, 50, 2)

coherence_scores = []
for num_topics in topic_counts:
    # NOTE: the sweep models are trained without `passes` (gensim's default),
    # whereas the final model below uses passes=15.
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        random_state=RANDOM_SEED,
    )
    coherence_model = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)

# Find the number of topics with the highest coherence score
# (on ties, the smallest topic count wins because `index` returns the first match)
optimal_num_topics = topic_counts[coherence_scores.index(max(coherence_scores))]

# Create the LDA model with the optimal number of topics
lda_model = models.LdaModel(
    corpus,
    num_topics=optimal_num_topics,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_SEED,
)

In [18]:
# visualize the topics
# Newer pyLDAvis releases renamed the `pyLDAvis.gensim` module to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell runs on either version.
import pyLDAvis
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


Evaluation

In [19]:
# Score the final model: c_v coherence at the selected number of topics.
coherence_model = CoherenceModel(
    coherence='c_v',
    dictionary=dictionary,
    texts=docs,
    model=lda_model,
)
coherence_score = coherence_model.get_coherence()
print(coherence_score)

0.6064187211551243
