In [1]:
import re

import nltk
import numpy as np
import pandas as pd
import pyLDAvis
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/dsrivallabha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dsrivallabha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/dsrivallabha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 01 - Loading Data

In [3]:
# data source: Kaggle - https://www.kaggle.com/datasets/hgultekin/bbcnewsarchive
df = pd.read_csv("../Data/bbc-news-data.csv", sep="\t")

In [4]:
df.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [5]:
df.shape

(2225, 4)

In [6]:
df["category"].value_counts()

category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

In [7]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess(text):
    # remove punctuation and lowercase
    text = re.sub(r"[^a-zA-Z]", " ", text).lower()

    tokens = nltk.word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    ]

    return " ".join(tokens)


df["clean_content"] = df["content"].apply(preprocess)

In [8]:
df.head()

Unnamed: 0,category,filename,title,content,clean_content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,quarterly profit medium giant timewarner jumpe...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,dollar hit highest level euro almost three mon...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,owner embattled russian oil giant yukos ask bu...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,british airway blamed high fuel price drop pro...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,share drink food firm allied domecq risen spec...


In [9]:
# Parameters
# max_df: ignore words that occur in more than x% of documents
# min_df: ignore words that occur in less than n documents
# good combination: max_df=0.6, min_df=10
# poor combination: max_df=0.95, min_df=2
vectorizer = TfidfVectorizer(max_df=0.6, min_df=10)
tfidf = vectorizer.fit_transform(df["clean_content"])

In [10]:
num_topics = 4  # choose your number of topics
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(tfidf)

0,1,2
,"n_components  n_components: int, default=10 Number of topics. .. versionchanged:: 0.19  ``n_topics`` was renamed to ``n_components``",4
,"doc_topic_prior  doc_topic_prior: float, default=None Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `alpha`.",
,"topic_word_prior  topic_word_prior: float, default=None Prior of topic word distribution `beta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `eta`.",
,"learning_method  learning_method: {'batch', 'online'}, default='batch' Method used to update `_component`. Only used in :meth:`fit` method. In general, if the data size is large, the online update will be much faster than the batch update. Valid options: - 'batch': Batch variational Bayes method. Use all training data in each EM  update. Old `components_` will be overwritten in each iteration. - 'online': Online variational Bayes method. In each EM update, use mini-batch  of training data to update the ``components_`` variable incrementally. The  learning rate is controlled by the ``learning_decay`` and the  ``learning_offset`` parameters. .. versionchanged:: 0.20  The default learning method is now ``""batch""``.",'batch'
,"learning_decay  learning_decay: float, default=0.7 It is a parameter that control learning rate in the online learning method. The value should be set between (0.5, 1.0] to guarantee asymptotic convergence. When the value is 0.0 and batch_size is ``n_samples``, the update method is same as batch learning. In the literature, this is called kappa.",0.7
,"learning_offset  learning_offset: float, default=10.0 A (positive) parameter that downweights early iterations in online learning. It should be greater than 1.0. In the literature, this is called tau_0.",10.0
,"max_iter  max_iter: int, default=10 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the :meth:`fit` method, and not the :meth:`partial_fit` method.",10
,"batch_size  batch_size: int, default=128 Number of documents to use in each EM iteration. Only used in online learning.",128
,"evaluate_every  evaluate_every: int, default=-1 How often to evaluate perplexity. Only used in `fit` method. set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time up to two-fold.",-1
,"total_samples  total_samples: int, default=1e6 Total number of documents. Only used in the :meth:`partial_fit` method.",1000000.0


In [11]:
feature_names = vectorizer.get_feature_names_out()


def display_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic #{topic_idx+1}:")
        top_words = [feature_names[i] for i in topic.argsort()[: -n_top_words - 1 : -1]]
        print("  " + ", ".join(top_words))


display_topics(lda, feature_names)


Topic #1:
  film, best, award, game, win, star, first, play, final, time

Topic #2:
  labour, election, party, would, government, blair, minister, tory, brown, england

Topic #3:
  bank, company, market, growth, economy, price, firm, share, sale, rate

Topic #4:
  people, mobile, phone, technology, music, service, game, user, computer, new


In [12]:
topic_probabilities = lda.transform(tfidf)  # doc-topic matrix

# Assign the most likely topic
df["topic"] = topic_probabilities.argmax(axis=1) + 1  # +1 for human-friendly numbering

print("\nDocument Topic Assignments:")
print(df[["clean_content", "topic"]])

# Optional: show full probability distribution
df["topic_distribution"] = topic_probabilities.tolist()


Document Topic Assignments:
                                          clean_content  topic
0     quarterly profit medium giant timewarner jumpe...      3
1     dollar hit highest level euro almost three mon...      3
2     owner embattled russian oil giant yukos ask bu...      3
3     british airway blamed high fuel price drop pro...      3
4     share drink food firm allied domecq risen spec...      3
...                                                 ...    ...
2220  introducing two initiative help beat rogue dia...      4
2221  computer user across world continue ignore sec...      4
2222  new european directive could put software writ...      4
2223  man making sure computer network safe secure r...      4
2224  online role playing game time consuming enthra...      4

[2225 rows x 2 columns]


In [13]:
df["topic"].unique(), df["topic"].value_counts()

(array([3, 4, 2, 1]),
 topic
 1    735
 4    544
 2    540
 3    406
 Name: count, dtype: int64)

In [14]:
df["topic_distribution"].head()

0    [0.026375726913369216, 0.027637034686733824, 0...
1    [0.024957544023003556, 0.02979484339554737, 0....
2    [0.03310239518880292, 0.03287670154434908, 0.9...
3    [0.030954041158642746, 0.026691277988652864, 0...
4    [0.03080422443002238, 0.10110175597792465, 0.7...
Name: topic_distribution, dtype: object

In [15]:
df.head()

Unnamed: 0,category,filename,title,content,clean_content,topic,topic_distribution
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,quarterly profit medium giant timewarner jumpe...,3,"[0.026375726913369216, 0.027637034686733824, 0..."
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,dollar hit highest level euro almost three mon...,3,"[0.024957544023003556, 0.02979484339554737, 0...."
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,owner embattled russian oil giant yukos ask bu...,3,"[0.03310239518880292, 0.03287670154434908, 0.9..."
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,british airway blamed high fuel price drop pro...,3,"[0.030954041158642746, 0.026691277988652864, 0..."
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,share drink food firm allied domecq risen spec...,3,"[0.03080422443002238, 0.10110175597792465, 0.7..."


In [16]:
df[df["category"] == "business"]["topic"].value_counts()

topic
3    405
4     55
2     49
1      1
Name: count, dtype: int64

In [17]:
df[df["category"] == "sport"]["topic"].value_counts()

topic
1    397
2     93
4     21
Name: count, dtype: int64

In [18]:
from sklearn.metrics import (ConfusionMatrixDisplay, accuracy_score,
                             confusion_matrix)

In [19]:
results = pd.DataFrame(df.groupby("category").topic.agg("value_counts").reset_index())
results

Unnamed: 0,category,topic,count
0,business,3,405
1,business,4,55
2,business,2,49
3,business,1,1
4,entertainment,1,323
5,entertainment,4,55
6,entertainment,2,8
7,politics,2,389
8,politics,4,24
9,politics,1,3


In [20]:
results.pivot(index="category", columns="topic")["count"].reset_index()

topic,category,1,2,3,4
0,business,1.0,49.0,405.0,55.0
1,entertainment,323.0,8.0,,55.0
2,politics,3.0,389.0,1.0,24.0
3,sport,397.0,93.0,,21.0
4,tech,11.0,1.0,,389.0


In [21]:
def prepare_ldavis(lda_model, count_data, vectorizer):
    """
    Create the data structure required for pyLDAvis
    for scikit-learn LatentDirichletAllocation models.
    """
    vocab = vectorizer.get_feature_names_out()
    term_freq = np.asarray(count_data.sum(axis=0)).ravel()

    return pyLDAvis.prepare(
        topic_term_dists=lda_model.components_
        / lda_model.components_.sum(axis=1)[:, None],
        doc_topic_dists=lda_model.transform(count_data),
        doc_lengths=count_data.sum(axis=1).A.ravel(),
        vocab=vocab,
        term_frequency=term_freq,
    )

In [22]:
print("\nGenerating pyLDAvis visualization...")

panel = prepare_ldavis(lda, tfidf, vectorizer)

pyLDAvis.save_html(panel, "lda_visualization.html")

print("Saved visualization as lda_visualization.html")


Generating pyLDAvis visualization...


  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


Saved visualization as lda_visualization.html
