# Topic Analysis - Project Assignment Group 3 

In [None]:
from datasets import load_dataset, get_dataset_config_names
import pyLDAvis
import pandas as pd 
import spacy
from tqdm.auto import tqdm
import random
from sklearn.model_selection import train_test_split
import pycountry
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import pickle 
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS 
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models as gensimvis

from sklearn.metrics import classification_report, confusion_matrix

### Dataset Inspection and Content Extraction 


In the first part of the Topic Analysis section of the project, we extract the _section_ value of every article in the dataset and count how often each value occurs, to see which sections the news articles are assigned to. This gives us a first overview from which to choose our topics.

We will notice that the _section_ column in the dataset is not helpful, since it groups the articles in a sparse way and does not provide useful labels for our topic classification.

Thus, we will have to preprocess each article by first extracting only its _content_ column (ignoring all the others). This text is what we will feed to our LDA model for topic analysis.

In [None]:
# One dataset configuration exists per month; list them all.
subfolders = get_dataset_config_names("RealTimeData/bbc_news_alltime")

# Gather the `section` value of every article across all months.
all_sections = []
for month in subfolders:
    dataset = load_dataset("RealTimeData/bbc_news_alltime", month, split="train")
    if "section" not in dataset.column_names:
        # Nothing to collect from a month without a `section` column.
        continue
    all_sections.extend(dataset["section"])

# Frequency table of the section values (missing values are counted too).
section_df = pd.DataFrame({"section": all_sections}, columns=["section"])
section_counts = (
    section_df["section"]
    .value_counts(dropna=False)
    .reset_index()
)
section_counts.columns = ["Topic", "Count"]


In [None]:
# Count how many articles fall under each section value, keeping missing
# values as their own row, and label the two resulting columns.
section_counts = (
    section_df["section"]
    .value_counts(dropna=False)
    .rename_axis("Topic")
    .reset_index(name="Count")
)

# Lift the row limit so the whole table is printed.
pd.set_option("display.max_rows", None)
print(section_counts)

Loading SpaCy, getting the dataset and splitting between test and train folders (70/30)

In [None]:
# Lightweight spaCy pipeline for sentence splitting: NER and the parser are
# disabled and a rule-based sentencizer is added instead.
preprocessing = spacy.load("en_core_web_sm", disable=["ner", "parser"])
preprocessing.add_pipe("sentencizer")

# Every configuration of the dataset corresponds to one month folder.
subfolders = get_dataset_config_names("RealTimeData/bbc_news_alltime")

# Draw a reproducible sample of 20 month folders.
random.seed(42)
selected_months = random.sample(subfolders, k=20)

# Split the sampled month folders (not individual articles) into train and
# test sets, with 30% of the months held out for testing.
train_months, test_months = train_test_split(
    selected_months,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Collect every non-empty sentence of every article in the training months.
train_sentences = []

# Loop over the month folders assigned to the training set.
for month in tqdm(train_months, desc="Month"):
    dataset = load_dataset("RealTimeData/bbc_news_alltime", month, split="train")
    # Keep only the article body; all other columns are ignored.
    content = [article["content"] for article in dataset]
    # Stream the articles through the sentence splitter in batches.
    for doc in preprocessing.pipe(content, batch_size=32):
        for sent in doc.sents:
            text = sent.text.strip()
            if text:
                # No per-sentence print here: echoing every sentence floods
                # the cell output; the preview below shows a sample.
                train_sentences.append(text)

# Quick sanity check on the first few sentences.
print(train_sentences[:5])

In [6]:
# Show the first five training sentences, one per line.
preview = train_sentences[:5]
for x in preview:
    print(x)

Theresa May was in Poland to sign a defence treaty with the country

Theresa May has sought to reassure Polish people living in the UK that they are still welcome after Brexit.
Speaking on a trip to Warsaw to sign a new defence treaty with the country, the PM said the one million Polish residents were a "strong part of [UK] society".
She promised a "simple" and "easy" process to get "settled status" to remain after the UK leaves the EU.
The trip comes after Mrs May sacked one of her closest allies, Damian Green.
She asked him to leave after he made "misleading" statements about claims pornography was found on his parliamentary computer.


In [None]:
# Same sentence extraction as for the training set, applied to the months
# held out in `test_months`.
test_sentences = []

for month in tqdm(test_months, desc="Months Analyzed"):
    dataset = load_dataset("RealTimeData/bbc_news_alltime", month, split="train")
    # Only the article body is needed.
    content = [row["content"] for row in dataset]
    # Split each article into sentences and keep the non-empty ones.
    for doc in preprocessing.pipe(content, batch_size=32):
        stripped = (sent.text.strip() for sent in doc.sents)
        test_sentences.extend(text for text in stripped if text)

Months Analyzed:   0%|          | 0/6 [00:00<?, ?it/s]

In [11]:
# Show the first five test sentences, one per line.
preview = test_sentences[:5]
for x in preview:
    print(x)

Amiram Cooper, Guy Gilboa-Dalal and Tsachi Idan are being held in Gaza

Israel says 129 people remain unaccounted for after they were abducted and taken to Gaza during the October 7 attacks by Hamas.
Of these, Israel says that 22 are believed to be dead.
A group representing the families of hostages says that Gadi Haggai, 73, is now believed to have died in Gaza.
An estimated 240 people were taken prisoner, but 105 were later released by Hamas during a six-day ceasefire at the end of November.
These are the stories of those hostages who are still being held, which have either been confirmed by the BBC or credibly reported.


### Data Preprocessing using SpaCy

In this **preprocessing** stage, we transform raw sentences into clean, *lemmatized* tokens optimized for topic modeling. Specifically, we use **SpaCy** to tokenize and lemmatize each sentence, removing stopwords, punctuation, short words, and numbers. 

We also exclude tokens based on their linguistic roles (pronouns, determiners, prepositions, auxiliaries) and named entities (people, organizations, locations, dates, times). Then, we filter out domain-specific noise such as media sources ("BBC", "Reuters"), common journalistic fillers ("said", "today"), possessive pronouns, and country names using a custom exclusion list. 

The resulting cleaned tokens are then structured into a **Gensim-compatible dictionary and corpus**, ready for training the LDA model. 
This also takes into account the labels in the test set provided for the project. In fact, our goal is to align as closely as possible to the topics shown in that test set. Thus, we cut out all the noise we think we may encounter.

In [23]:
# spaCy pipeline used for the token-level filtering; only the dependency
# parser is switched off.
token_pos_ner = spacy.load("en_core_web_sm", disable=["parser"])
stopwords = token_pos_ner.Defaults.stop_words

# Lower-cased country names taken from pycountry.
countries_list = {entry.name.lower() for entry in pycountry.countries}

# Words that are common in newspaper articles but carry no topical signal.
custom_stop = {
    # media outlets
    "bbc", "reuters", "cnn", "guardian", "nytimes", "telegraph", "aljazeera",
    # journalism vocabulary
    "news", "press", "article", "media", "coverage", "broadcast", "report",
    "headline",
    # reporting verbs and nouns
    "says", "said", "told", "claim", "claimed", "statement",
    # vague nouns and indefinite pronouns
    "thing", "stuff", "someone", "anyone", "everyone", "something",
    "everything",
    # generic fillers
    "kind", "sort", "part", "place", "area", "around",
}

# Single lookup set combining all the (customized) word-level exclusions.
all_exclusions = stopwords | custom_stop | countries_list

# Part-of-speech tags to drop.
pos_exclusions = {
    "PRON",
    "DET",
    "ADP",
    "AUX",
    "INTJ",
}

# Named-entity labels to drop.
entities_excl = {
    "PERSON",
    "ORG",
    "GPE",
    "LOC",
    "DATE",
    "TIME",
}


def _clean_sentences(sentences, chunk_size=500):
    """
    Lemmatize and filter a list of sentences with the `token_pos_ner` pipeline.

    A token is kept (as its lower-cased lemma) only when all of these hold:
    the lemma is not in `all_exclusions`, the token is longer than 3
    characters, it is neither punctuation nor number-like, its PoS tag is not
    in `pos_exclusions` and its entity label is not in `entities_excl`.

    Parameters
    ----------
    sentences : list of str
        Sentences to clean; must support `len()` and slicing.
    chunk_size : int
        Number of sentences handed to spaCy per progress-bar step.

    Returns
    -------
    (token_lists, cleaned_strings)
        `token_lists[i]` is the list of kept lemmas of sentence `i` and
        `cleaned_strings[i]` is the same lemmas joined with single spaces.
        Both lists have one entry per input sentence (possibly empty).
    """
    token_lists, cleaned_strings = [], []

    for start in tqdm(range(0, len(sentences), chunk_size), desc="Sentence batches"):
        batch_sents = sentences[start : start + chunk_size]

        for doc in token_pos_ner.pipe(batch_sents, batch_size=16):
            tokens = []
            for token in doc:
                # Lower-case the lemma once and reuse it for test and output.
                lemma = token.lemma_.lower()
                if (
                    lemma not in all_exclusions
                    and len(token) > 3
                    and not token.is_punct
                    and not token.like_num
                    and token.pos_ not in pos_exclusions
                    and token.ent_type_ not in entities_excl
                ):
                    tokens.append(lemma)
            token_lists.append(tokens)
            cleaned_strings.append(" ".join(tokens))

    return token_lists, cleaned_strings


# NOTE: this definition rebinds the notebook-level name `preprocessing`, which
# the sentence-splitting cells use as their spaCy pipeline
# (`preprocessing.pipe(...)`). Re-running those cells after this one requires
# reloading that pipeline first.
def preprocessing(sentence):
    """
    Preprocess the sentences of the train list.

    It removes the PoS tags and entity labels listed above, as well as
    unnecessary words that are not helpful for our topic analysis.

    Parameters
    ----------
    sentence : list of str
        Despite the singular name, the full list of sentences to clean. The
        function processes this argument rather than a global list.

    Returns
    -------
    (token_lists, cleaned_strings)
        Per-sentence lists of kept lemmas and the same lemmas joined into
        space-separated strings.
    """
    return _clean_sentences(sentence)


def preprocessing_test(sentence):
    """
    Preprocess the sentences of the test list.

    Applies exactly the same cleaning as `preprocessing`, so train and test
    tokens are produced by one shared implementation.

    Parameters
    ----------
    sentence : list of str
        Despite the singular name, the full list of sentences to clean. The
        function processes this argument rather than a global list.

    Returns
    -------
    (token_lists, cleaned_strings)
        Per-sentence lists of kept lemmas and the same lemmas joined into
        space-separated strings.
    """
    return _clean_sentences(sentence)

In [None]:
# Clean the training sentences; the call returns the per-sentence token lists
# and the same tokens joined into space-separated strings.
train_tokens, train_tokens_string = preprocessing(train_sentences)


In [24]:
# Clean the test sentences; the call returns the per-sentence token lists
# and the same tokens joined into space-separated strings.
test_tokens, test_tokens_string = preprocessing_test(test_sentences)

Sentence batches:   0%|          | 0/443 [00:00<?, ?it/s]

### Checkpoint 

(The preprocessing is too expensive to rerun every time, so we save the resulting variables to `.pkl` files that can be reloaded later.)

In [None]:
# Persist the preprocessed data so the expensive preprocessing step does not
# have to be repeated: one pickle file per split, each holding the token
# lists and the joined strings.
checkpoints = {
    "train_processed.pkl": {"tokens": train_tokens,
                            "strings": train_tokens_string},
    "test_processed.pkl": {"tokens": test_tokens,
                           "strings": test_tokens_string},
}

for checkpoint_path, payload in checkpoints.items():
    with open(checkpoint_path, "wb") as f:
        pickle.dump(payload, f)

In [None]:
# Reload the preprocessed data from the checkpoint files.
# NOTE: unpickling can execute arbitrary code, so only load checkpoint files
# that were written by this notebook.
with open("train_processed.pkl", "rb") as f:
    data = pickle.load(f)

train_tokens = data["tokens"]
train_token_strings = data["strings"]

with open("test_processed.pkl", "rb") as f:
    data = pickle.load(f)

# Bind the test checkpoint to test_* names; assigning it to the train_* names
# would replace the training data that the LDA model is fit on.
test_tokens = data["tokens"]
test_token_strings = data["strings"]

In [19]:
# Show the cleaned token lists of the first five training sentences.
preview = train_tokens[:5]
for sentence in preview:
    print(sentence)

['theresa', 'sign', 'defence', 'treaty', 'country', 'theresa', 'seek', 'reassure', 'polish', 'people', 'live', 'welcome']
['speak', 'trip', 'sign', 'defence', 'treaty', 'country', 'polish', 'resident', 'strong', 'society']
['promise', 'simple', 'easy', 'process', 'settle', 'status', 'remain', 'leave']
['trip', 'come', 'sack', 'close', 'ally']
['ask', 'leave', 'misleading', 'pornography', 'find', 'parliamentary', 'computer']


### Training LDA model

In [None]:
# Build the vocabulary from the training tokens and prune it with the
# document-frequency thresholds no_below=10 and no_above=0.5.
dictionary = corpora.Dictionary(train_tokens)
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Bag-of-words representation of every training sentence.
bow_dict = list(map(dictionary.doc2bow, train_tokens))

# Train a 10-topic LDA model with a fixed seed for reproducibility.
lda_model = models.LdaModel(
    corpus=bow_dict,
    id2word=dictionary,
    num_topics=10,
    passes=10,
    random_state=42,
    eval_every=1,
    per_word_topics=True,
)

# Print the ten highest-weighted words of each topic.
for topic_id, topic_terms in lda_model.print_topics(num_words=10):
    print(f"Topic {topic_id}: {topic_terms}")


2025-05-31 08:45:50,887 : INFO : -8.561 per-word bound, 377.6 perplexity estimate based on a held-out corpus of 2000 documents with 14985 words
2025-05-31 08:45:50,888 : INFO : PROGRESS: pass 3, at document #532000/625775
2025-05-31 08:45:51,106 : INFO : merging changes from 2000 documents into a model of 625775 documents
2025-05-31 08:45:51,106 : INFO : topic #9 (0.100): 0.025*"right" + 0.014*"court" + 0.012*"display" + 0.012*"death" + 0.011*"post" + 0.011*"strike" + 0.011*"shot" + 0.011*"president" + 0.010*"chief" + 0.010*"attempt"
2025-05-31 08:45:51,123 : INFO : topic #4 (0.100): 0.097*"play" + 0.083*"need" + 0.073*"video" + 0.062*"want" + 0.060*"browser" + 0.050*"use" + 0.050*"policy" + 0.047*"enable" + 0.031*"external" + 0.031*"cookie"
2025-05-31 08:45:51,125 : INFO : topic #0 (0.100): 0.076*"content" + 0.051*"site" + 0.047*"responsible" + 0.045*"continue" + 0.044*"external" + 0.041*"view" + 0.024*"accept" + 0.022*"miss" + 0.022*"choose" + 0.019*"original"
2025-05-31 08:45:51,126

In [None]:
# Install pyLDAvis into the notebook environment. The first cell of this
# notebook already imports pyLDAvis, so on a fresh environment run this
# install before that import cell.
%pip install pyLDAvis

In [None]:
# --------------------------------------------------
# 1.  Pretty-print top words per topic
# --------------------------------------------------
def show_top_words(model, dictionary, n_words=10):
    """
    Print a header followed by one line per topic listing its top words.

    Parameters
    ----------
    model
        Topic model exposing `num_topics` and
        `show_topics(num_topics=..., num_words=..., formatted=False)`.
    dictionary
        Accepted for call compatibility; not used by this function.
    n_words : int
        Number of words shown per topic.
    """
    underline = "-" * 30
    print("TOP WORDS PER TOPIC\n" + underline)

    all_topics = model.show_topics(
        num_topics=model.num_topics,
        num_words=n_words,
        formatted=False,
    )
    for topic_id, word_probs in all_topics:
        # Each entry is a (word, probability) pair; only the word is shown.
        top_terms = [word for word, _prob in word_probs]
        words = ", ".join(top_terms)
        print(f"Topic {topic_id:>2} ▶ {words}")

# Print the twelve top words of every topic of the trained model.
show_top_words(lda_model, dictionary, n_words=12)

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Render the interactive visualisation inline in the notebook.
pyLDAvis.enable_notebook()
# `bow_dict` is the bag-of-words corpus the LDA model was trained on; no
# variable named `bow_corpus` exists in this notebook.
vis = gensimvis.prepare(lda_model, bow_dict, dictionary)
vis