In [1]:
# 1. Import Libraries
import nltk
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download NLTK resources (reported as already up-to-date when present).
# 'punkt_tab' is what word_tokenize() loads on NLTK 3.9 and later, which no
# longer reads the pickled 'punkt' models; 'punkt' is kept so the notebook
# still runs on older NLTK releases.
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\60139\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\60139\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\60139\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# 2. Load the Data
# Toy corpus of five news headlines: the first two are about tennis
# (Nadal / Federer), the last three are about Biden and the virus.
# Each string is treated as one document by the cells below.
documents = [
    "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
    "Rafael Nadal Is Out of the Australian Open",
    "Biden Announces Virus Measures",
    "Biden's Virus Plans Meet Reality",
    "Where Biden's Virus Plan Stands"
]

In [7]:
# 3. Preprocess the Data
# Shared resources, built once and reused by preprocess_text() for every document.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership test stays cheap.
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Convert a raw string into a list of cleaned, lemmatized tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``. Tokens
    that are not purely alphanumeric (punctuation, "'s", "u.s.", ...) or that
    appear in the module-level ``stop_words`` set are dropped, and each
    surviving token is passed through the module-level ``lemmatizer``.

    Args:
        text: The document to process.

    Returns:
        The remaining lemmatized tokens, in their original order.
    """
    lowered = text.lower()
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(lowered)
        if word.isalnum() and word not in stop_words
    ]

# One token list per headline, in the same order as `documents`.
preprocessed_documents = list(map(preprocess_text, documents))

In [9]:
# 4. Create a Document-Term Matrix
# Vocabulary: assigns an integer id to every distinct token.
dictionary = corpora.Dictionary(preprocessed_documents)
# Bag-of-words encoding of each document, in document order.
corpus = list(map(dictionary.doc2bow, preprocessed_documents))

In [11]:
# 5. Run LDA
# LDA training is randomly initialised; pinning the seed keeps topic ids and
# term weights stable between runs so the tables below can be reproduced.
RANDOM_SEED = 42
lda_model = LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_SEED,
)

In [15]:
# 6. Interpret Results
# Label each article with its dominant topic, i.e. the topic id carrying the
# highest probability in that document's topic distribution. The vectors in
# `corpus` were built from `preprocessed_documents` with this same dictionary,
# so they are reused rather than recomputed.
article_labels = [
    max(lda_model.get_document_topics(bow), key=lambda pair: pair[1])[0]
    for bow in corpus
]

# One row per article: the original headline next to its dominant topic id.
df = pd.DataFrame({"Article": documents, "Topic": article_labels})

# Print the DataFrame
print("Table with Articles and Topic:")
print(df)
print()

Table with Articles and Topic:
                                             Article  Topic
0  Rafael Nadal Joins Roger Federer in Missing U....      1
1         Rafael Nadal Is Out of the Australian Open      1
2                     Biden Announces Virus Measures      0
3                   Biden's Virus Plans Meet Reality      0
4                    Where Biden's Virus Plan Stands      0



In [17]:
# Print the top terms for each topic
# Ask gensim for structured (word, weight) pairs instead of re-parsing the
# formatted 'weight*"word" + ...' string from print_topics(): splitting on
# "+" and "*" breaks as soon as a vocabulary term contains either character.
# num_topics=-1 lists every topic (print_topics() stops at 20 by default).
print("Top Terms for Each Topic:")
for idx, terms in lda_model.show_topics(num_topics=-1, num_words=10, formatted=False):
    print(f"Topic {idx}:")
    for word, weight in terms:
        # Same layout as before: quoted word, weight with three decimals.
        print(f'- "{word}" (weight: {weight:.3f})')
    print()

Top Terms for Each Topic:
Topic 0:
- "virus" (weight: 0.166)
- "biden" (weight: 0.166)
- "plan" (weight: 0.119)
- "reality" (weight: 0.071)
- "meet" (weight: 0.071)
- "announces" (weight: 0.071)
- "measure" (weight: 0.071)
- "stand" (weight: 0.071)
- "rafael" (weight: 0.024)
- "open" (weight: 0.024)

Topic 1:
- "nadal" (weight: 0.131)
- "open" (weight: 0.131)
- "rafael" (weight: 0.131)
- "missing" (weight: 0.079)
- "federer" (weight: 0.079)
- "roger" (weight: 0.079)
- "join" (weight: 0.079)
- "australian" (weight: 0.078)
- "stand" (weight: 0.027)
- "plan" (weight: 0.027)

