In [1]:
# Import libraries for this question here
import pandas as pd
import numpy as np
from collections import Counter # To count rows
import re

In [4]:

# Location of the ABC news headlines CSV (absolute path on the author's machine)
csv_path = 'D:/ML/abcnews-date-text.csv'
news_data = pd.read_csv(csv_path)

# Report the frame's (rows, columns); this run gave 1,244,184 rows and 2 columns
print(news_data.shape)

(1244184, 2)


In [5]:
# Preview the first 20 headlines
news_data.head(n=20)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
5,20030219,ambitious olsson wins triple jump
6,20030219,antic delighted with record breaking barca
7,20030219,aussie qualifier stosur wastes four memphis match
8,20030219,aust addresses un security council over iraq
9,20030219,australia is locked into war timetable opp


In [6]:
# Establish the span of publication dates covered by the data set
publish_dates = news_data['publish_date']
print("Recent Date:", publish_dates.max())
print("Oldest Date:", publish_dates.min())

# Result: the data runs from 19/02/2003 to 31/12/2021

Recent Date: 20211231
Oldest Date: 20030219


In [7]:
# Check for unusual date entries by counting how many characters each one has
news_data['publish_date'].astype(str).str.len().value_counts()
# Every publish_date entry is 8 digits long

publish_date
8    1244184
Name: count, dtype: int64

In [8]:
# Parse the yyyymmdd values into real datetimes
parsed_dates = pd.to_datetime(news_data['publish_date'], format='%Y%m%d')

# Store the dates back as 'dd/mm/yyyy' strings.
# NOTE: this cell is not re-runnable - once the column holds 'dd/mm/yyyy'
# strings, the '%Y%m%d' parse above no longer matches them.
news_data['publish_date'] = parsed_dates.dt.strftime('%d/%m/%Y')

news_data.head()

Unnamed: 0,publish_date,headline_text
0,19/02/2003,aba decides against community broadcasting lic...
1,19/02/2003,act fire witnesses must be aware of defamation
2,19/02/2003,a g calls for infrastructure protection summit
3,19/02/2003,air nz staff in aust strike for pay rise
4,19/02/2003,air nz strike to affect australian travellers


In [9]:
# Derive calendar columns from the 'dd/mm/yyyy' strings in 'publish_date'.
# The strings are parsed once and split once, instead of re-splitting every
# row in a separate Python lambda for each derived column.

# Day of week (e.g., Monday). day_name() returns English names by default,
# matching the hard-coded English month names below regardless of system locale.
publish_dt = pd.to_datetime(news_data['publish_date'], format='%d/%m/%Y')
news_data['day_of_week'] = publish_dt.dt.day_name()

# Columns 0, 1, 2 of the split hold the day, month and year parts as strings
date_parts = news_data['publish_date'].str.split('/', expand=True)

# Day of month, kept as a zero-padded string (e.g., '19')
news_data['day'] = date_parts[0]

# Replace numeric month values with month names
month_names = {
    '01': 'January', '02': 'February', '03': 'March', '04': 'April',
    '05': 'May', '06': 'June', '07': 'July', '08': 'August',
    '09': 'September', '10': 'October', '11': 'November', '12': 'December'
}
news_data['month'] = date_parts[1].map(month_names)

# Year, kept as a string (e.g., '2021')
news_data['year'] = date_parts[2]

news_data.head()

Unnamed: 0,publish_date,headline_text,day_of_week,day,month,year
0,19/02/2003,aba decides against community broadcasting lic...,Wednesday,19,February,2003
1,19/02/2003,act fire witnesses must be aware of defamation,Wednesday,19,February,2003
2,19/02/2003,a g calls for infrastructure protection summit,Wednesday,19,February,2003
3,19/02/2003,air nz staff in aust strike for pay rise,Wednesday,19,February,2003
4,19/02/2003,air nz strike to affect australian travellers,Wednesday,19,February,2003


In [10]:
# Seasons and the months that belong to them (December-February is Summer)
season_months = {
    'Summer': ('December', 'January', 'February'),
    'Autumn': ('March', 'April', 'May'),
    'Winter': ('June', 'July', 'August'),
    'Spring': ('September', 'October', 'November'),
}

# Invert the table into a month -> season lookup
month_to_season = {
    month: season
    for season, months in season_months.items()
    for month in months
}

# Label every row with the season its 'month' falls in (None if unmapped)
news_data['season'] = news_data['month'].apply(month_to_season.get)

news_data.head()

Unnamed: 0,publish_date,headline_text,day_of_week,day,month,year,season
0,19/02/2003,aba decides against community broadcasting lic...,Wednesday,19,February,2003,Summer
1,19/02/2003,act fire witnesses must be aware of defamation,Wednesday,19,February,2003,Summer
2,19/02/2003,a g calls for infrastructure protection summit,Wednesday,19,February,2003,Summer
3,19/02/2003,air nz staff in aust strike for pay rise,Wednesday,19,February,2003,Summer
4,19/02/2003,air nz strike to affect australian travellers,Wednesday,19,February,2003,Summer


In [13]:
# Import necessary libraries
import matplotlib.pyplot as plt
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim import models
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk

# Import left disabled; gensimvis is not referenced in the cells visible here
#import pyLDAvis.gensim_models as gensimvis

# Fetch NLTK's 'stopwords' corpus, which stopwords.words('english') reads
# in the tokenisation cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phani\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [14]:
# Tokeniser that keeps runs of word characters, so punctuation never becomes a token
tokenizer = RegexpTokenizer(r'\w+')
# English stopwords to filter out of every headline
stop_words = set(stopwords.words('english'))

# One list of lower-cased, stopword-free tokens per headline, in row order
texts = [
    [token for token in tokenizer.tokenize(headline.lower()) if token not in stop_words]
    for headline in news_data['headline_text']
]

# Vocabulary mapping each distinct token to an integer ID
dictionary = corpora.Dictionary(texts)

# Document-term matrix (corpus): the bag-of-words form of every headline
corpus = list(map(dictionary.doc2bow, texts))

In [15]:
# How many topics the LDA model should learn
num_topics = 20

# Fit the LDA model on the bag-of-words corpus with a fixed random seed
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
)

In [16]:
# Token list for the first headline, pre-processed exactly like the training
# corpus (same tokenizer, lower-casing and stop-word removal). A bare .split()
# would leave punctuation attached to words, so those tokens would not match
# the dictionary and doc2bow would silently drop them.
first_headline = news_data.loc[0, 'headline_text']
document = [
    word for word in tokenizer.tokenize(first_headline.lower())
    if word not in stop_words
]

# Convert the document to a bag-of-words representation using the dictionary
doc_bow = dictionary.doc2bow(document)

# Get the topic distribution for the document
topic_distribution = lda_model.get_document_topics(doc_bow)

In [17]:
# List every topic with its 10 highest-weighted words, weights shown as percentages
topics = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)
for topic_idx, word_probs in topics:
    # Topics are numbered from 1 for display
    print(f"Topic {topic_idx + 1}:")
    for word, prob in word_probs:
        print(f"{word}: {prob * 100:.2f}%")
    print()

Topic 1:
new: 20.10%
court: 4.42%
live: 4.24%
charged: 3.53%
hotel: 3.18%
residents: 2.65%
children: 2.37%
industry: 2.29%
black: 2.20%
zealand: 1.98%

Topic 2:
queensland: 13.77%
death: 5.97%
scott: 5.22%
get: 3.26%
speaks: 3.06%
president: 2.96%
alleged: 2.87%
act: 2.72%
white: 1.85%
takes: 1.73%

Topic 3:
wa: 11.67%
house: 4.40%
north: 3.96%
australians: 3.58%
attack: 3.28%
child: 3.13%
war: 2.70%
future: 2.14%
bushfires: 2.04%
protesters: 2.03%

Topic 4:
us: 16.86%
program: 3.30%
young: 3.28%
warns: 2.30%
food: 2.12%
wednesday: 1.88%
makes: 1.80%
worker: 1.73%
plane: 1.60%
festival: 1.55%

Topic 5:
sydney: 10.46%
restrictions: 6.31%
year: 5.07%
dies: 2.75%
call: 2.51%
wins: 2.44%
darwin: 2.40%
old: 2.37%
emergency: 2.34%
support: 2.16%

Topic 6:
australia: 19.15%
government: 8.48%
election: 6.83%
minister: 4.45%
set: 2.17%
second: 2.04%
deal: 1.74%
numbers: 1.70%
trade: 1.68%
make: 1.60%

Topic 7:
police: 11.62%
man: 7.55%
murder: 3.54%
trial: 3.02%
test: 2.64%
island: 2.41%
christ

In [18]:
# Show the document's weight for each topic as a percentage (topics numbered from 1)
print("Topic Distribution for the Document:")
for topic_id, weight in topic_distribution:
    print(f"Topic {topic_id + 1}: {weight * 100:.2f}%")

Topic Distribution for the Document:
Topic 1: 1.00%
Topic 2: 1.00%
Topic 3: 1.00%
Topic 4: 1.00%
Topic 5: 1.00%
Topic 6: 1.00%
Topic 7: 1.00%
Topic 8: 1.00%
Topic 9: 1.00%
Topic 10: 1.00%
Topic 11: 1.00%
Topic 12: 1.00%
Topic 13: 21.00%
Topic 14: 1.00%
Topic 15: 1.00%
Topic 16: 1.00%
Topic 17: 60.99%
Topic 18: 1.00%
Topic 19: 1.00%
Topic 20: 1.00%
