In [5]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import string
import time

# Make sure the NLTK resources used by this notebook are present locally
# before anything tries to load them.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# Imported after the download so the corpus is available when it is read.
from nltk.corpus import stopwords

# English stop words as a set.
# NOTE(review): nothing in the visible code reads `stop_words`; the
# TfidfVectorizer below uses its own built-in 'english' list instead.
stop_words = set(stopwords.words('english'))

# Function to scrape homepage and extract article links
def get_article_links(url="https://www.fox5dc.com/", timeout=10):
    """Collect the unique ``/news/`` article URLs linked from a page.

    Args:
        url: Page to scan for links. Defaults to the FOX 5 DC homepage.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs, de-duplicated and kept in the order in
        which they first appear on the page. The list is empty when the
        page cannot be retrieved.
    """
    # Function-scope import so this block does not depend on edits elsewhere.
    from urllib.parse import urljoin

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Same best-effort policy as scrape_article: report and carry on.
        print("Failed to retrieve homepage.", e)
        return []
    if response.status_code != 200:
        print("Failed to retrieve homepage.")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if re.search(r'/news/.+', href):
            # urljoin leaves absolute URLs alone and correctly resolves
            # relative ones ("/news/x", "news/x", "//host/news/x").
            links.append(urljoin(url, href))

    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # repeated runs visit the articles in the same sequence.
    return list(dict.fromkeys(links))

# scrape article function
def scrape_article(url, timeout=10):
    """Download one article page and extract its headline and body text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys "url", "title" and "text", or None when the
        page could not be fetched or processed. The title is the text of
        the first <h1> ("No Title" if there is none); the text is the
        content of every <p> on the page joined by single spaces.
    """
    try:
        # Without a timeout a single stalled server would hang the crawl.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print("Failed to fetch:", url)
            return None
        soup = BeautifulSoup(response.text, 'html.parser')

        # extract the title
        title_tag = soup.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        # extract article text
        # A " " separator is required: with strip=True and the default
        # empty separator, text on either side of an inline tag (<a>, <b>,
        # ...) is glued together ("linked toexample.com"), which produces
        # bogus tokens for the later TF-IDF step.
        paragraphs = soup.find_all('p')
        text = " ".join(p.get_text(" ", strip=True) for p in paragraphs)
        return {"url": url, "title": title, "text": text}
    except Exception as e:
        # Deliberately broad: one broken page must not abort the whole
        # crawl, so the failure is reported and the article is skipped.
        print("Error scraping", url, ":", e)
        return None

# get links from homepage
def _scrape_all(links, delay=1):
    """Scrape each link in order, pausing `delay` seconds after every request.

    Links for which scrape_article returns nothing are left out of the result.
    """
    collected = []
    for link in links:
        print("Scraping:", link)
        page = scrape_article(link)
        if page:
            collected.append(page)
        # Pause between requests so the site is not hit in a tight loop.
        time.sleep(delay)
    return collected


# Discover the article URLs on the homepage.
article_links = get_article_links()
print("Found {} article links.".format(len(article_links)))

# Fetch every article, keeping only the successful results.
articles = _scrape_all(article_links)

print("Scraped {} articles.".format(len(articles)))


# One row per scraped article, with the columns "url", "title" and "text".
df = pd.DataFrame(articles)

# With no scraped articles the frame has no columns at all, and the
# df['text'] lookup below would fail with an unhelpful KeyError. Stop here
# with a message that names the actual problem instead.
if df.empty:
    raise RuntimeError("No articles were scraped; cannot extract topics.")

# extract topics using TF-IDF

# Vocabulary is capped at the 1000 highest-ranked terms, with scikit-learn's
# built-in English stop-word list removed.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
# Sparse matrix: one row per article, one column per vocabulary term.
tfidf_matrix = vectorizer.fit_transform(df['text'])
# Term for each column index of tfidf_matrix.
feature_names = vectorizer.get_feature_names_out()

def get_top_keywords(tfidf_row, top_n=5, names=None):
    """Return the highest-weighted terms of a single TF-IDF row.

    Args:
        tfidf_row: One-row sparse matrix (any object offering ``toarray()``).
        top_n: Maximum number of terms to return.
        names: Sequence giving the term for each column index. Defaults to
            the module-level ``feature_names``.

    Returns:
        A list of at most ``top_n`` terms, ordered from highest to lowest
        weight. Terms whose weight is zero are never returned, so the list
        is shorter (possibly empty) for rows with few non-zero entries.
    """
    if names is None:
        names = feature_names
    scores = tfidf_row.toarray().ravel()
    # Stable sort on the negated scores gives descending weight with ties
    # resolved by column order, so the result is deterministic.
    ranked = (-scores).argsort(kind="stable")[:top_n]
    # A zero weight means the term does not occur in this document; listing
    # it as a "topic" would be meaningless.
    return [names[i] for i in ranked if scores[i] > 0]

# Build the comma-separated keyword string for each article, row by row,
# and store the result as a new "topics" column.
topic_strings = []
for row_idx in range(tfidf_matrix.shape[0]):
    keywords = get_top_keywords(tfidf_matrix[row_idx])
    topic_strings.append(", ".join(keywords))
df['topics'] = topic_strings


# Write the full table (without the index column) to disk.
output_csv = "fox5_articles.csv"
df.to_csv(output_csv, index=False)

[nltk_data] Downloading package punkt to /home/mpritch1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mpritch1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Found 60 article links.
Scraping: https://www.fox5dc.com/news/61-year-old-good-samaritan-shot-killed-off-duty-fbi-agent-stafford-county-police
Scraping: https://www.fox5dc.com/news/nih-research-cuts-impact-medical-advancements
Scraping: https://www.fox5dc.com/news/dmv-leaders-share-resources-federal-job-cuts-continue
Scraping: https://www.fox5dc.com/news/montgomery-county-community-pushing-back-plan-open-rehab-facility-near-elementary-school
Scraping: https://www.fox5dc.com/news/elon-musk-postal-service-amtrak-privatized
Scraping: https://www.fox5dc.com/news/new-dolly-parton-song-carl-dean-tribute
Scraping: https://www.fox5dc.com/news/spacex-starship-explodes-during-8th-flight-test-video
Scraping: https://www.fox5dc.com/news/ceos-tariff-price-hikes-full-list
Scraping: https://www.fox5dc.com/news/2-dozen-displaced-montgomery-village-townhome-fire
Scraping: https://www.fox5dc.com/news/trump-sign-executive-order-take-aim-public-service-loan-forgiveness
Scraping: https://www.fox5dc.com/new