In [None]:
import os
import requests
import nltk
import string
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK 'punkt_tab' resource up front so tokenization below can run.
nltk.download('punkt_tab')

# Pages to scrape, grouped by category name.
# Only a couple of categories/sources are listed to keep the runtime short;
# add more entries here as needed.
categories = {
    "Technology": [
        "https://www.theverge.com/tech",
        "https://techcrunch.com/",
        "https://www.wired.com/category/tech/",
    ],
    "Science": [
        "https://www.sciencenews.org/",
        "https://www.scientificamerican.com/",
        "https://www.nature.com/",
    ],
}

# Output folder for the per-category text files; no error if it already exists.
os.makedirs("scraped_texts", exist_ok=True)

# Punctuation to strip: everything in string.punctuation except the sentence
# terminators, built once so each call is a single translate() pass.
_STRIP_PUNCT_TABLE = str.maketrans(
    '', '', ''.join(ch for ch in string.punctuation if ch not in '.!?')
)


def clean_text(text):
    """Normalize scraped text into a single line of space-separated tokens.

    Newlines and runs of whitespace collapse to single spaces, and ASCII
    punctuation is removed -- except the sentence terminators ``.``, ``!``
    and ``?``. Those are kept on purpose: the text is later split with
    ``sent_tokenize`` (see ``summarize_text``), and with every terminator
    stripped the whole document would be treated as one sentence.

    Args:
        text: Raw text to normalize.

    Returns:
        The cleaned text with no leading/trailing whitespace.
    """
    text = text.translate(_STRIP_PUNCT_TABLE)
    # split() with no arguments splits on any whitespace (including newlines)
    # and drops empty pieces, so this also flattens line breaks.
    return ' '.join(text.split())

def scrape_articles(url, timeout=10):
    """Download a page and return the cleaned text of all its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive site would block the run forever.

    Returns:
        The paragraph text joined with spaces and passed through
        ``clean_text``, or an empty string if the page could not be scraped
        (a message describing the failure is printed in that case).
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
        # Treat HTTP error statuses (4xx/5xx) as failures instead of
        # scraping the error page as if it were article content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract all paragraphs
        content = ' '.join(p.get_text() for p in soup.find_all("p"))
        return clean_text(content)
    except Exception as e:
        # Deliberately broad: scraping is best-effort, and one bad site
        # must not abort the remaining URLs.
        print(f"Failed to scrape {url}: {e}")
        return ""

def save_text(category, text):
    """Write ``text`` to ``scraped_texts/<category>.txt`` as UTF-8.

    An existing file for the same category is overwritten. The
    ``scraped_texts`` directory must already exist.
    """
    target_path = f"scraped_texts/{category}.txt"
    with open(target_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)

def summarize_text(text, num_sentences=5):
    """Return an extractive summary made of the highest-scoring sentences.

    Each sentence is turned into a TF-IDF vector and scored by the sum of
    its cosine similarities to every sentence in the text (itself included).
    The ``num_sentences`` best-scoring sentences are returned in the order
    they appear in ``text``.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The summary as a single space-joined string. ``text`` is returned
        unchanged when it has ``num_sentences`` sentences or fewer, and an
        empty string is returned when ``num_sentences`` is not positive.
    """
    # A slice like [-0:] would select *every* sentence, and negative counts
    # would select an arbitrary tail, so handle non-positive requests here.
    if num_sentences <= 0:
        return ""

    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text  # Return full text if it's already short

    # Vectorize sentences using TF-IDF
    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences)

    # Compute similarity matrix and one aggregate score per sentence
    similarity_matrix = cosine_similarity(sentence_vectors, sentence_vectors)
    scores = similarity_matrix.sum(axis=1)

    # Pick the top-ranked sentences, then restore document order so the
    # summary reads in the same sequence as the source text.
    top_indices = sorted(np.argsort(scores)[-num_sentences:])
    return ' '.join(sentences[i] for i in top_indices)

# For every category: scrape each source, save the combined text to disk,
# then print a short summary of it.
for category, urls in categories.items():
    print(f"Scraping category: {category}...")

    # Collect each page's text followed by a blank-line separator; a page
    # that failed to scrape contributes only the separator.
    chunks = []
    for url in urls:
        text = scrape_articles(url)
        chunks.append(text)
        chunks.append("\n\n")
    all_text = "".join(chunks)
    save_text(category, all_text)

    # Summarize
    summary = summarize_text(all_text)
    print(f"Summary for {category}:\n", summary, "\n")
