In [5]:
import time
import requests
import nest_asyncio
import asyncio
from bs4 import BeautifulSoup
from tqdm import tqdm
import re

USER_KEYWORDS = []  # User can specify custom keywords here
AI_KEYWORDS = ["Gen-AI", "Generative AI", "Artificial Intelligence", "Machine Learning"] + USER_KEYWORDS

# Manually defined stopwords to avoid NLTK download issues
STOPWORDS = {"the", "and", "is", "in", "to", "of", "for", "on", "with", "at", "a", "an"}

nest_asyncio.apply()

def fetch_page(url):
    """Fetches a webpage using requests."""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text  # Return HTML content
        else:
            print(f"⚠️ Failed to fetch {url}. Status code: {response.status_code}")
            return None
    except requests.RequestException as e:
        print(f"❌ Error fetching {url}: {e}")
        return None

In [6]:
def extract_links(html, base_url):
    """Extracts all valid links from a webpage."""
    soup = BeautifulSoup(html, "html.parser")
    links = set()
    
    for a_tag in soup.find_all("a", href=True):
        full_url = requests.compat.urljoin(base_url, a_tag["href"])  # Convert relative URLs to absolute
        links.add(full_url)
    
    return links


In [7]:
def filter_illinois_links(links):
    """Keeps only links that belong to illinois.edu and its subdomains."""
    """Keeps only links that belong to illinois.edu and all its subdomains."""
    """Keeps only links that belong to illinois.edu domain."""
    return {link for link in links if requests.utils.urlparse(link).netloc.endswith(".illinois.edu") or requests.utils.urlparse(link).netloc == "illinois.edu"}


def contains_keywords(html, keywords, min_count=3):
    """Extracts meaningful text and checks for AI-related keywords with improved accuracy."""
    soup = BeautifulSoup(html, "html.parser")
    content_blocks = soup.find_all(["article", "section", "div", "p", "h1", "h2"], class_=True)
    text = " ".join(block.get_text() for block in content_blocks)
    
    # Clean and normalize text
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    words = text.lower().split()  # Tokenize text
    words = [word for word in words if word not in STOPWORDS]  # Remove stopwords
    
    # Count keyword occurrences
    matched_keywords = {kw: sum(1 for word in words if kw.lower() in word) for kw in keywords}
    print(f"🔍 Checking AI content on page... Found matches: {matched_keywords}")
    
    return sum(matched_keywords.values()) >= min_count


In [8]:
def crawl_illinois(start_urls, user_keywords=None, max_pages=100):
    """Crawls the Illinois website, looking for AI-related news articles."""
    visited = set()
    to_visit = set(start_urls) if isinstance(start_urls, list) else {start_urls}
    subdomains_visited = set()
    relevant_articles = []
    
    with tqdm(total=max_pages, desc="Crawling pages") as pbar:
        while to_visit and len(visited) < max_pages:
            if not to_visit:
                break
            url = to_visit.pop()
            print(f'🔍 Scanning: {url}')  # Show the webpage being scanned
            
            if url in visited:
                continue
            
            visited.add(url)
            pbar.update(1)

            html = fetch_page(url)
            if not html:
                continue

            links = extract_links(html, url)
            illinois_links = filter_illinois_links(links)
            to_visit.update(illinois_links - visited)
            
            for link in illinois_links:
                domain = requests.utils.urlparse(link).netloc
                if domain.endswith(".illinois.edu") and domain not in subdomains_visited:
                    subdomains_visited.add(domain)
                    to_visit.add(link)
                elif link not in visited:
                    to_visit.add(link)
            
            if contains_keywords(html, AI_KEYWORDS if not user_keywords else user_keywords):
                relevant_articles.append(url)
                print(f"🔹 Found AI article: {url}")

            time.sleep(1)  # Be polite, don't overload the server
    
    print("\n✅ Crawling complete!")
    return relevant_articles
# Ask user if they want to add more websites
def get_user_input(prompt):
    user_input = input(prompt).strip()
    return user_input if user_input.lower() != 'no' else None

extra_sites = get_user_input("Would you like to add more websites? Enter them separated by commas or type 'no': ")
if extra_sites:
    user_defined_sites = [site.strip() for site in extra_sites.split(',')]
else:
    user_defined_sites = ["https://illinois.edu"]

# Ask user if they want to add more keywords
extra_keywords = get_user_input("Would you like to add more keywords? Enter them separated by commas or type 'no': ")
if extra_keywords:
    user_defined_keywords = [kw.strip() for kw in extra_keywords.split(',')]
else:
    user_defined_keywords = ["Gen-AI", "Generative AI", "Artificial Intelligence", "Machine Learning", "AI"]

# Start crawling
user_defined_sites = ["https://illinois.edu"]  # Users can specify multiple sites here
user_defined_keywords = ["Deep Learning", "Neural Networks"]  # Users can add their own keywords

news_articles = crawl_illinois(user_defined_sites, user_defined_keywords, max_pages=50)

# Print results
print("\n📌 AI-Related Articles Found:")
for article in news_articles:
    print(article)


Crawling pages:   0%|          | 0/50 [00:00<?, ?it/s]

🔍 Scanning: https://illinois.edu
🔍 Checking AI content on page... Found matches: {'Deep Learning': 0, 'Neural Networks': 0}


Crawling pages:   4%|▍         | 2/50 [00:01<00:37,  1.30it/s]

🔍 Scanning: http://library.illinois.edu
🔍 Checking AI content on page... Found matches: {'Deep Learning': 0, 'Neural Networks': 0}


Crawling pages:   6%|▌         | 3/50 [00:03<00:53,  1.15s/it]

🔍 Scanning: http://grad.illinois.edu/admissions/apply
🔍 Checking AI content on page... Found matches: {'Deep Learning': 0, 'Neural Networks': 0}


Crawling pages:   8%|▊         | 4/50 [00:04<01:00,  1.32s/it]

🔍 Scanning: https://forms.illinois.edu/sec/887006
🔍 Checking AI content on page... Found matches: {'Deep Learning': 0, 'Neural Networks': 0}


Crawling pages:  10%|█         | 5/50 [00:06<01:03,  1.42s/it]

🔍 Scanning: http://grad.illinois.edu/diversity/enduring-transfer-pathways-graduate-education-stem
🔍 Checking AI content on page... Found matches: {'Deep Learning': 0, 'Neural Networks': 0}


Crawling pages:  12%|█▏        | 6/50 [00:07<01:04,  1.46s/it]

🔍 Scanning: https://www.igb.illinois.edu/
🔍 Checking AI content on page... Found matches: {'Deep Learning': 0, 'Neural Networks': 0}


Crawling pages:  14%|█▍        | 7/50 [00:09<01:03,  1.48s/it]

🔍 Scanning: http://grad.illinois.edu/professional-development/communication-skills
🔍 Checking AI content on page... Found matches: {'Deep Learning': 0, 'Neural Networks': 0}


Crawling pages:  16%|█▌        | 8/50 [00:11<01:02,  1.50s/it]

🔍 Scanning: https://www.library.illinois.edu/borrowing/overdue-and-lost-items/
🔍 Checking AI content on page... Found matches: {'Deep Learning': 0, 'Neural Networks': 0}


Crawling pages:  16%|█▌        | 8/50 [00:12<01:06,  1.59s/it]


KeyboardInterrupt: 