In [44]:
from transformers import CLIPProcessor, CLIPModel
import torch
import sqlite3
import time
from enum import Enum
from typing import List, Dict, Optional, Tuple
import logging
import mwclient
import hashlib
import urllib.parse

In [65]:
# Database handling
class EmbeddingsDatabase:
    def __init__(self, db_path: str = "wiki_embeddings.db"):
        self.db_path = db_path
        self.setup_database()
    
    def setup_database(self):
        """Create database tables."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS embeddings (
                    article_id TEXT PRIMARY KEY,
                    title TEXT,
                    url TEXT,
                    embedding BLOB,
                    processed_date TEXT,
                    hash TEXT
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS failed_articles (
                    article_id TEXT PRIMARY KEY,
                    title TEXT,
                    error_message TEXT,
                    attempt_date TEXT
                )
            """)
    
    def store_embedding(self, article_id: str, title: str, url: str, embedding: bytes, hash_val: str):
        """Store a single embedding in the database."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(
                """
                INSERT OR REPLACE INTO embeddings 
                (article_id, title, url, embedding, processed_date, hash)
                VALUES (?, ?, ?, ?, datetime('now'), ?)
                """,
                (article_id, title, url, embedding, hash_val)
            )
    
    def store_failed_article(self, article_id: str, title: str, error: str):
        """Store information about failed article processing."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(
                """
                INSERT OR REPLACE INTO failed_articles 
                (article_id, title, error_message, attempt_date)
                VALUES (?, ?, ?, datetime('now'))
                """,
                (article_id, title, error)
            )

# Wikipedia API handling
class WikipediaArticleFetcher:
    def __init__(self, user_agent: str):
        self.site = mwclient.Site('en.wikipedia.org', clients_useragent=user_agent)
        self.logger = logging.getLogger(__name__)
    
    def get_vital_articles(self, level: int = 4) -> List[Dict]:
        """Get vital articles at specified level."""
        if not isinstance(level, int) or level < 1 or level > 5:
            raise ValueError("Level must be an integer between 1 and 5")
        
        article_titles = set()
        
        def process_article_lines(text: str, marker: str = '*') -> None:
            for line in text.split('\n'):
                if line.startswith(marker) and '[[' in line and ']]' in line:
                    start = line.find('[[') + 2
                    end = line.find(']]')
                    article_title = line[start:end]
                    if '|' in article_title:
                        article_title = article_title.split('|')[0]
                    article_titles.add(article_title)
        
        if level in [4, 5]:
            main_page = self.site.pages[f'Wikipedia:Vital articles/Level/{level}']
            vital_article_categories = []
            
            for line in main_page.text().split('\n'):
                if line.startswith('|') and '[[' in line and ']]' in line:
                    start = line.find('[[') + 2
                    end = line.find(']]')
                    category = line[start:end].split('|')[0]
                    vital_article_categories.append(category)
            
            for category in vital_article_categories:
                category_page = self.site.pages[category]
                process_article_lines(category_page.text(), marker='#')
        else:
            page = self.site.pages[f'Wikipedia:Vital articles/Level/{level}']
            process_article_lines(page.text())

        return self._convert_titles_to_articles(sorted(article_titles))
    
    def get_good_articles(self) -> List[Dict]:
        """
        Get all articles that have been marked as good articles from
        the page https://en.wikipedia.org/wiki/Wikipedia:Good_articles/all
        """

        good_article_base = 'Wikipedia:Good_articles'
        # start by getting all categories
        categories = []
        counter = 0
        page = self.site.pages[good_article_base + '/all']
        for line in page.text().split('\n'):
                #print(line)
                if line.startswith('*') and '[[' in line and ']]' in line:
                    start = line.find('[[') + 2
                    end = line.find(']]')
                    category = line[start:end].split('|')[1]
                    categories.append(category)

        # get all articles in the categories
        articles = []
        for category in categories:
            page = self.site.pages[f'{good_article_base}/{category}']
            for line in page.text().split('\n'):
                if '[[' in line and ']]' in line and '|' not in line:
                    start = line.find('[[') + 2
                    end = line.find(']]')
                    article = line[start:end]
                    articles.append(article)
                    counter += 1
                    if counter % 500 == 0:
                        print(f"Grabbed {counter} good article titles")
        
        return self._convert_titles_to_articles(sorted(articles))
    
    def get_a_class_articles(self) -> List[Dict]:
        """Get all A-Class articles."""
        articles = []
        counter = 0
        for category in self.site.categories['A-Class_articles'].members():
            if category.length > 0:
                for member in category.members():
                    title = member.name.split(':')[1]
                    articles.append(title)
                    counter += 1
                    if counter % 500 == 0:
                        print(f"Grabbed {counter} A-Class article titles")
        return self._convert_titles_to_articles(sorted(articles))

    def get_articles_from_titles(self, titles: List[str]) -> List[Dict]:
        """Get articles from a list of titles."""
        return self._convert_titles_to_articles(titles)
    
    def _convert_titles_to_articles(self, titles: List[str]) -> List[Dict]:
        """Convert titles to article dictionaries with metadata."""
        articles = []
        for i in range(0, len(titles), 50):
            batch_titles = titles[i:i + 50]
            result = self.site.api('query', 
                                format='json',
                                titles='|'.join(batch_titles),
                                redirects=1)
            
            if 'query' in result and 'pages' in result['query']:
                for page_id, page_data in result['query']['pages'].items():
                    if 'missing' not in page_data and 'invalid' not in page_data:
                        title = page_data['title']
                        title_encoded = urllib.parse.quote(title.replace(' ', '_'))
                        articles.append({
                            'id': str(page_id),
                            'title': title,
                            'url': f"https://en.wikipedia.org/wiki/{title_encoded}"
                        })
            time.sleep(0.01)
        return articles
    
    def get_article_content(self, articles: List[Dict]) -> List[Tuple[Dict, str, str]]:
        """Fetch content for a batch of articles."""
        results = []
        for i in range(0, len(articles), 50):
            chunk = articles[i:i + 50]
            titles = '|'.join(article['title'] for article in chunk)
            
            params = {
                'format': 'json',
                'prop': 'extracts',
                'exintro': True,
                'explaintext': True,
                'redirects': 1,
                'titles': titles
            }
            
            result = self.site.api('query', **params)
            
            if 'query' in result and 'pages' in result['query']:
                for page_id, page_data in result['query']['pages'].items():
                    matching_article = next(
                        (a for a in chunk if str(page_data.get('pageid')) == a['id']),
                        None
                    )
                    
                    if matching_article:
                        content = page_data.get('extract', '').strip()
                        if not content:
                            content = page_data.get('title', matching_article['title'])
                        
                        if content:
                            hash_val = hashlib.sha256(content.encode()).hexdigest()
                            results.append((matching_article, content, hash_val))
            
            time.sleep(0.01)
        
        return results

# Embedding generation
class EmbeddingGenerator:
    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        self.device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
        self.model = CLIPModel.from_pretrained(model_name).to(self.device)
        self.processor = CLIPProcessor.from_pretrained(model_name)
    
    def generate_embeddings(self, texts: List[str]) -> torch.Tensor:
        """Generate embeddings for a list of texts."""
        with torch.no_grad():
            inputs = self.processor(
                text=texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=77
            )
            text_features = self.model.get_text_features(
                **{k: v.to(self.device) for k, v in inputs.items()}
            )
            return text_features / text_features.norm(dim=-1, keepdim=True)


class ArticleSource(Enum):
    VITAL = "vital"
    GOOD = "good" 
    CUSTOM = "custom"
    A_CLASS = "a_class"

# Main orchestrator
class WikiEmbeddingsOrchestrator:
    def __init__(self, 
                 model_name: str = "openai/clip-vit-base-patch32",
                 db_path: str = "wiki_embeddings.db",
                 batch_size: int = 32,
                 user_agent: str = 'clip_embedding_wikipedia/0.0 (https://calebkruse.com/; caleb.krs@gmail.com)'
    ):
        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('wiki_embeddings.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        
        # Initialize components
        self.batch_size = batch_size
        self.db = EmbeddingsDatabase(db_path)
        self.wiki_fetcher = WikipediaArticleFetcher(user_agent)
        self.embedding_generator = EmbeddingGenerator(model_name)

    def _get_articles_from_source(self, 
                                source: ArticleSource,
                                vital_level: Optional[int] = None,
                                custom_titles: Optional[List[str]] = None) -> List[Dict]:
        """Get articles based on the specified source type."""
        if source == ArticleSource.VITAL:
            if not vital_level or not (1 <= vital_level <= 5):
                raise ValueError("Vital articles require a level between 1 and 5")
            return self.wiki_fetcher.get_vital_articles(level=vital_level)
        
        elif source == ArticleSource.GOOD:
            return self.wiki_fetcher.get_good_articles()
        
        elif source == ArticleSource.CUSTOM:
            if not custom_titles:
                raise ValueError("Custom article source requires a list of titles")
            return self.wiki_fetcher.get_articles_from_titles(custom_titles)
        
        elif source == ArticleSource.A_CLASS:
            return self.wiki_fetcher.get_a_class_articles()
        
        raise ValueError(f"Unknown article source: {source}")

    def process_batch(self, articles: List[Dict]):
        """Process a batch of articles."""
        try:
            # Fetch article content
            article_data = self.wiki_fetcher.get_article_content(articles)
            
            if not article_data:
                self.logger.warning("No valid content found in batch")
                return
            
            # Split data
            valid_articles, contents, hashes = zip(*article_data)
            
            # Generate embeddings
            embeddings = self.embedding_generator.generate_embeddings(contents)
            
            # Store results
            for article, embedding, hash_val in zip(valid_articles, embeddings, hashes):
                self.db.store_embedding(
                    article['id'],
                    article['title'],
                    article['url'],
                    embedding.cpu().numpy().tobytes(),
                    hash_val
                )
                
        except Exception as e:
            self.logger.error(f"Batch processing error: {str(e)}")
            for article in articles:
                self.db.store_failed_article(article['id'], article['title'], str(e))

    def generate_embeddings(self,
                          source: ArticleSource,
                          vital_level: Optional[int] = None,
                          custom_titles: Optional[List[str]] = None):
        """Generate embeddings for articles from the specified source."""
        try:
            articles_to_process = self._get_articles_from_source(
                source=source,
                vital_level=vital_level,
                custom_titles=custom_titles
            )
            
            if not articles_to_process:
                self.logger.error("No articles found")
                return
            
            # Process in batches
            total_articles = len(articles_to_process)
            self.logger.info(f"Found {total_articles} articles to process")
            
            for i in range(0, total_articles, self.batch_size):
                batch = articles_to_process[i:i + self.batch_size]
                self.process_batch(batch)
                self.logger.info(f"Processed {min(i + self.batch_size, total_articles)}/{total_articles} articles")
                
        except KeyboardInterrupt:
            self.logger.info("Interrupted by user. Saving progress...")
        except Exception as e:
            self.logger.error(f"Fatal error: {str(e)}")

In [74]:
def get_a_class_articles():
    site = mwclient.Site('en.wikipedia.org', clients_useragent='clip_embedding_wikipedia/0.0 (https://calebkruse.com/; caleb.krs@gmail.com)')
    articles = []
    counter = 0
    for category in site.categories['A-Class_articles'].members():
        #print(category.name)
        if category.length > 0:
            for member in category.members():
                title = member.name.split(':')[1]
                articles.append(title)
                counter += 1
                if counter % 500 == 0:
                    print(f"Grabbed {counter} A-Class article titles")
    return articles

def get_good_class_articles():
    site = mwclient.Site('en.wikipedia.org', clients_useragent='clip_embedding_wikipedia/0.0 (https://calebkruse.com/; caleb.krs@gmail.com)')
    articles = []
    counter = 0
    for category in site.categories['GA-Class_articles'].members():
        if category.length > 0:
            for member in category.members():
                try:
                    title = member.name.split(':')[1]
                    articles.append(title)
                    counter += 1
                    if counter % 500 == 0:
                        print(f"Grabbed {counter} good article titles")
                except Exception as e:
                    print(f"Error processing {member.name}: {e}")
    return articles

In [75]:
good_articles = get_good_class_articles()

Grabbed 500 good article titles
Grabbed 1000 good article titles
Grabbed 1500 good article titles
Grabbed 2000 good article titles
Grabbed 2500 good article titles
Grabbed 3000 good article titles
Grabbed 3500 good article titles
Grabbed 4000 good article titles
Grabbed 4500 good article titles
Grabbed 5000 good article titles
Grabbed 5500 good article titles
Grabbed 6000 good article titles
Grabbed 6500 good article titles
Grabbed 7000 good article titles
Grabbed 7500 good article titles
Grabbed 8000 good article titles
Grabbed 8500 good article titles
Grabbed 9000 good article titles
Grabbed 9500 good article titles
Grabbed 10000 good article titles
Grabbed 10500 good article titles
Grabbed 11000 good article titles
Grabbed 11500 good article titles
Grabbed 12000 good article titles
Grabbed 12500 good article titles
Grabbed 13000 good article titles
Grabbed 13500 good article titles
Grabbed 14000 good article titles
Grabbed 14500 good article titles
Grabbed 15000 good article titles


In [76]:
orchestrator = WikiEmbeddingsOrchestrator()

# For vital articles
#orchestrator.generate_embeddings(source=ArticleSource.VITAL, vital_level=2)

# For good articles
#orchestrator.generate_embeddings(source=ArticleSource.GOOD)

# For A-Class articles
#orchestrator.generate_embeddings(source=ArticleSource.A_CLASS)

# For custom article list

#titles = ["Python (programming language)", "Machine learning", "Artificial intelligence"]
orchestrator.generate_embeddings(source=ArticleSource.CUSTOM, custom_titles=good_articles)

2025-02-02 10:36:32,913 - INFO - Found 191243 articles to process
2025-02-02 10:36:33,645 - INFO - Processed 32/191243 articles
2025-02-02 10:36:33,993 - INFO - Processed 64/191243 articles
2025-02-02 10:36:34,487 - INFO - Processed 96/191243 articles
2025-02-02 10:36:34,955 - INFO - Processed 128/191243 articles
2025-02-02 10:36:35,464 - INFO - Processed 160/191243 articles
2025-02-02 10:36:35,944 - INFO - Processed 192/191243 articles
2025-02-02 10:36:37,325 - INFO - Processed 224/191243 articles
2025-02-02 10:36:37,735 - INFO - Processed 256/191243 articles
2025-02-02 10:36:38,143 - INFO - Processed 288/191243 articles
2025-02-02 10:36:38,534 - INFO - Processed 320/191243 articles
2025-02-02 10:36:38,967 - INFO - Processed 352/191243 articles
2025-02-02 10:36:39,363 - INFO - Processed 384/191243 articles
2025-02-02 10:36:39,781 - INFO - Processed 416/191243 articles
2025-02-02 10:36:40,221 - INFO - Processed 448/191243 articles
2025-02-02 10:36:40,706 - INFO - Processed 480/191243 a

In [5]:
# Basic initialization
orchestrator = WikiEmbeddingsOrchestrator(
    model_name="openai/clip-vit-base-patch32",
    db_path="wiki_embeddings.db",
    batch_size=128
)

# Generate embeddings for vital articles (level 4)
orchestrator.generate_embeddings(vital_level=2)

2025-02-02 09:02:05,829 - INFO - Found 101 articles to process
2025-02-02 09:02:06,926 - INFO - Processed 101/101 articles


In [None]:
articles = set()

for year in range(2016, 2024):
    print(year)
    # get all the months with 31 days
    for month in [1,3,5,7,8,10,12]:
        top_articles = get_top_articles(1000, month, year)
        articles.update(top_articles['article'].tolist())
articles = list(articles)

In [None]:
generator = WikiEmbeddingsGenerator()
generator.generate_embeddings(articles=articles)

In [None]:
# Read and search
reader = WikiEmbeddingsReader()
similar = reader.get_similar_articles("computer mouse")
for article in similar:
    print(f"{article['title']}: {article['similarity']:.3f}")

In [None]:
# Initialize the reader
reader = ImageEmbeddingsReader()
image_url = "/Users/clkruse/Downloads/ginger_test.png"
# Find similar articles for an image
similar_articles = reader.get_similar_articles_by_image(image_url, limit=5)
similar_articles

In [28]:
import mwviews.api
import pandas as pd

def get_top_articles(limit=100, month=12, year=2024):
    # Initialize the PageviewsClient
    user_agent = 'clip_embedding_wikipedia/0.0 (https://calebkruse.com/; caleb.krs@gmail.com)'
    client = mwviews.api.PageviewsClient(user_agent=user_agent)
    
    # Get daily views for all articles
    views = client.top_articles('en.wikipedia', limit=limit, month=month,year=year)
    
    return pd.DataFrame(views)

In [None]:
articles = set()

for year in range(2016, 2024):
    print(year)
    # get all the months with 31 days
    for month in [1,3,5,7,8,10,12]:
        top_articles = get_top_articles(1000, month, year)
        articles.update(top_articles['article'].tolist())
articles = list(articles)
articles[:10]

In [None]:
#get embeddings for all the articles
embedder = WikiEmbeddingsGenerator()
embedder.generate_embeddings(articles=articles)

# New version

In [49]:
site = mwclient.Site('en.wikipedia.org', clients_useragent='clip_embedding_wikipedia/0.0 (https://calebkruse.com/; caleb.krs@gmail.com)')

good_article_base = 'Category:Articles by quality'
# start by getting all categories
categories = []
page = site.pages[good_article_base]
for line in page.text().split('\n'):
        print(line)
        

{{Wikipedia category}}
Articles assessed as part of the [[Wikipedia:Version 1.0 Editorial Team|Version 1.0 Editorial Team]]'s [[WP:WVWP|Work via WikiProjects]] scheme. 
A list of participating projects along with their statistics can be found in the [[Wikipedia:Version 1.0 Editorial Team/Index of subjects|Index of subjects]]. Article assessments by WikiProject can be found in [[:Category:Wikipedia 1.0 assessments]].
<!-- <categorytree>Articles by quality</categorytree> -->
{{CatAutoTOC}}

[[Category:Wikipedia 1.0 assessments|*]]


In [67]:
articles = []
counter = 0
for category in site.categories['A-Class_articles'].members():
    #print(category.name)
    if category.length > 0:
        for member in category.members():
            title = member.name.split(':')[1]
            articles.append(title)
            counter += 1
            if counter % 500 == 0:
                print(f"Grabbed {counter} A-Class article titles")

Grabbed 500 A-Class article titles
Grabbed 1000 A-Class article titles
Grabbed 1500 A-Class article titles
Grabbed 2000 A-Class article titles
Grabbed 2500 A-Class article titles
Grabbed 3000 A-Class article titles
Grabbed 3500 A-Class article titles
Grabbed 4000 A-Class article titles
Grabbed 4500 A-Class article titles
Grabbed 5000 A-Class article titles
Grabbed 5500 A-Class article titles


In [68]:
articles

['Ontario Highway 55',
 'Ontario Highway 404',
 'Ontario Highway 405',
 'Ontario Highway 427',
 'D21 road (Croatia)',
 'Ontario Highway 55',
 'Ontario Highway 404',
 'Ontario Highway 405',
 'Ontario Highway 427',
 'Annika Sörenstam',
 'Bessas (magister militum)',
 'Minden Blake',
 'Oswald Boelcke',
 'Alan Charlesworth',
 'Wilfred Clouston',
 'Hippolyte De La Rue',
 'Alan Deere',
 'Mato Dukovac',
 'Rudolf Frank',
 'Yuri Gagarin',
 'Gordon Gollob',
 'Hermann Graf',
 'Erich Hartmann',
 'Michael Herrick',
 'George Kenney',
 'Albert Kesselring',
 'Ernest J. King',
 'Egmont Prinz zur Lippe-Weißenfeld',
 'Frank Lukis',
 'Günther Lützow',
 'George Mackinolty',
 'Hans-Joachim Marseille',
 'Egon Mayer',
 'William McAloney',
 'Walter Nowotny',
 'Max-Hellmuth Ostermann',
 'Keith Park',
 'Heinrich Prinz zu Sayn-Wittgenstein',
 'Heinz-Wolfgang Schnaufer',
 'Elliot See',
 'Leonard Trent',
 'Werner Voss',
 'Hans Waldmann (fighter pilot)',
 'James Allen Ward',
 'Theodor Weissenberger',
 'Ennis Whitehea

In [42]:
site = mwclient.Site('en.wikipedia.org', clients_useragent='clip_embedding_wikipedia/0.0 (https://calebkruse.com/; caleb.krs@gmail.com)')

good_article_base = 'Category:A-Class_articles'
# start by getting all categories
categories = []
page = site.pages[good_article_base + '/all']
for line in page.text().split('\n'):
        #print(line)
        if line.startswith('*') and '[[' in line and ']]' in line:
            start = line.find('[[') + 2
            end = line.find(']]')
            category = line[start:end].split('|')[1]
            categories.append(category)

# get all articles in the categories
articles = []
for category in categories:
    page = site.pages[f'{good_article_base}/{category}']
    for line in page.text().split('\n'):
        if '[[' in line and ']]' in line and '|' not in line:
            start = line.find('[[') + 2
            end = line.find(']]')
            article = line[start:end]
            articles.append(article)
    

26731

In [33]:
categories

['Agriculture, food, and drink',
 'Art and architecture',
 'Engineering and technology',
 'Geography and places',
 'History',
 'Language and literature',
 'Mathematics',
 'Media and drama',
 'Music',
 'Natural sciences',
 'Philosophy and religion',
 'Social sciences and society',
 'Sports and recreation',
 'Video games',
 'Warfare']

In [39]:

articles = []
page = site.pages['Wikipedia:Good articles/Agriculture, food and drink']
for line in page.text().split('\n'):
    #print(line)
    if '[[' in line and ']]' in line and '|' not in line:
        start = line.find('[[') + 2
        end = line.find(']]')
        article = line[start:end]
        articles.append(article)
        
print(articles)

['Agriculture', 'Agriculture in Turkey', 'Agriculture in Wales', 'Animal husbandry', 'Apiary Laboratory', 'Aquaculture in the Philippines', 'Arab Agricultural Revolution', 'Avondale Agricultural Research Station', 'Belted Galloway', 'Biological pest control', 'Bitter orange', 'Boreray sheep', 'Briarcliff Farms', 'Cattle', 'Chicken', 'Citrus', 'Cocoa production in São Tomé and Príncipe', 'Columbian exchange', 'Companion planting', 'Coon hunting', 'Cotton production in Pakistan', 'Cowpea', 'Cow tipping', 'Crop diversity', 'Dairy in India', 'Domestic duck', 'Domestic rabbit', 'Domestication', 'Domestication of the sheep', 'Fisheries in the Philippines', 'Galloway cattle', 'Goat', 'Goat tower', 'Hemp in Kentucky', 'Highland cattle', 'History of agriculture', 'Lemon', 'Locust Plague of 1874', 'Mandarin orange', 'North American Piedmontese', 'Oat', 'Pekarangan', 'Permaculture', 'Pest control', 'Pig', 'Polled Dorset', 'Polyculture', 'Poultry', 'Quince', 'Rice-fish system', 'Rye', 'Sesame', 'S

In [31]:
import mwclient
import re
from time import sleep

def get_good_articles():
    # Connect to English Wikipedia
    site = mwclient.Site('en.wikipedia.org')
    
    # Dictionary to store articles by category
    articles_by_category = {}
    
    # List of all category pages that contain good articles
    category_pages = [
        'Wikipedia:Good articles/Agriculture, food and drink',
        'Wikipedia:Good articles/Art and architecture',
        'Wikipedia:Good articles/Engineering and technology',
        'Wikipedia:Good articles/Geography and places',
        'Wikipedia:Good articles/History',
        'Wikipedia:Good articles/Language and literature',
        'Wikipedia:Good articles/Mathematics, science, and technology',
        'Wikipedia:Good articles/Media and drama',
        'Wikipedia:Good articles/Music',
        'Wikipedia:Good articles/Natural sciences',
        'Wikipedia:Good articles/Philosophy and religion',
        'Wikipedia:Good articles/Social sciences and society',
        'Wikipedia:Good articles/Sports and recreation',
        'Wikipedia:Good articles/Video games',
        'Wikipedia:Good articles/Warfare'
    ]
    
    def extract_articles_from_text(text):
        # Regular expression to find article links, excluding categories and templates
        # Look for lines that start with * or # followed by a link
        article_pattern = r'[\*#]\s*\[\[([^]|]+?)(?:\|[^]]+)?\]\]'
        matches = re.finditer(article_pattern, text)
        
        articles = []
        for match in matches:
            title = match.group(1)
            # Skip Wikipedia: namespace and other special pages
            if not any(title.startswith(prefix) for prefix in ['Wikipedia:', 'Template:', 'Category:', 'Portal:']):
                articles.append(title)
        return articles
    
    # Process each category page
    for category_page_title in category_pages:
        try:
            print(f"Processing {category_page_title}...")
            page = site.pages[category_page_title]
            text = page.text()
            print(text)
            
            # Extract articles from this category
            category_articles = extract_articles_from_text(text)
            
            # Store articles under the category name (removing the prefix)
            category_name = category_page_title.replace('Wikipedia:Good articles/', '')
            articles_by_category[category_name] = category_articles
            
            # Be nice to Wikipedia's servers
            sleep(1)
            
        except Exception as e:
            print(f"Error processing {category_page_title}: {e}")
    
    return articles_by_category

def main():
    try:
        articles_by_category = get_good_articles()
        print(articles_by_category)
        
        # Print summary and save results
        total_articles = sum(len(articles) for articles in articles_by_category.values())
        print(f"\nFound {total_articles} good articles across {len(articles_by_category)} categories:")
        
    except Exception as e:
        print(f"An error occurred: {e}")


main()

Processing Wikipedia:Good articles/Agriculture, food and drink...
<noinclude>
{{Wikipedia:Good article nominations/Tab header}}
<div class="mw-collapsible mw-collapsed">
{{Wikipedia:Good articles/Summary|shortcuts={{shortcut|WP:GA/AF}}}}
</div></noinclude><templatestyles src="Wikipedia:Good articles/styles.css"/>
__NOTOC__
<div class="wp-ga-topic">
<noinclude>
==GA==
{{Wikipedia:Good articles/GA topic header}}</noinclude>
==Agriculture, food, and drink==
<includeonly><div class="wp-ga-topic-back">[[#Contents|back]]</div></includeonly>
<!--Start Agriculture, food, and drink level 3 GA subtopic-->
<div class="mw-collapsible">

===[[File:Tango icon nature.svg|22px|left|link=|alt=]] Agriculture, food, and drink===
<div class="mw-collapsible-content">

=====Agriculture and farming=====
{{#invoke:Good Articles|subsection|
[[Agriculture]]
[[Agriculture in Turkey]]
[[Agriculture in Wales]]
[[Animal husbandry]]
[[Apiary Laboratory]]
[[Aquaculture in the Philippines]]
[[Arab Agricultural Revolut

KeyboardInterrupt: 

In [None]:
# Access Wikipedia fetcher directly
wiki_fetcher = orchestrator.wiki_fetcher
vital_articles = wiki_fetcher.get_vital_articles(level=3)

vital_articles[:10]