In [16]:
!pip install mwclient



In [17]:
import mwclient
import time


In [18]:
!pip install peewee




In [19]:
from peewee import SqliteDatabase, Model, IntegerField, CharField, ForeignKeyField, DateTimeField
from datetime import datetime

# Shared SQLite database handle used by every model below; the path is relative
# to the notebook's working directory.
# NOTE(review): no pragmas are passed here. SQLite enforces foreign keys
# (including the ON DELETE CASCADE declared on the models) only when the
# foreign_keys pragma is enabled — confirm whether that is required.
db = SqliteDatabase('wikipedia.db')

class BaseModel(Model):
    """Common base class that binds the models in this notebook to the shared ``db``."""
    class Meta:
        # Every model deriving from BaseModel uses this database.
        database = db

class Page(BaseModel):
    """A page record identified by an explicit integer id and a unique name."""
    # Explicit integer primary key; DatabaseManager.get_or_create_page passes it in.
    id = IntegerField(primary_key=True)
    # Page name: required and unique across all stored pages.
    name = CharField(unique=True, null=False)

class Contributor(BaseModel):
    """A contributor record identified by a unique username."""
    # Integer primary key. DatabaseManager.get_or_create_contributor never passes
    # an id, so the value is presumably assigned by SQLite on insert — TODO confirm.
    id = IntegerField(primary_key=True)
    # Username: required and unique across all stored contributors.
    username = CharField(unique=True, null=False)

class MainCategory(BaseModel):
    """A top-level category together with summary statistics derived from its revisions."""
    id = IntegerField(primary_key=True)
    name = CharField(unique=True, null=False)
    number_of_subcategories = IntegerField(null=True)

    def to_dict(self):
        """Return the stored columns plus counts computed from the Revision table.

        Returns:
            dict with the keys ``id``, ``name``, ``number_of_subcategories``,
            ``number_of_pages`` (distinct pages with a revision in this
            category), ``number_of_contributors`` (distinct contributors with a
            revision in this category) and ``number_of_contributions`` (all
            revisions in this category).
        """
        # Every aggregate is restricted to revisions recorded for this category.
        belongs_here = Revision.main_category == self

        page_total = (
            Page.select()
            .join(Revision)
            .where(belongs_here)
            .distinct()
            .count()
        )
        contributor_total = (
            Contributor.select()
            .join(Revision)
            .where(belongs_here)
            .distinct()
            .count()
        )
        revision_total = Revision.select().where(belongs_here).count()

        summary = dict(
            id=self.id,
            name=self.name,
            number_of_subcategories=self.number_of_subcategories,
        )
        summary["number_of_pages"] = page_total
        summary["number_of_contributors"] = contributor_total
        summary["number_of_contributions"] = revision_total
        return summary


class Revision(BaseModel):
    """One revision of a page, recorded against a main category."""
    # Revision id. It is NOT declared as a primary key; uniqueness is enforced
    # only in combination with main_category (see the index in Meta).
    # NOTE(review): no explicit primary key is declared on this model — confirm
    # how peewee treats a user-declared `id` field in that situation.
    id = IntegerField()
    # Foreign keys to the owning category, page and contributor; each exposes a
    # `revisions` back-reference on the referenced model.
    main_category = ForeignKeyField(MainCategory, backref='revisions', on_delete='CASCADE')
    page = ForeignKeyField(Page, backref='revisions', on_delete='CASCADE')
    contributor = ForeignKeyField(Contributor, backref='revisions', on_delete='CASCADE')
    # Falls back to the current (naive) UTC time when no timestamp is supplied.
    timestamp = DateTimeField(default=datetime.utcnow)

    class Meta:
        indexes = (
            (('id', 'main_category'), True),  # Composite unique index
        )

class Crawls(BaseModel):
    """Bookkeeping record for one crawl run of a main category."""
    id = IntegerField(primary_key=True)
    # Category the crawl was started from; exposes a `crawls` back-reference.
    main_category = ForeignKeyField(MainCategory, backref='crawls', on_delete='CASCADE')
    # Depth value the crawl was run with.
    depth = IntegerField()
    # Start and end of the crawl; both are required (no default, not nullable).
    start_time = DateTimeField()
    end_time = DateTimeField()


In [20]:
from peewee import SqliteDatabase

class DatabaseManager:
    """Convenience layer over the peewee models backed by the module-level ``db``.

    Creating an instance opens the connection (if it is closed) and creates any
    missing tables, so the other methods can be called right away.
    """

    def __init__(self):
        """Connect to ``db`` if necessary and make sure all tables exist."""
        self.db = db
        if self.db.is_closed():
            self.db.connect()
        self._ensure_tables_exist()

    def _ensure_tables_exist(self):
        """Ensure the database tables are created."""
        self.db.create_tables([Page, Contributor, Revision, MainCategory, Crawls], safe=True)

    def get_or_create_page(self, page_id, page_name):
        """Return the Page with ``page_id``, creating it with ``page_name`` if absent.

        The lookup uses the id only; ``page_name`` is applied solely when a new
        row is inserted.
        """
        page, _created = Page.get_or_create(id=page_id, defaults={'name': page_name})
        return page

    def get_or_create_contributor(self, username):
        """Return the Contributor named ``username``, creating it if absent."""
        contributor, _created = Contributor.get_or_create(username=username)
        return contributor

    def create_revision(self, revision_id, page, contributor, timestamp, main_category_id):
        """Return the Revision for ``(revision_id, main_category_id)``, creating it if absent.

        Args:
            revision_id: Identifier of the revision.
            page: Page row (or id) the revision belongs to.
            contributor: Contributor row (or id) who made the revision.
            timestamp: Value stored in ``Revision.timestamp``.
            main_category_id: MainCategory row (or id) the revision is recorded for.

        Returns:
            The existing or newly created Revision.
        """
        # Revision uniqueness is declared on (id, main_category), so both columns
        # must take part in the lookup. Searching by id alone would return a row
        # stored for a different main category and skip the insert for this one.
        revision, _created = Revision.get_or_create(
            id=revision_id,
            main_category=main_category_id,
            defaults={
                'page': page,
                'contributor': contributor,
                'timestamp': timestamp,
            }
        )
        return revision

    def get_or_create_main_category(self, category_name):
        """Return the MainCategory named ``category_name``, creating it if absent."""
        main_category, _created = MainCategory.get_or_create(
            name=category_name,
        )
        return main_category

    def update_main_category(self, category_name, num_subcategories):
        """Store ``num_subcategories`` on the MainCategory named ``category_name``.

        Raises:
            MainCategory.DoesNotExist: If no category with that name is stored.
        """
        main_category = MainCategory.get(name=category_name)
        main_category.number_of_subcategories = num_subcategories
        main_category.save()
        return main_category

    def create_crawl(self, main_category, depth, start_time, end_time):
        """Insert and return a Crawls row describing one finished crawl."""
        crawl = Crawls.create(
            main_category=main_category,
            depth=depth,
            start_time=start_time,
            end_time=end_time
        )
        return crawl

    def get_all_crawls(self):
        """Return a select query over all Crawls rows."""
        return Crawls.select()

    def get_all_revisions(self):
        """Return a select query over all Revision rows."""
        return Revision.select()

    def get_all_pages(self):
        """Return a select query over all Page rows."""
        return Page.select()

    def get_all_contributors(self):
        """Return a select query over all Contributor rows."""
        return Contributor.select()

    def get_all_categories(self):
        """Return a select query over all MainCategory rows."""
        return MainCategory.select()

    def close(self):
        """Close the database connection if it is still open."""
        if not self.db.is_closed():
            self.db.close()


In [21]:
class WikipediaCategoryCrawler():
    """Crawl an English-Wikipedia category tree and persist what is found.

    Starting at ``main_category`` the crawler collects sub-categories down to a
    configurable depth, then stores every page in namespace 0 of those
    categories, together with all of its revisions and their contributors,
    through ``db_manager``.
    """

    def __init__(self, main_category, db_manager):
        """Set up the MediaWiki client and the crawl state.

        Args:
            main_category: Name of the root category, without the
                ``Category:`` prefix.
            db_manager: Persistence helper providing ``get_or_create_page``,
                ``get_or_create_contributor``, ``get_or_create_main_category``,
                ``create_revision``, ``create_crawl``, ``update_main_category``
                and a peewee database as ``db``.
        """
        self.site = mwclient.Site('en.wikipedia.org')
        self.visited_categories = set()
        # Unique sub-categories found by the most recent get_categories() call.
        self.subcategories = []
        self.pages = set()
        self.contributors = set()
        self.contributions = set()

        self.main_category = main_category
        # MainCategory row for `main_category`; resolved lazily on first use.
        self.main_category_id = None
        self.db_manager = db_manager

    def _ensure_main_category(self):
        """Resolve (creating if needed) the MainCategory row for this crawler."""
        if self.main_category_id is None:
            self.main_category_id = self.db_manager.get_or_create_main_category(self.main_category)
        return self.main_category_id

    def get_categories(self, depth=3):
        """Discover sub-categories up to ``depth`` levels and crawl their pages.

        The unique sub-category names (root excluded, first-seen order) are kept
        in ``self.subcategories``.
        """

        def get_subcategories(category, current_depth):
            if current_depth > depth or category in self.visited_categories:
                return []

            self.visited_categories.add(category)
            cat = self.site.categories[category]
            subcats = []

            for subcat in cat:
                if subcat.namespace == 14: # a namespace of 14 indicates a category, see reference below
                    subcat_name = subcat.name.replace('Category:', '')
                    subcats.append(subcat_name)
                    if current_depth < depth:
                        subcats.extend(get_subcategories(subcat_name, current_depth + 1))

            return subcats

        # Start from a clean slate so that a repeated crawl does not stop at the
        # already-visited root category.
        self.visited_categories.clear()

        # A category can be reached through several parents (or link back to the
        # root). Keep each name once, in first-seen order, so its pages are not
        # crawled repeatedly; the root's own pages are handled separately.
        discovered = get_subcategories(self.main_category, 1)
        all_subcategories = [
            name for name in dict.fromkeys(discovered) if name != self.main_category
        ]
        self.subcategories = all_subcategories
        print(f'Number of Subcategories: {len(all_subcategories)}')

        for idx, subcategory in enumerate(all_subcategories, start=1):
            print(f'Subcategory {idx}/{len(all_subcategories)}: {subcategory}')
            self.get_category_pages(subcategory)

    def get_category_pages(self, category):
        """Store every namespace-0 page of ``category`` with all of its revisions."""
        main_category = self._ensure_main_category()
        cat = self.site.categories[category]

        for page in cat:
            if page.namespace != 0:
                continue

            tmp_page = self.db_manager.get_or_create_page(page_id=page.pageid, page_name=page.name)

            # Fetch the full history first, then write it in a single
            # transaction: one commit per page instead of one per revision.
            revisions = list(page.revisions())
            with self.db_manager.db.atomic():
                for revision in revisions:
                    # Revisions without a 'user' entry are attributed to "Unknown".
                    username = revision['user'] if 'user' in revision else "Unknown"
                    tmp_contributor = self.db_manager.get_or_create_contributor(username=username)

                    # NOTE(review): the timestamp is stored as an ISO-8601 string;
                    # confirm that the DateTimeField column reads this format back
                    # as intended before changing it (existing rows use it).
                    timestamp_struct = revision['timestamp']
                    formatted_timestamp = time.strftime('%Y-%m-%dT%H:%M:%SZ', timestamp_struct)

                    self.db_manager.create_revision(
                        revision_id=revision['revid'],
                        page=tmp_page,
                        contributor=tmp_contributor,
                        timestamp=formatted_timestamp,
                        main_category_id=main_category
                    )

    def get_pages_and_contributions(self):
        """Store the pages (and their revisions) that sit directly in the main category."""
        self.get_category_pages(self.main_category)

    def crawl_category(self, depth=3):
        """Run a full crawl and record it.

        Crawls the sub-categories and the main category itself, then writes a
        Crawls row and updates the stored sub-category count.

        Args:
            depth: Number of category levels to descend, passed to get_categories.
        """
        start_time = datetime.utcnow()

        main_category = self._ensure_main_category()

        self.get_categories(depth)
        self.get_pages_and_contributions()

        end_time = datetime.utcnow()
        # The foreign key must reference the MainCategory row, not its name.
        self.db_manager.create_crawl(
            main_category=main_category,
            depth=depth,
            start_time=start_time,
            end_time=end_time
        )

        # Count the sub-categories that were crawled; visited_categories also
        # contains the root and omits the deepest level.
        self.db_manager.update_main_category(self.main_category, len(self.subcategories))





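Reference: MediaWiki namespace manual — https://www.mediawiki.org/wiki/Manual:Namespace
According to that manual, namespace 14 is the Category namespace and namespace 0 is the main (article) namespace; the crawler relies on both.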

In [22]:
# Opens the connection to `db` (if closed) and creates any missing tables.
db_manager = DatabaseManager()

In [23]:
# Crawl the "Artificial intelligence" category with the default depth of 3.
# This walks every revision of every page found, so it runs for a long time.
wikipedia_crawler = WikipediaCategoryCrawler("Artificial intelligence", db_manager)
wikipedia_crawler.crawl_category()

Number of Subcategories: 361
Subcategory 1/361: Affective computing


KeyboardInterrupt: 

In [9]:
# Despite the variable names, this iterates MainCategory rows (get_all_categories),
# not Crawls rows; to_dict() runs three COUNT queries per category.
all_crawls = db_manager.get_all_categories()
for crawl in all_crawls:
    print(crawl.to_dict()) 

{'id': 1, 'name': 'Artificial intelligence', 'number_of_subcategories': None, 'number_of_pages': 6829, 'number_of_contributors': 375623, 'number_of_contributions': 1477763}
