In [55]:
!pip install mwclient



In [56]:
import mwclient
import time


In [57]:
!pip install peewee




In [70]:
from peewee import SqliteDatabase, Model, IntegerField, CharField, ForeignKeyField, DateTimeField
from datetime import datetime

db = SqliteDatabase('wikipedia.db')

class BaseModel(Model):
    class Meta:
        database = db

class Page(BaseModel):
    id = IntegerField(primary_key=True)
    name = CharField(unique=True, null=False)

    def to_dict(self):
        number_of_revisions = Revision.select().where(Revision.page == self).count()
        number_of_contributors = Revision.select().where(Revision.page == self).distinct().count()

        return {
            "id": self.id,
            "name": self.name,
            "number_of_revisions": number_of_revisions,
            "number_of_contributors": number_of_contributors,
        }

class Contributor(BaseModel):
    id = IntegerField(primary_key=True)
    username = CharField(unique=True, null=False)

    def to_dict(self):
        number_of_pages = Page.select().join(Revision).where(Revision.contributor == self).distinct().count()
        number_of_contributions = Revision.select().where(Revision.contributor == self).count()

        return {
            "id": self.id,
            "username": self.username,
            "number_of_pages": number_of_pages,
            "number_of_contributions": number_of_contributions,
        }

class MainCategory(BaseModel):
    id = IntegerField(primary_key=True)
    name = CharField(unique=True, null=False)
    number_of_subcategories = IntegerField(null=True)

    def to_dict(self):
        number_of_pages = Page.select().join(Revision).where(Revision.main_category == self).distinct().count()

        number_of_contributors = (
            Contributor.select()
            .join(Revision)
            .where(Revision.main_category == self)
            .distinct()
            .count()
        )

        number_of_contributions = Revision.select().where(Revision.main_category == self).count()

        return {
            "id": self.id,
            "name": self.name,
            "number_of_subcategories": self.number_of_subcategories,
            "number_of_pages": number_of_pages,
            "number_of_contributors": number_of_contributors,
            "number_of_contributions": number_of_contributions,
        }


class Revision(BaseModel):
    id = IntegerField(primary_key=True)
    main_category = ForeignKeyField(MainCategory, backref='revisions', on_delete='CASCADE')
    page = ForeignKeyField(Page, backref='revisions', on_delete='CASCADE')
    contributor = ForeignKeyField(Contributor, backref='revisions', on_delete='CASCADE')
    timestamp = DateTimeField(default=datetime.utcnow)

    class Meta:
        indexes = (
            (('id', 'main_category'), True),  # Composite unique index
        )

    def to_dict(self):
        return {
            "id": self.id,
            "main_category": self.main_category.name,
            "page": self.page.name,
            "contributor": self.contributor.username,
            "timestamp": self.timestamp,
        }

class Crawls(BaseModel):
    id = IntegerField(primary_key=True)
    main_category = ForeignKeyField(MainCategory, backref='crawls', on_delete='CASCADE')
    depth = IntegerField()
    start_time = DateTimeField()
    end_time = DateTimeField()

    def to_dict(self):
        return {
            "id": self.id,
            "main_category": self.main_category.name,
            "depth": self.depth,
            "start_time": self.start_time,
            "end_time": self.end_time,
        }


In [59]:
from peewee import SqliteDatabase

class DatabaseManager:
    def __init__(self):
        self.db = db
        if self.db.is_closed():
            self.db.connect()
        self._ensure_tables_exist()

    def _ensure_tables_exist(self):
        """Ensure the database tables are created."""
        self.db.create_tables([Page, Contributor, Revision, MainCategory, Crawls], safe=True)

    def get_or_create_page(self, page_id, page_name):
        page, created = Page.get_or_create(id=page_id, defaults={'name': page_name})
        return page

    def get_or_create_contributor(self, username):
        contributor, created = Contributor.get_or_create(username=username)
        return contributor

    def create_revision(self, revision_id, page, contributor, timestamp, main_category_id):
        revision, created = Revision.get_or_create(
            id=revision_id,
            defaults={
                'page': page,
                'contributor': contributor,
                'timestamp': timestamp,
                'main_category': main_category_id
            }
        )
        return revision

    def get_or_create_main_category(self, category_name):
        main_category, created = MainCategory.get_or_create(
            name=category_name,
        )
        return main_category
    
    def update_main_category(self, category_name, num_subcategories):
        main_category = MainCategory.get(name=category_name)
        main_category.number_of_subcategories = num_subcategories
        main_category.save()
        return main_category

    def create_crawl(self, main_category, depth, start_time, end_time):
        crawl = Crawls.create(
            main_category=main_category,
            depth=depth,
            start_time=start_time,
            end_time=end_time
        )
        return crawl

    def get_all_crawls(self):
            return Crawls.select()

    def get_all_revisions(self):
        return Revision.select()

    def get_all_pages(self):
        return Page.select()

    def get_all_contributors(self):
        return Contributor.select()

    def get_all_categories(self):
        return MainCategory.select()

    def close(self):
        if not self.db.is_closed():
            self.db.close()


In [60]:
from datetime import datetime, timezone

class WikipediaCategoryCrawler():
    def __init__(self, main_category, db_manager):
        self.site = mwclient.Site('en.wikipedia.org')
        self.visited_categories = set()
        self.pages = set()
        self.contributors = set()
        self.contributions = set()

        self.main_category = main_category
        self.db_manager = db_manager

    def get_categories(self, depth=3):

        def get_subcategories(category, current_depth):
            if current_depth > depth or category in self.visited_categories:
                return []
            
            self.visited_categories.add(category)
            cat = self.site.categories[category]
            subcats = []

            for subcat in cat:
                if subcat.namespace == 14: # a namespace of 14 indicates a category, see reference below
                    subcat_name = subcat.name.replace('Category:', '')
                    subcats.append(subcat_name)
                    if current_depth < depth:
                        subcats.extend(get_subcategories(subcat_name, current_depth + 1))

            return subcats

        all_subcategories = get_subcategories(self.main_category, 1)
        print(f'Number of Subcategories: {len(all_subcategories)}')

        for idx, subcategory in enumerate(all_subcategories, start=1):
            print(f'Subcategory {idx}/{len(all_subcategories)}: {subcategory}')
            self.get_category_pages(subcategory)
            

    def get_category_pages(self, category):
        cat = self.site.categories[category]

        for page in cat:
            if page.namespace == 0:
                tmp_page = self.db_manager.get_or_create_page(page_id=page.pageid, page_name=page.name)

                for revision in list(page.revisions()):
                    if 'user' in revision:
                        tmp_contributor = self.db_manager.get_or_create_contributor(username=revision['user'])
                    else:
                        tmp_contributor = self.db_manager.get_or_create_contributor(username="Unknown")

                    timestamp_struct = revision['timestamp']
                    timestamp_obj = datetime(*timestamp_struct[:6], tzinfo=timezone.utc)  # Convert struct_time to datetime
                    formatted_timestamp = timestamp_obj.strftime('%Y-%m-%dT%H:%M:%SZ')

                    self.db_manager.create_revision(
                        revision_id=revision['revid'],
                        page=tmp_page,
                        contributor=tmp_contributor,
                        timestamp=formatted_timestamp,
                        main_category_id=self.main_category_id
                    )

    def get_pages_and_contributions(self):
        self.get_category_pages(self.main_category)

    def crawl_category(self, depth=3):
        start_time = datetime.now(timezone.utc)
        self.main_category_id = self.db_manager.get_or_create_main_category(self.main_category)

        self.get_categories(depth)
        self.get_pages_and_contributions()


        end_time = datetime.now(timezone.utc)
        self.db_manager.create_crawl(
            main_category=self.main_category_id,
            depth=depth,
            start_time=start_time,
            end_time=end_time
        )

        self.db_manager.update_main_category(self.main_category, len(self.visited_categories))





Namespace: https://www.mediawiki.org/wiki/Manual:Namespace
Category is namespace 14, according to the doc

In [62]:
db_manager = DatabaseManager()

In [63]:
category = 'Amiga CD32 games'

wikipedia_crawler = WikipediaCategoryCrawler(category, db_manager)
wikipedia_crawler.crawl_category()

Number of Subcategories: 1
Subcategory 1/1: Cancelled Amiga CD32 games


In [73]:
wikipedia_crawler = WikipediaCategoryCrawler("Artificial intelligence", db_manager)
wikipedia_crawler.crawl_category()

Number of Subcategories: 361
Subcategory 1/361: Affective computing
Subcategory 2/361: Social robots
Subcategory 3/361: AI accelerators
Subcategory 4/361: AI software
Subcategory 5/361: Data mining and machine learning software
Subcategory 6/361: Deep learning software
Subcategory 7/361: Neural network software
Subcategory 8/361: Social network analysis software
Subcategory 9/361: Ambient intelligence
Subcategory 10/361: Internet of things
Subcategory 11/361: Automatic identification and data capture
Subcategory 12/361: Blockchains
Subcategory 13/361: Internet of things companies
Subcategory 14/361: Home automation
Subcategory 15/361: IoT malware
Subcategory 16/361: IPv6
Subcategory 17/361: Machine to machine
Subcategory 18/361: Network appliances
Subcategory 19/361: Sensor network
Subcategory 20/361: Smart bands
Subcategory 21/361: Smart cities
Subcategory 22/361: Smart grid
Subcategory 23/361: Smart TV
Subcategory 24/361: Wearable devices
Subcategory 25/361: Wearable devices
Subcateg

In [74]:
all_categories = db_manager.get_all_categories()
for category in all_categories:
    print(category.to_dict()) 

{'id': 1, 'name': 'Amiga CD32 games', 'number_of_subcategories': 2, 'number_of_pages': 143, 'number_of_contributors': 8013, 'number_of_contributions': 29370}
{'id': 2, 'name': 'Artificial intelligence', 'number_of_subcategories': 109, 'number_of_pages': 8695, 'number_of_contributors': 610800, 'number_of_contributions': 2481785}


In [76]:
all_categories = db_manager.get_all_categories()
for category in all_categories:
    print(category.to_dict()) 

{'id': 1, 'name': 'Amiga CD32 games', 'number_of_subcategories': 2, 'number_of_pages': 143, 'number_of_contributors': 8013, 'number_of_contributions': 29370}
{'id': 2, 'name': 'Artificial intelligence', 'number_of_subcategories': 109, 'number_of_pages': 8695, 'number_of_contributors': 610800, 'number_of_contributions': 2481785}


In [75]:
all_categories = db_manager.get_all_crawls()
for category in all_categories:
    print(category.to_dict()) 
    

{'id': 1, 'main_category': 'Amiga CD32 games', 'depth': 3, 'start_time': '2024-12-04 12:57:21.266809+00:00', 'end_time': '2024-12-04 13:01:37.014735+00:00'}
{'id': 2, 'main_category': 'Artificial intelligence', 'depth': 3, 'start_time': '2024-12-04 13:03:34.464555+00:00', 'end_time': '2024-12-04 22:08:44.644957+00:00'}


In [68]:
all_categories = db_manager.get_all_pages()
for category in all_categories:
    print(category.to_dict()) 

{'id': 28341, 'name': 'Simon the Sorcerer', 'number_of_revisions': 315, 'number_of_contributors': 315}
{'id': 67840, 'name': 'Zool', 'number_of_revisions': 485, 'number_of_contributors': 485}
{'id': 69179, 'name': 'Lemmings (video game)', 'number_of_revisions': 2217, 'number_of_contributors': 2217}
{'id': 224540, 'name': 'Project-X', 'number_of_revisions': 95, 'number_of_contributors': 95}
{'id': 224662, 'name': 'Rise of the Robots', 'number_of_revisions': 527, 'number_of_contributors': 527}
{'id': 253870, 'name': 'Treasure Island Dizzy', 'number_of_revisions': 178, 'number_of_contributors': 178}
{'id': 254113, 'name': 'Speedball (video game)', 'number_of_revisions': 348, 'number_of_contributors': 348}
{'id': 254654, 'name': 'The Chaos Engine', 'number_of_revisions': 274, 'number_of_contributors': 274}
{'id': 290504, 'name': 'Beneath a Steel Sky', 'number_of_revisions': 794, 'number_of_contributors': 794}
{'id': 311487, 'name': 'Catacomb 3-D', 'number_of_revisions': 297, 'number_of_con

In [69]:
all_categories = db_manager.get_all_revisions()
for category in all_categories:
    print(category.to_dict())

{'id': 144493, 'main_category': 'Amiga CD32 games', 'page': 'Lemmings (video game)', 'contributor': 'Imran', 'timestamp': '2002-08-08T04:16:17Z'}
{'id': 144496, 'main_category': 'Amiga CD32 games', 'page': 'Lemmings (video game)', 'contributor': '217.168.172.202', 'timestamp': '2002-08-08T04:17:53Z'}
{'id': 210273, 'main_category': 'Amiga CD32 games', 'page': 'Simon the Sorcerer', 'contributor': 'Conversion script', 'timestamp': '2002-02-25T15:51:15Z'}
{'id': 227012, 'main_category': 'Amiga CD32 games', 'page': 'Lemmings (video game)', 'contributor': '217.168.172.202', 'timestamp': '2002-08-08T04:19:23Z'}
{'id': 344048, 'main_category': 'Amiga CD32 games', 'page': 'Lemmings (video game)', 'contributor': 'Andre Engels', 'timestamp': '2002-09-18T17:47:57Z'}
{'id': 344190, 'main_category': 'Amiga CD32 games', 'page': 'Lemmings (video game)', 'contributor': '165.139.124.250', 'timestamp': '2002-10-08T18:10:35Z'}
{'id': 549871, 'main_category': 'Amiga CD32 games', 'page': 'Lemmings (video g

In [36]:
all_categories = db_manager.get_all_contributors()
for category in all_categories:
    print(category.to_dict()) 

{'id': 1, 'username': 'Cyberlink420', 'number_of_pages': 6, 'number_of_contributions': 8}
{'id': 2, 'username': 'Mika1h', 'number_of_pages': 101, 'number_of_contributions': 373}
{'id': 3, 'username': 'Dgpop', 'number_of_pages': 97, 'number_of_contributions': 291}
{'id': 4, 'username': 'Citation bot', 'number_of_pages': 94, 'number_of_contributions': 198}
{'id': 5, 'username': 'Qwerfjkl (bot)', 'number_of_pages': 3, 'number_of_contributions': 3}
{'id': 6, 'username': 'Wbm1058', 'number_of_pages': 6, 'number_of_contributions': 7}
{'id': 7, 'username': 'JJMC89 bot III', 'number_of_pages': 137, 'number_of_contributions': 311}
{'id': 8, 'username': 'Lennart97', 'number_of_pages': 2, 'number_of_contributions': 2}
{'id': 9, 'username': 'Deltasim', 'number_of_pages': 31, 'number_of_contributions': 115}
{'id': 10, 'username': 'I dream of horses', 'number_of_pages': 16, 'number_of_contributions': 19}
{'id': 11, 'username': 'Rodw', 'number_of_pages': 12, 'number_of_contributions': 14}
{'id': 12, 