In [2]:
"""Get user statistics from Wikipedia"""

import logging
import os
import base64
import json
from dataclasses import dataclass, field
from functools import cached_property
from typing import Self

from IPython.display import display
import pandas
import requests


# Historical note kept from an earlier revision: on Fedora 27 with ipython3 6.2.1 the line
#     %config Application.log_level='WORKAROUND'
# was recorded as failing yet necessary.
# NOTE(review): the reason is not documented here — confirm whether it is still needed.
# Set the log level of the IPython application to INFO.
%config Application.log_level='INFO'

# Lower the root logger threshold to INFO so that logging.info() records are emitted.
logging.getLogger().setLevel(logging.INFO)

In [7]:
@dataclass
class WikipediaAPI:
    """A class for interacting with the Wikipedia API.

    :param lang: The language code, used as the sub-domain of ``wikipedia.org``.
    """

    lang: str

    @property
    def url(self) -> str:
        """Get the URL of the Wikipedia API endpoint.

        :return: The URL of the Wikipedia API endpoint.
        """
        return f"https://{self.lang}.wikipedia.org/w/api.php"

    def call(
        self, query: dict[str, str], method: str = "get"
    ) -> dict[str, dict[str, str]]:
        """Send an HTTP GET or POST request to the API and decode its JSON response.

        :param query: A dictionary containing the query parameters for the request.
            It is sent as URL parameters for "get" and as form data for "post".
        :param method: The HTTP method to use, "get" or "post" (case-insensitive).
            Default is "get".
        :return: The decoded JSON body of the response.
        :raises ValueError: If ``method`` is not supported, or if the response body
            cannot be decoded as JSON (the URL and query are logged first).
        """
        verb = method.lower()
        # Reject unknown verbs before any network traffic instead of failing later
        # with an AttributeError on a missing response.
        if verb not in ("get", "post"):
            raise ValueError(f"Unsupported HTTP method: {method!r}")

        if verb == "get":
            response = requests.get(self.url, params=query, timeout=60)
        else:
            response = requests.post(self.url, data=query, timeout=60)

        try:
            return response.json()
        except ValueError:
            # Log what was asked, then propagate: returning the raw response here
            # would break the declared return type and fail later in the caller.
            logging.critical(f"{self.url=}")
            logging.critical(f"{query=}")
            raise


@dataclass
class WikipediaUserContributionsPage:
    """Represents a Wikipedia user contributions page.

    :param result: A list of dictionaries representing the user contributions.
    :param next_info: A dictionary containing information about the next page of user contributions.
    """

    result: list[dict[str, str]]
    next_info: dict[str, str]

    def to_dict(self) -> dict[str, dict[str, str]]:
        """Serializes the instance to a dictionary.

        :return: A dictionary representation of the instance.
        """
        return {"result": self.result, "next_info": self.next_info}

    @classmethod
    def from_dict(cls, user_contributions_page: dict[str, dict[str, str]]) -> Self:
        """Deserializes an instance of the class using a dictionary representation.

        :param user_contributions_page: A dictionary containing user contribution page data.
        :return: An instance of the class constructed using the provided dictionary.
        """
        return cls(
            result=user_contributions_page["result"],
            next_info=user_contributions_page["next_info"],
        )


@dataclass
class WikipediaUserContributionsHelper:
    """Helper class to retrieve and cache user contributions from Wikipedia.

    :param user_name: The username of the Wikipedia user.
    :param lang: The language code of the Wikipedia.
    :param next_info: The dictionary containing the values for pagination.
    """

    user_name: str
    lang: str
    next_info: dict[str, str] = field(default=None)

    @property
    def params(self) -> dict[str, str]:
        """Returns the parameters to be used in the API request.

        :return: A dictionary of parameters.
        """
        params = {
            "action": "query",
            "format": "json",
            "list": "usercontribs",
            "ucuser": self.user_name,
            "uclimit": "500",
            "continue": "",
            "ucprop": "ids|title|timestamp|flags|tags|sizediff",
            "formatversion": "2",
            "ucdir": "newer",
        }
        if self.next_info:
            params["continue"] = self.next_info["continue"]
            params["uccontinue"] = self.next_info["uccontinue"]
        return params

    @property
    def _distant_contributions(self) -> WikipediaUserContributionsPage:
        """Retrieves the page of contributions from wikipedia.

        :return: The user's distant contributions.
        """
        logging.info("Getting data form wikipedia…")
        api: WikipediaAPI = WikipediaAPI(lang=self.lang)

        request = api.call(self.params)

        result = request["query"]["usercontribs"]
        next_info = request.get("continue")

        user_contributions_page = WikipediaUserContributionsPage(result, next_info)

        return user_contributions_page

    @property
    def cached_contributions(self) -> WikipediaUserContributionsPage:
        """Returns a page of contributions from cache or get it from wikipedia.

        :return: The page of contributions of the user.
        """
        file_name = self.get_filename(self.next_info)
        file_dir = f"data/{self.lang}/{self.user_name}"
        file_path = f"{file_dir}/{file_name}"

        user_contributions_page = None

        if not os.path.exists(file_dir):
            os.makedirs(file_dir)

        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as contributions_page_file:
                contributions_page = json.load(contributions_page_file)
                user_contributions_page = WikipediaUserContributionsPage.from_dict(
                    contributions_page
                )
                logging.info(
                    f"Loaded {len(user_contributions_page.result)} contributions information "
                    f"in {file_path}"
                )
                if not user_contributions_page.next_info:
                    logging.info("This was the last cached page, we will re-query it.")
        if not user_contributions_page or not user_contributions_page.next_info:
            user_contributions_page = self._distant_contributions
            with open(file_path, "w", encoding="utf-8") as contributions_page_file:
                json.dump(user_contributions_page.to_dict(), contributions_page_file)
                logging.info(
                    f"Saved {len(user_contributions_page.result)} contributions information "
                    f"in {file_path}"
                )
        return user_contributions_page

    @staticmethod
    def get_filename(next_info: dict[str, str]) -> str:
        """Converts a dictionary to a base64-encoded JSON string.

        :param next_info: A dictionary containing the information to be converted.
        :return: The base64 encoded JSON string.
        """
        # Convert the dict to a JSON string
        json_str = json.dumps(next_info)

        # Convert the JSON string to bytes, so it can be encoded by base64
        json_bytes = json_str.encode("utf-8")

        # Encode the bytes
        base64_bytes = base64.b64encode(json_bytes)

        # Convert the base64 bytes to a string for easier output and storage
        base64_str = base64_bytes.decode("utf-8")

        return base64_str


@dataclass
class WikipediaUser:
    """Represents a user on Wikipedia.

    :param lang: the language of the Wikipedia site the user belongs to.
    :param name: the username of the user.
    """

    lang: str
    name: str

    def fetch_contribs(self) -> list[dict[str, str]]:
        """Fetches the user contributions for the specified user and language.

        Pages are requested one after the other, each with the pagination values
        returned by the previous one, until a page carries no further pagination values.

        :return: The contributions of all pages, in the order the pages were read.
        """
        contributions = []
        # Sentinel for the first page: a non-empty dictionary (so the loop is entered)
        # whose pagination values are still unknown.
        next_info = {"uccontinue": None, "continue": None}

        while next_info:
            uc_helper = WikipediaUserContributionsHelper(
                self.name, self.lang, next_info=next_info
            )
            contributions_page = uc_helper.cached_contributions
            contributions.extend(contributions_page.result)
            next_info = contributions_page.next_info

        return contributions


class WikipediaNamespaces:
    """Retrieves the namespaces in Wikipedia.

    :param lang: The language code of the Wikipedia site.
    """

    def __init__(self, lang: str = "en"):
        self.lang = lang

    @cached_property
    def namespaces(self) -> dict[int, str]:
        """Retrieves the namespaces available in the Wikipedia API.

        The API is queried on first access only; the result is then cached on the
        instance.

        :return: A dictionary containing the namespace IDs as keys and the
        corresponding canonical namespace names as values. A namespace without a
        "canonical" entry is named "Main".
        """
        api = WikipediaAPI(lang=self.lang)

        params = {
            "action": "query",
            "meta": "siteinfo",
            "siprop": "namespaces",
            "format": "json",
            "formatversion": "2",
        }

        described_namespaces = api.call(params)["query"]["namespaces"]
        return {
            namespace["id"]: namespace.get("canonical", "Main")
            for namespace in described_namespaces.values()
        }

In [4]:
@dataclass
class UserStats:
    """Analyses user statistics from Wikipedia.

    :param user_name: The name of the user.
    :param lang: The language of the Wikipedia site.
    """

    user_name: str
    lang: str

    @cached_property
    def user(self) -> WikipediaUser:
        """Returns the WikipediaUser object for the given username and language.

        :return: An instance of WikipediaUser representing the user.
        """
        return WikipediaUser(name=self.user_name, lang=self.lang)

    @cached_property
    def revisions(self) -> pandas.DataFrame:
        """Processes revisions information from the user's contributions.

        :return:  The processed revisions' data.
        """
        raw_revisions = pandas.DataFrame(self.user.fetch_contribs())
        revisions = self.decode_namespaces(raw_revisions)
        revisions = self.decode_dates(revisions)
        revisions = self.add_absolute_sizediff(revisions)
        return revisions

    def decode_namespaces(self, revisions: pandas.DataFrame) -> pandas.DataFrame:
        """Decode namespaces

        :param revisions: A pandas DataFrame containing revisions data.
        :return: A pandas DataFrame with an additional column "decoded_namespace" that maps
        the namespace codes in the "ns" column to their corresponding names.
        """
        mediawiki_namespaces = WikipediaNamespaces(lang=self.lang).namespaces

        revisions["decoded_namespace"] = revisions["ns"].map(mediawiki_namespaces)
        return revisions

    @staticmethod
    def decode_dates(revisions: pandas.DataFrame) -> pandas.DataFrame:
        """Decode dates method takes a pandas DataFrame as input and decodes the timestamps
        in the 'timestamp' column into datetime format.

        :param revisions: The revisions' data.
        :return: A pandas DataFrame with the 'date' column containing decoded datetime values.
        """
        revisions["date"] = pandas.to_datetime(revisions["timestamp"])
        return revisions

    @staticmethod
    def add_absolute_sizediff(revisions: pandas.DataFrame) -> pandas.DataFrame:
        """Add the absolute values of the "sizediff" column to the given DataFrame.

        :param revisions: A pandas DataFrame containing the revisions data.
        :return: A pandas DataFrame with an additional column named "absolute_sizediff" which
        contains the absolute values of the "sizediff" column from the input DataFrame.
        """
        revisions["absolute_sizediff"] = revisions["sizediff"].abs()
        return revisions

    def filtered_revisions(self, month: int, year: int) -> pandas.DataFrame:
        """Filters revisions based on the provided month and year.

        :param month: The month for filtering revisions (1-12).
        :param year: The year for filtering revisions.
        :return: A pandas DataFrame containing the filtered revisions.
        """
        mask = (self.revisions["date"].dt.year == year) & (
            self.revisions["date"].dt.month == month
        )
        revisions_filtered = self.revisions[mask]
        return revisions_filtered

    @property
    def revision_count(self) -> int:
        """Get the count of revisions.

        :return: The count of revisions.
        """
        return len(self.revisions)

    @property
    def revision_tags(self) -> list[str]:
        """Retrieve the unique revision tags from the revisions attribute.

        :return: A list of unique revision tags.
        """
        tags = self.revisions["tags"].explode()
        return tags.unique()

    @property
    def revision_keys(self) -> list[str]:
        """Retrieve the keys of the revisions.

        :return: A list of keys from the revisions.
        """
        return self.revisions.keys()

    @property
    def revision_namespaces(self) -> list[str]:
        """Returns a list of unique revision namespaces.

        :return: A list of unique revision namespaces.
        """
        namespaces = self.revisions["decoded_namespace"]
        return namespaces.unique()

    def advanced_revisions_count_per_namespace(
        self, month=None, year=None
    ) -> pandas.DataFrame:
        """Get a pivot table for a given month and year or for the whole history with statistics
        per namespaces.

        :param month: The month for which to retrieve revisions (optional, default None)
        :param year: The year for which to retrieve revisions (optional, default None)
        :return: A DataFrame containing the advanced revisions count per namespace
        """
        if month and year:
            revisions = self.filtered_revisions(month, year)
        else:
            revisions = self.revisions.copy()

        if len(revisions) > 0:

            total_lines = revisions.shape[0]
            total_diff_sizes = revisions["absolute_sizediff"].sum()

            # create a pivot table
            pivot_table = revisions.pivot_table(
                index="decoded_namespace",
                values="absolute_sizediff",
                aggfunc=[len, "sum"],
            )

            # Add a column for distinct pages
            distinct_pages_pivot = revisions.pivot_table(
                index="decoded_namespace",
                values="title",
                aggfunc=[pandas.Series.nunique],
            )

            # Join
            pivot_table = pivot_table.join(distinct_pages_pivot)

            # Rename columns
            pivot_table.columns = [
                "count_of_lines",
                "sum_of_diff_sizes",
                "distinct_pages",
            ]

            # calculate percentages
            pivot_table["percentage_of_lines"] = (
                pivot_table["count_of_lines"] / total_lines * 100
            )
            pivot_table["percentage_of_diff_sizes"] = (
                pivot_table["sum_of_diff_sizes"] / total_diff_sizes * 100
            )

            # Sort the dataframe by 'percentage_of_lines' in descending order
            pivot_table_sorted = pivot_table.sort_values(
                "percentage_of_lines", ascending=False
            )

            # result
            return pivot_table_sorted
        return None

    @property
    def revisions_count_per_title(self) -> pandas.DataFrame:
        """Returns a pandas DataFrame with the count of revisions per title.

        :return: A pandas DataFrame with two columns: "title" and "count".
         The "title" column contains unique titles from the revisions dataset,
         while the "count" column contains the corresponding count of revisions per title.

        """
        return self.revisions["title"].value_counts()

In [8]:
my_user = UserStats(user_name="Florck", lang="fr")
# Whole history first, then March 2024 and April 2024.
for period in ({}, {"month": 3, "year": 2024}, {"month": 4, "year": 2024}):
    display(my_user.advanced_revisions_count_per_namespace(**period))

INFO:root:Loaded 500 contributions information in data/fr/Florck/eyJ1Y2NvbnRpbnVlIjogbnVsbCwgImNvbnRpbnVlIjogbnVsbH0=
INFO:root:Loaded 193 contributions information in data/fr/Florck/eyJ1Y2NvbnRpbnVlIjogIjIwMjQwNDAzMTg1NzA5fDIxMzkzOTQ4NCIsICJjb250aW51ZSI6ICItfHwifQ==
INFO:root:This was the last cached page, we will re-query it.
INFO:root:Getting data form wikipedia…
INFO:root:Saved 194 contributions information in data/fr/Florck/eyJ1Y2NvbnRpbnVlIjogIjIwMjQwNDAzMTg1NzA5fDIxMzkzOTQ4NCIsICJjb250aW51ZSI6ICItfHwifQ==


Unnamed: 0_level_0,count_of_lines,sum_of_diff_sizes,distinct_pages,percentage_of_lines,percentage_of_diff_sizes
decoded_namespace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Main,353,244228,183,50.864553,54.739881
Talk,224,122170,30,32.276657,27.382492
Project,55,43076,13,7.925072,9.654811
User talk,46,33831,12,6.628242,7.582689
User,12,862,6,1.729107,0.193204
Project talk,4,1994,2,0.576369,0.446924


Unnamed: 0_level_0,count_of_lines,sum_of_diff_sizes,distinct_pages,percentage_of_lines,percentage_of_diff_sizes
decoded_namespace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Main,253,102341,145,56.222222,45.209013
Talk,115,63491,17,25.555556,28.047073
Project,43,32737,7,9.555556,14.46153
User talk,34,27738,8,7.555556,12.253228
User,5,66,3,1.111111,0.029155


Unnamed: 0_level_0,count_of_lines,sum_of_diff_sizes,distinct_pages,percentage_of_lines,percentage_of_diff_sizes
decoded_namespace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Talk,104,55903,14,43.697479,25.760445
Main,99,141886,38,41.596639,65.381939
Project,12,10339,7,5.042017,4.764275
User talk,12,6093,5,5.042017,2.807692
User,7,796,3,2.941176,0.366802
Project talk,4,1994,2,1.680672,0.918847
