# Wikipedia Page Reference Analysis

This Jupyter notebook contains Python code for analyzing the reference structure of Wikipedia pages. It includes two main components:

1. A set of helper functions for loading the page data, building the reference graph, and computing the degree series of a page
2. Interactive cells for user input and result display

## Features:

- Efficient data handling using Pandas and Parquet
- Analysis of reference degrees (how many steps away pages are from a given page)
- Calculation of weighted mean distance, total nodes, and network diameter
- Option to consider only the first reference or all references for each page

## Dependencies:

- pandas
- igraph
- os
- typing

## Note:

The analysis can be performed considering either only the first reference of each page or all references. This allows for different perspectives on the Wikipedia link structure.

In [20]:
import pandas as pd
from os.path import join
from igraph import Graph

from typing import List, Set  # For type hinting


def get_ordinal_suffix(n: int) -> str:
    """
    Return the English ordinal suffix that follows an integer.

    Args:
        n (int): The number that is going to be written as an ordinal.

    Returns:
        str: One of 'st', 'nd', 'rd' or 'th'.
    """

    last_two_digits = n % 100

    # Everything ending in 10..20 (11th, 12th, 13th, ...) takes 'th',
    # regardless of its final digit.
    if last_two_digits >= 10 and last_two_digits <= 20:
        return "th"

    last_digit = n % 10
    if last_digit == 1:
        return "st"
    if last_digit == 2:
        return "nd"
    if last_digit == 3:
        return "rd"
    return "th"


def filter_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Lowercase the page titles and every reference of a page DataFrame.

    The 'Page Title' and 'Page References' columns are overwritten on the
    DataFrame that was passed in, and that same DataFrame is returned.

    Args:
        df (pd.DataFrame): Page data with a 'Page Title' column of strings and
            a 'Page References' column holding an iterable of strings per row.

    Returns:
        pd.DataFrame: The same DataFrame, with both columns lowercased.
    """

    def _lowercase_all(refs):
        # Always produce a plain list, whatever iterable the row holds
        return [ref.lower() for ref in refs]

    print("[INFO] Applying modifiers to DataFrame")

    lowered_titles = df["Page Title"].str.lower()
    df["Page Title"] = lowered_titles

    lowered_references = df["Page References"].apply(_lowercase_all)
    df["Page References"] = lowered_references

    return df


def read_parquet(wikinamedate: str) -> pd.DataFrame:
    """
    Reads a processed Parquet file containing Wikipedia page data and returns it as a DataFrame.

    The file is looked up at '../output/<wikinamedate>/processed.parquet',
    where every '/' in `wikinamedate` is replaced by '-' to form the
    directory name.

    Args:
        wikinamedate (str): The date and name string used to locate the Parquet file.

    Returns:
        pd.DataFrame: The DataFrame containing the processed page data.
    """

    # Compute the directory name once so the log message and the path agree
    dump_dir = wikinamedate.replace("/", "-")

    print(f"[INFO] Reading '{dump_dir}/processed.parquet'")

    return pd.read_parquet(join("../output/", dump_dir, "processed.parquet"))


def create_graph(df: pd.DataFrame, first_ref: bool, batch_size: int = 10_000) -> Graph:
    """
    Build a directed igraph Graph from the page/reference DataFrame.

    Every value of 'Page Title' becomes a vertex, and an edge is added from a
    page to the pages it references.

    Args:
        df (pd.DataFrame): Page data with 'Page Title' and 'Page References' columns.
        first_ref (bool): If True, only the first reference of each page
            becomes an edge; otherwise every reference does.
        batch_size (int): Pending edges are flushed to the graph once at least
            this many have accumulated.

    Returns:
        Graph: The directed graph of page references.
    """
    titles = df['Page Title'].values
    reference_lists = df['Page References'].values

    graph = Graph(directed=True)
    graph.add_vertices(titles)

    # Edges are collected here and handed to igraph in chunks rather than
    # one at a time.
    pending_edges = []

    # Builtin zip pairs each title with its own list of references
    for title, refs in zip(titles, reference_lists):
        if len(refs) > 0:
            targets = refs[:1] if first_ref else refs
            pending_edges.extend((title, target) for target in targets)

        # Flush once the buffer is large enough
        if len(pending_edges) >= batch_size:
            graph.add_edges(pending_edges)
            pending_edges.clear()

    # Flush whatever is left after the last page
    if pending_edges:
        graph.add_edges(pending_edges)

    return graph


def calculate_degree_series(g: Graph, page_title: str, first_ref: bool = False) -> List[int]:
    """
    Count how many pages are first reached at each degree of distance from a page.

    Starting from the lowercased `page_title`, the graph is explored level by
    level along outgoing edges. Entry `i` of the result is the number of pages
    first reached after `i + 1` steps. The starting page itself is never
    counted, even when a chain of references leads back to it.

    Args:
        g (Graph): Directed graph whose vertices carry a "name" attribute.
        page_title (str): Title of the starting page; it is lowercased before lookup.
        first_ref (bool): Not used by this function; kept so existing calls
            that pass it keep working.

    Returns:
        List[int]: Number of newly reached pages per degree. The search stops
        at the first level that finds no new page, so the last entry is
        always 0.
    """
    # Start with the initial page
    starting_vertex = page_title.lower()

    # The start page counts as already visited; otherwise a cycle (or a
    # self-loop) leading back to it would count it as a new page at a positive
    # distance and expand it a second time.
    processed_pages = {starting_vertex}
    pages_to_process = {starting_vertex}
    counts = []

    # Breadth-First Search (BFS) to determine the distance from the starting page
    while pages_to_process:
        degree = len(counts) + 1
        # end="\r" keeps the progress message on a single console line
        print(f"Checking {degree}{get_ordinal_suffix(degree)} degree of distance", end="\r")

        # Collect the names of every page referenced by the current level
        new_pages = set()
        for v in pages_to_process:
            neighbors = g.neighbors(v, mode="out")  # Indices of all outgoing neighbors
            new_pages.update(g.vs[neighbors]["name"])

        # Keep only pages that were not reached at an earlier degree
        new_pages -= processed_pages
        counts.append(len(new_pages))
        processed_pages.update(new_pages)
        pages_to_process = new_pages

    print()
    print(f"Degree Series: {str(counts)}")
    return counts

In [2]:
%%time
# Cell purpose: load the processed page data into `df` for the cells below.
# The %%time cell magic reports how long the whole cell takes to run.

# Restore variable from different Jupyter notebook
%store -r wikinamedate

# Reads a processed Parquet file containing Wikipedia page data,
# then lowercases the page titles and references via filter_df
df = filter_df(read_parquet(wikinamedate))
print()


[INFO] Reading 'ptwiki-20240720/processed.parquet'
[INFO] Applying modifiers to DataFrame

CPU times: user 50.7 s, sys: 4.58 s, total: 55.3 s
Wall time: 50.7 s


In [14]:
%%time

first_ref = True

# Create Graph
graph = create_graph(df, True)
print()



CPU times: user 52.9 s, sys: 149 ms, total: 53 s
Wall time: 52.9 s


<igraph.seq.EdgeSeq at 0x7f67462b9550>