In [1]:
# Install this notebook's third-party dependencies into the running kernel's
# environment (-q keeps pip's output quiet).
# NOTE(review): versions are unpinned and tqdm is not used in the cells shown
# here -- confirm it is still needed and consider pinning.
%pip install -q pandas pyarrow tqdm

Note: you may need to restart the kernel to use updated packages.


In [5]:
from os.path import join
from typing import Iterable, List, Optional

import pandas as pd


# Portuguese-language namespace prefixes; a reference whose target starts with
# one of these is never accepted as a page reference.
EXCLUDED_PREFIXES = [
    'Wikipédia:', 'Categoria:', 'Predefinição:', 'Ficheiro:', 'Portal:',
    'Módulo:', 'Tópico:', 'Ajuda:', 'MediaWiki:', 'Livro:', 'TimedText:'
]


def filter_redirects(
    redirect_array: Optional[Iterable[str]],
    page_title_lower: set,
) -> Optional[List[str]]:
    """Return the first reference that points at a known page title.

    Each entry is reduced to its bare target: everything from the first ``|``
    and from the first ``#`` onwards is dropped and surrounding whitespace is
    trimmed. Targets starting with one of ``EXCLUDED_PREFIXES`` are skipped.
    The first remaining target whose lower-cased form is in
    ``page_title_lower`` wins; later entries are not inspected.

    Args:
        redirect_array: Raw reference strings of one page. ``None`` (a row
            without references) is accepted and treated like an empty
            sequence.
        page_title_lower: Lower-cased page titles to match against.

    Returns:
        A one-element list holding the matching target in its original
        casing, or ``None`` when no entry matches.
    """
    if redirect_array is None:
        return None

    # str.startswith only accepts a tuple of prefixes; build it once per call
    # instead of re-scanning the list for every entry.
    excluded = tuple(EXCLUDED_PREFIXES)

    for redirect in redirect_array:
        redirect_ref = redirect.split('|')[0].split('#')[0].strip()

        if redirect_ref.startswith(excluded):
            continue
        if redirect_ref.lower() in page_title_lower:
            return [redirect_ref]

    return None


def process_page(wikinamedate: str, output_dir: str = './output/') -> None:
    """Filter a raw page dump down to namespace-0 pages with resolved references.

    Reads ``<output_dir>/<name>/raw.parquet``, keeps the rows whose
    ``'Page Namespace'`` equals the string ``'0'``, replaces every
    ``'Page References'`` value with the result of ``filter_redirects``
    (matched against the lower-cased ``'Page Title'`` values of the kept rows),
    writes the frame to ``<output_dir>/<name>/processed.parquet`` and prints a
    one-line summary. ``<name>`` is ``wikinamedate`` with ``/`` replaced by
    ``-``.

    Args:
        wikinamedate: Identifier of the dump folder; any ``/`` in it is
            replaced by ``-`` to build the folder name.
        output_dir: Directory that contains the dump folder. Defaults to the
            previously hard-coded ``'./output/'``.
    """
    # Compute the folder name and path once instead of on every use.
    dump_name = wikinamedate.replace('/', '-')
    dump_dir = join(output_dir, dump_name)

    print(f'[INFO] Reading {dump_name}/raw.parquet')
    df = pd.read_parquet(join(dump_dir, 'raw.parquet'))

    print(f'[INFO] Filtering {dump_name}/raw.parquet')
    df = df[df['Page Namespace'] == '0']

    page_title_lower = set(df['Page Title'].str.lower().values)
    df['Page References'] = df['Page References'].apply(
        lambda redirect_array: filter_redirects(redirect_array, page_title_lower)
    )

    df.to_parquet(join(dump_dir, 'processed.parquet'))

    # Series.count() ignores nulls, so this counts pages whose reference list
    # resolved to a known title.
    with_reference = df['Page References'].count()
    total_pages = df['Page Title'].count()
    # Guard the empty-frame case, which would otherwise divide 0 by 0.
    percentage = round(100 * (with_reference / total_pages), 2) if total_pages else 0.0
    print(f"[INFO] {with_reference} out of {total_pages} ({percentage}%) pages have at least one Page Reference")
    # No explicit `del` needed: the locals are released when the function returns.

In [6]:
%%time
# Restore `wikinamedate` from IPython's %store database; it must have been
# saved beforehand (e.g. with `%store wikinamedate` in another notebook).
%store -r wikinamedate

# Run the filtering step for that dump; %%time reports the cell's duration.
process_page(wikinamedate)

[INFO] Reading ptwiki-20240720/raw.parquet
[INFO] Filtering ptwiki-20240720/raw.parquet
[INFO] 1135416 out of 1909778 (59.45%) pages have a Target Page Name
CPU times: user 1min 5s, sys: 7.74 s, total: 1min 13s
Wall time: 1min 7s
