In [None]:
!pip3 install wikipedia-api

# Get root pages from supercategories

In [None]:
import wikipediaapi
import pandas as pd 
from wikiextractor.clean import clean_markup

def get_categorymembers(categorymembers, level=0, max_level=1, root_pages=None):
    """
    Recursively collect page titles from a Wikipedia category tree.

    Sub-categories are descended into while ``level < max_level``. Every other
    member (articles, and sub-categories met at the depth limit) contributes its
    title to the result, with a leading ``Category:`` namespace prefix removed.

    :param categorymembers: A dictionary-like object whose values expose ``ns``, ``title`` and ``categorymembers`` (as returned by wikipediaapi).
    :param level: The current level of depth in category traversal. Defaults to 0.
    :param max_level: The maximum depth for category traversal. Defaults to 1.
    :param root_pages: A list the titles are appended to in place. Defaults to None, in which case a new empty list is created.

    :return: The list of collected titles (the same object as ``root_pages`` when one was passed). Titles may repeat.
    """
    category_prefix = "Category:"

    if root_pages is None:
        root_pages = []

    for member in categorymembers.values():
        if member.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
            # Still allowed to go deeper: collect the sub-category's members instead
            # of the sub-category itself.
            get_categorymembers(
                member.categorymembers,
                level=level + 1,
                max_level=max_level,
                root_pages=root_pages,
            )
            continue

        title = member.title
        # Strip only a genuine namespace prefix. Checking for "Category:" (with the
        # colon) keeps articles such as "Category theory", and slicing instead of
        # splitting on ":" keeps category names that themselves contain a colon.
        if title.startswith(category_prefix):
            title = title[len(category_prefix):]
        root_pages.append(title)

    return root_pages

def get_root_pages_from_category(category_name):
    """
    Collect the page titles found under one Wikipedia category.

    Looks up ``Category:<category_name>`` through the module-level ``wiki_wiki``
    client and hands its members to ``get_categorymembers`` with that
    function's default traversal settings.

    :param category_name: Category name without the ``Category:`` prefix.

    :return: Whatever ``get_categorymembers`` returns for the category's members.
    """
    category_title = "Category:{}".format(category_name)
    category_page = wiki_wiki.page(category_title)
    members = category_page.categorymembers
    return get_categorymembers(members)

# Example usage:

# Create the English Wikipedia client.
# NOTE(review): despite the name, API_KEY looks like a "project (contact)" string,
# presumably used by wikipediaapi as the HTTP user agent — confirm and fill in real values.
API_KEY = 'ProjName (email)'
wiki_wiki = wikipediaapi.Wikipedia(API_KEY, 'en')

# Top-level categories whose member pages are collected.
domains = [
    "Statistics",
    "Artificial intelligence",
    "Computational mathematics",
    "Numerical analysis",
    "Applied mathematics",
    "Probability",
]

# Gather the titles of every domain; the set removes titles shared between domains.
root_pages = set()
for domain in domains:
    root_page = get_root_pages_from_category(domain)
    root_pages.update(root_page)

# Get page content from all pages

In [None]:

# Download the text of every collected page, drop pages without text and save
# the result as a parquet file.
# `root_pages` is a set of strings, whose iteration order is not stable across
# kernel restarts; sorting makes the row order of the saved file reproducible.
page_titles = sorted(root_pages)
wiki_data = {
    "topic": page_titles,
    "text": [wiki_wiki.page(title).text for title in page_titles],
}

wiki_df = pd.DataFrame(wiki_data)
# Plain boolean mask instead of `query("text.str.len() != 0")`: same filter, but it
# does not depend on the expression engine supporting the `.str` accessor.
wiki_df = wiki_df[wiki_df["text"].str.len() != 0].reset_index(drop=True)
wiki_df.to_parquet('wiki_ml.parquet')

# Clean page text

In [1]:


def find_noise_section(strings):
    """
    Locate the first 'noise' section in a list of strings.

    A string counts as noise when it starts with one of the headings
    "See also.", "External links.", "References." or "Bibliography.".

    :param strings: A list of strings, typically the sections of an article.

    :return: The index of the first noise section, or -1 when there is none.
    """
    noise_prefixes = ("See also.", "External links.", "References.", "Bibliography.")
    matches = (
        position
        for position, section in enumerate(strings)
        if section.startswith(noise_prefixes)
    )
    return next(matches, -1)

def clean_text(text):
    """
    Cleans the provided markup and cuts it off at the first 'noise' section.

    Steps:
      1. Convert the markup to paragraphs with ``clean_markup`` (headers kept).
      2. Glue very short paragraphs (5 characters or fewer) onto the previous
         paragraph with a single space; the first paragraph is always kept as is.
      3. Join the paragraphs with newlines.
      4. Split on the ``"## "`` header marker and drop everything from the first
         section reported by ``find_noise_section`` onwards.

    :param text: The text to be cleaned, typically containing structured or semi-structured content.

    :return: The cleaned text without the identified noise sections. When a noise
             section is removed, the kept text ends with whatever preceded that
             section's header marker (typically a newline).
    """
    header_marker = "## "

    merged = []
    for paragraph in clean_markup(text, ignore_headers=False):
        if len(paragraph) > 5 or not merged:
            merged.append(paragraph)
        else:
            # Too short to stand alone: attach it to the preceding paragraph.
            merged[-1] = merged[-1] + " " + paragraph
    cleaned = "\n".join(merged)

    sections = cleaned.split(header_marker)
    noise_index = find_noise_section(sections)
    if noise_index == -1:
        return cleaned

    # Re-join with the exact marker used for splitting so the kept headers are
    # restored unchanged (joining with " ##" would turn "## Title" into " ##Title").
    return header_marker.join(sections[:noise_index])


# Load the stored pages, clean every text, add an `id` column equal to the row
# index and write the cleaned frame to a new parquet file.
# NOTE(review): this input file is not the 'wiki_ml.parquet' written by the earlier
# cell; presumably it holds raw MediaWiki markup produced elsewhere — confirm.
wiki_df = pd.read_parquet('./data/wiki_ml_mediawiki.parquet')
wiki_df = wiki_df.assign(
    text=wiki_df["text"].apply(clean_text),
    id=wiki_df.index,
)
wiki_df.to_parquet('./data/wiki_ml_mediawiki_cleaned.parquet')