In [1]:
import vector_database
import embedding
import wikipediaapi
import pandas as pd

Titles of wikipedia articles that we want to have in our database:

In [2]:
def get_articles_from_list_page(list_page):
    """
    Fetch all full articles linked from a list page, excluding redirects and section links.
    
    Args:
        list_page (wikipediaapi.WikipediaPage): The list page to process.
        wiki (wikipediaapi.Wikipedia): The Wikipedia API object.
    
    Returns:
        list: A list of article titles linked from the list page.
    """
    linked_articles = []
    for link_title, link_page in list_page.links.items():
        if link_page.ns == 0 and "#" not in link_title:
            linked_articles.append(link_title)
    return linked_articles

def get_category_members(category_name, wiki, visited_categories=None):
    """
    Recursively fetch articles and subcategories from a Wikipedia category, 
    including list pages, excluding redirects and section links.
    
    Args:
        category_name (str): The name of the Wikipedia category.
        wiki (wikipediaapi.Wikipedia): The Wikipedia API object.
        visited_categories (set): Tracks visited categories to avoid duplicates and loops.
    
    Returns:
        list: A list of article titles in the category and its subcategories.
    """
    if visited_categories is None:
        visited_categories = set()

    # Avoid re-visiting categories
    if category_name in visited_categories:
        return []
    visited_categories.add(category_name)

    category_page = wiki.page(f"Category:{category_name}")
    if not category_page.exists():
        print(f"Category '{category_name}' does not exist.")
        return []

    articles = []
    subcategories = []
    list_pages = []

    # Iterate over category members
    for member in category_page.categorymembers.values():
        if member.ns == 0:  # Namespace 0 indicates an article
            if member.title.startswith("List of"):
                list_pages.append(member)  # Treat "List of..." pages separately
            elif "#" not in member.title:  # Exclude redirects and section links
                articles.append(member.title)
        elif member.ns == 14:  # Namespace 14 indicates a subcategory
            subcategories.append(member.title.replace("Category:", ""))

    # Process list pages to extract linked articles
    for list_page in list_pages:
        articles += get_articles_from_list_page(list_page)

    # Recursively process subcategories
    for subcategory in subcategories:
        articles += get_category_members(subcategory, wiki, visited_categories)

    return articles

In [6]:
# Initialize Wikipedia API
wiki = wikipediaapi.Wikipedia("NLP WUT 2024")  # English Wikipedia

# Starting category
start_category = "Star Wars"

print(f"Fetching all articles in the '{start_category}' category...")
all_articles = get_category_members(start_category, wiki)
print(f"Found {len(all_articles)} articles.")

Fetching all articles in the 'Star Wars' category...
Found 36960 articles.


In [8]:
all_articles = list(set(all_articles))
len(all_articles)

9396

In [9]:
# sometimes we get duds and/or duplicates, let's clean that
[article for article in all_articles if 'Joruus' in article]

["Joruus C'baoth",
 'Joruus',
 "Joruus c'baoth",
 "Joruus C'Baoth",
 'Joruus cbaoth']

In [10]:
# an interesting hack I found -- there must be a better way to do this! (JS)
page = wiki.page("Joruus c'baoth")
print(page.title)
page.summary
print(page.title)

Joruus c'baoth
List of Star Wars Legends characters


In [None]:
all_articles_true_titles = []
for i in range(len(all_articles)):
    if i % 1000 == 0:
        print(f"{i}/{len(all_articles)} articles processed...")
    page = wiki.page(all_articles[i])
    page.summary # calling summary makes it forget its a redirect
    all_articles_true_titles.append(page.title)

all_articles_true_titles = list(set(all_articles_true_titles))

In [84]:
len(all_articles_true_titles)

4944

In [None]:
pd.DataFrame({"page": all_articles_true_titles}).to_csv("allpages.csv")

In [3]:
page_titles = pd.read_csv("allpages.csv")['page']

In [13]:
def any_star_wars_category(page_title):
    for category in wiki.page(page_title).categories.keys():
        if 'star wars' in category.lower():
            return True

    return False

In [14]:
sw_page_titles = [page_title for page_title in page_titles if any_star_wars_category(page_title)]

In [15]:
len(sw_page_titles)

744

In [16]:
def get_text_from_metadata(metadata_singular):
    section = wiki.page(metadata_singular['page_title']).\
            section_by_title(metadata_singular['section_title'])
    if metadata_singular['subsection_title'] != "":
        text = section.section_by_title(metadata_singular['subsection_title']).full_text()
    else:
        text = section.text
    return text

Database name should end in "db", so then it is ignored by git:

In [17]:
database_name = "wiki_db"

This cell may take some time:

In [18]:
embeddings = []
metadata = []

# initialize empty - no folder exists yet
db = vector_database.VectorDatabaseWraper()

i = 0
for page_title in sw_page_titles:
    if i % 1000 == 0:
        print(f"{i}/{len(sw_page_titles)} articles processed...")
    i += 1
    page = wiki.page(page_title)
    page_title = page.title
    try:
        for section in page.sections:
            if section._level == 1:
                section_title = section.title
                metadata.append({
                    "page_title": page_title,
                    "section_title": section_title,
                    "subsection_title": ""
                })
            for subsection in section.sections:
                if subsection._level == 2:
                    subsection_title = subsection.title
                    metadata.append({
                        "page_title": page_title,
                        "section_title": section_title,
                        "subsection_title": subsection_title
                    })
    except:
        print(f"Error while transforming page {page_title}")
    

0/744 articles processed...
Error while transforming page List of Star Wars creatures


Check if subsections were loaded correctly. Expected output:
```python
    {'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': ''}
    {'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Original trilogy'}
    {'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Prequel trilogy'}
    {'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Sequel trilogy'}
    {'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Star Wars Anthology'}
    {'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Star Wars: The Clone Wars'}
    {'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Star Wars Rebels'}
    {'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Obi-Wan Kenobi'}
    {'page_title': 'The Imperial March', 'section_title': 'Uses outside Star Wars', 'subsection_title': ''}
    {'page_title': 'The Imperial March', 'section_title': 'Inspiration and influences', 'subsection_title': ''}
    {'page_title': 'The Imperial March', 'section_title': 'See also', 'subsection_title': ''}
    {'page_title': 'The Imperial March', 'section_title': 'References', 'subsection_title': ''}
    {'page_title': 'The Imperial March', 'section_title': 'External links', 'subsection_title': ''}
```


In [22]:
for dic in metadata:
    if dic["page_title"] == sw_page_titles[2]:
        print(dic)

{'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': ''}
{'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Original trilogy'}
{'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Prequel trilogy'}
{'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Sequel trilogy'}
{'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Star Wars Anthology'}
{'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Star Wars: The Clone Wars'}
{'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Star Wars Rebels'}
{'page_title': 'The Imperial March', 'section_title': 'Use in Star Wars', 'subsection_title': 'Obi-Wan Kenobi'}
{'page_title': 'The Imperial March', 'section_title': 'Uses outside Star Wars', 'subsection_title

In [23]:
i = 0
fail_count = 0
for metadata_singular in metadata:
    # check if the section is already in the databases
    if i % 1000 == 0:
        print(f"{i}/{len(metadata)} sections/subsections processed...")
    i += 1
    if db.has_record(metadata_singular):
        print(f"Skipping {metadata_singular}")
        continue
    try:
        section_text = get_text_from_metadata(metadata_singular)
        embeddings.extend(embedding.embedding([section_text]))
    except:
        print(f"Loading failed for {metadata_singular}")
        fail_count +=1

print(f"Failures: {fail_count}")

0/7806 sections/subsections processed...
Loading failed for {'page_title': 'Star Wars Miniatures Battles', 'section_title': 'Reception', 'subsection_title': 'Reviews'}
Loading failed for {'page_title': 'Red Letter Media', 'section_title': 'Web series', 'subsection_title': "Mr. Plinkett's Reviews (2008–2020)"}
Loading failed for {'page_title': 'Red Letter Media', 'section_title': 'Web series', 'subsection_title': 'Half in the Bag (2011–present)'}
Loading failed for {'page_title': 'Red Letter Media', 'section_title': 'Web series', 'subsection_title': 'Best of the Worst (2013–present)'}
Loading failed for {'page_title': 'Red Letter Media', 'section_title': 'Web series', 'subsection_title': 're:View (2016–present)'}
Loading failed for {'page_title': 'Red Letter Media', 'section_title': 'Web series', 'subsection_title': 'The Nerd Crew: A Pop Culture Podcast (2017–present)'}
Loading failed for {'page_title': 'Red Letter Media', 'section_title': 'Web series', 'subsection_title': 'Previously R

In [225]:
metadata_singular

{'page_title': 'List of Disney+ original programming',
 'section_title': 'Regional original programming',
 'subsection_title': 'Co-productions'}

In [24]:
db.add(embeddings, metadata)
db.save(database_name)