# Web Scraping of Bosses and Cleaning

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import concurrent.futures
import re

In [None]:
# URL of the page to scrape
url = "https://www.mariowiki.com/List_of_bosses"

# Get the page content
response = requests.get(url)

# Controllare se la richiesta è andata a buon fine
if response.status_code == 200:
    # Analizzare l'HTML della pagina
    soup = BeautifulSoup(response.text, 'html.parser')

    # Cercare i personaggi nella lista
    items = soup.select("div.mw-parser-output ul li")

    # Creare le liste per il dataset
    bosses = []
    first_appearance = []

    # Estrarre i dati
    for item in items:
        text = item.text
        if '–' in text:  # Controlla la presenza del trattino
            name, appearance = text.split('–', 1)  # Dividi in due parti
            if '\n' in appearance:
                # Se ci sono più righe, prendi solo la prima
                appearance = appearance.split('\n', 1)[0]
            bosses.append(name.strip())  # Nome del personaggio
            first_appearance.append(appearance.strip())  # Prima apparizione

    # Creare un DataFrame
    df_bosses_first_appearance = pd.DataFrame({
        'Boss': bosses,
        'First Appearance': first_appearance
    })


In [None]:
# URL of the page to scrape
url = "https://www.mariowiki.com/List_of_bosses"

# Lists for the dataset
bosses = []
first_appearances = []

try:
    # Download the page
    response = requests.get(url)
    
    # Check if the request was successful
    response.raise_for_status()
    
    # Parse the HTML of the page
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the main section with the bosses (more specific)
    content = soup.select_one("div.mw-parser-output")
    
    if content:
        # Find the list items
        items = content.select("ul li")
        
        # Extract the data
        for item in items:
            text = item.text.strip()
            # Check if the text contains a dash (indicating name and appearance)
            # and split it into name and first appearance
            if '–' in text:  
                name, appearance = text.split('–', 1)
                if '\n' in appearance:
                    # If there are multiple lines, take only the first
                    appearance = appearance.split('\n', 1)[0]
                # Filter for valid elements, avoiding too long elements and empty strings
                if name and appearance and len(name) < 100:
                    bosses.append(name.strip())
                    first_appearances.append(appearance.strip())
    
    # Create a DataFrame
    df_bosses_first_appearance = pd.DataFrame({
        'Boss': bosses,
        'First Appearance': first_appearances
    })
    
    # Show a preview of the results
    print(f"✅ {len(bosses)} bosses found.")
    
except requests.exceptions.RequestException as e:
    print(f"❌ Error during HTTP request: {e}")
except Exception as e:
    print(f"❌ An error occurred: {e}")

✅ 756 bosses found.


In [68]:
# Stampa dataset
df_bosses_first_appearance

Unnamed: 0,Boss,First Appearance
0,Aerodent,Wario Land 4
1,Amadeus Wolfgeist,Luigi's Mansion 3
2,Ancient Poltergeist,Luigi's Mansion: Dark Moon
3,Anonster,Wario Land 3
4,Antasma,Mario & Luigi: Dream Team
...,...,...
751,Yellow Devil,Super Smash Bros. for Nintendo 3DS / Wii U
752,Zeekeeper,Mario & Luigi: Dream Team
753,Zeekeeper X,Mario & Luigi: Dream Team
754,Zombone,Super Mario RPG: Legend of the Seven Stars


In [None]:
# URL of the page to scrape: complete list of bosses
base_url = 'https://www.mariowiki.com/List_of_bosses'

# Final dictionary to hold all datasets
bosses = {}

# Sections of interest
sections = [
    ('Super Mario series', '#Super_Mario_series'),
    ('Mario Kart series', '#Mario_Kart_series'),
    ('Mario Party series', '#Mario_Party_series'),
    ('Mario vs. Donkey Kong series', '#Mario_vs._Donkey_Kong_series'),
    ('Mario & Luigi series', '#Mario_.26_Luigi_series'),
    ('Super Smash Bros. series', '#Super_Smash_Bros._series'),
    ("Luigi's Mansion series", "#Luigi.27s_Mansion_series"),
    ('Yoshi series', '#Yoshi_series'),
    ('Paper Mario series', '#Paper_Mario_series'),
    # New sections added
    ('Donkey Kong series', '#Donkey_Kong_series'),
    ('Mario Bros. series', '#Mario_Bros._series'),
    ('Wrecking Crew series', '#Wrecking_Crew_series'),
    ('Mario Golf series', '#Mario_Golf_series'),
    ('Mario Tennis series', '#Mario_Tennis_series'),
    ('Dr. Mario series', '#Dr._Mario_series'),
    ('Super Mario Stadium series', '#Super_Mario_Stadium_series'),
    ('Mario vs Donkey Kong series', '#Mario_vs._Donkey_Kong_series'),
    ('Mario & Sonic series', '#Mario_.26_Sonic_series'),
    # Add other series if needed, matching the href from the ToC on bosses pages
    ]

# Headers to simulate a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Filter for irrelevant links
def is_boss_link(href, title):
    if not href.startswith('/'):
        return False
    
    # Explicit exclusions of irrelevant page types
    exclusions = ['/File:', '/Category:', '/Help:', '/Special:', 
                 '/Template:', '/List_of_', '/Main_Page']
    
    for exclusion in exclusions:
        if href.startswith(exclusion):
            return False
    
    # Title should not be the title of the current page
    if title == 'List of bosses':
        return False
        
    return True

# Function to get links to individual bosses from the list
def get_boss_links(base_url):
    response = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Limit search to the main content section
    main_content = soup.select_one('div.mw-parser-output')
    if not main_content:
        return []
    
    # Find all links to individual bosses
    boss_links = []
    seen_links = set() # To avoid duplicates
    
    # Iterate over list items (<li>) within unordered lists (<ul>)
    # as bosses entries are typically in this structure.
    # The " – " separator is key to identifying bosses entries.
    list_items = main_content.select('ul li') 

    if not list_items:
        print(f"Warning: No <li> elements found within <ul> in 'mw-parser-output' on {base_url}. Check page structure or selectors.")

    for item_li in list_items:
        # Criterion 1: The list item's text should contain " – "
        # This helps filter for "Name – Description" style entries.
        if '–' not in item_li.get_text():
            continue 

        # Criterion 2: Find the primary link within this list item.
        # This is assumed to be the boss' own page link.
        # We look for an <a> tag with an href starting with '/', a title attribute,
        # and ensure it's likely the main subject of the list item.
        link_tag = item_li.find('a', href=re.compile(r'^/'), title=True)   

        if link_tag:
            href = link_tag['href']
            # Use the link's text as the boss name for consistency
            char_name = link_tag.get_text(strip=True) 
            
            # Avoid administrative links, ensure the name is not empty, and avoid duplicates
            # Added more common wiki administrative prefixes
            admin_prefixes = ('/Category:', '/File:', '/Help:', '/Special:', '/Template:', '/Mario_Wiki:', '/Talk:')
            if char_name and href not in seen_links and not href.startswith(admin_prefixes):
                full_url = 'https://www.mariowiki.com' + href
                boss_links.append(full_url)
                seen_links.add(href)

    if not boss_links:
        print(f"Warning: No bosses found on {base_url}.")
    else:
        print(f"🔎 Scraping for {len(boss_links)} bosses from {base_url}.")
    
    return boss_links

# Function that processes a single boss page
def process_boss_page(url):
    try:
        # Short pause to avoid overwhelming the server and being blocked for too many requests
        time.sleep(0.1)
        
        # Request the boss page
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None, None
            
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the page title (boss name)
        title_element = soup.find('span', class_='mw-page-title-main')
        if not title_element:
            return None, None
            
        page_title = title_element.text
        
        # Dictionary to collect game titles for each section
        games_dict = {section_name: [] for section_name, _ in sections}
        
        # Extract the table of contents once
        toc = soup.find('div', {'id': 'toc'})
        if not toc:
            return page_title, pd.DataFrame(games_dict)
            
        # Loop to extract titles from the various sections more efficiently
        for section_name, section_link in sections:
            section_id = section_link.lstrip('#')
            
            # Find the section in the index
            section_li = None
            for li in toc.find_all('li', class_='toclevel-2'):
                if section_id in str(li):
                    section_li = li
                    break
                    
            if section_li:
                # Extract games subsections
                game_entries = section_li.find_all('li', class_='toclevel-3')
                for entry in game_entries:
                    game_title_element = entry.find('span', class_='toctext')
                    if game_title_element:
                        games_dict[section_name].append(game_title_element.text)

        # Create a DataFrame
        df = pd.DataFrame({k: pd.Series(v) for k, v in games_dict.items()})
        
        return page_title, df
        
    except Exception as e:
        # In case of error, return None
        return None, None

# General function with parallel execution
def scrape_bosses(base_url):
    
    print("Obtaining links to boss pages...")
    # Get all links to boss pages
    boss_links = get_boss_links(base_url)
    print(f"{len(boss_links)} links to potential boss pages found.")
    
    results = {}
    
    # Using parallel execution to speed up the process
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all jobs
        future_to_url = {executor.submit(process_boss_page, url): url for url in boss_links}
        
        # Process the results with a progress bar
        for future in tqdm(concurrent.futures.as_completed(future_to_url), 
                          total=len(boss_links)):
            url = future_to_url[future]
            try:
                page_title, df = future.result()
                if page_title and df is not None:
                    results[f"{page_title}_df"] = df
            except Exception as e:
                tqdm.write(f"Error with {url}: {str(e)[:100]}")
    
    # Remove empty dataframes
    bosses = {k: df for k, df in results.items() if not df.empty}
    
    print("\n--- Scraping Complete ---")
    print(f"✅ Successfully processed {len(bosses)}/{len(results)} bosses ({len(bosses) / len(results):.1%})")
    
    return bosses

# Run the scraping function
bosses = scrape_bosses(base_url)

Obtaining links to boss pages...
🔎 Scraping for 749 bosses from https://www.mariowiki.com/List_of_bosses.
749 links to potential boss pages found.


100%|██████████| 749/749 [01:05<00:00, 11.43it/s]


--- Scraping Complete ---
✅ Successfully processed 93/703 bosses (13.2%)





In [70]:
# Create a list of keys to remove
keys_to_remove = []

# Iterate through the bosses dictionary to find empty dfs
for key, df in bosses.items():
    if df.empty:
        keys_to_remove.append(key)

# Remove empty dataframes from the dictionary
for key in keys_to_remove:
    del bosses[key]

# Print the sorted list of boss names
print("\n".join(sorted(bosses.keys())))


Antasma_df
Big Blooper_df
Big Boo (character)_df
Big Boo_df
Big Buzzy Beetle_df
Big Chain Chomp_df
Big Cheep Cheep_df
Big Fire Piranha_df
Big Lava Bubble_df
Big Monty Mole_df
Big Spiny_df
Birdo_df
Black Shy Guy_df
Blooper_df
Boo Buddies_df
Boolossus_df
Boom Boom_df
Boomerang Bro_df
Boss Sumo Bro_df
Bowser Jr._df
Bowser Statue_df
Bowser's Castle_df
Cackletta_df
Chain Chomp_df
Chargin' Chuck_df
Cheep Cheep_df
Cheep Chomp_df
Clawgrip_df
Diddy Kong_df
Dino Piranha_df
Don Bongo_df
Dry Bones_df
Dry Bowser_df
Fawful_df
Fire Snake_df
Foreman Spike_df
Fracktail_df
Freezie_df
Fryguy_df
Giant Bowser_df
Giga Bowser_df
Goomba Tower_df
Goomboss_df
Gooper Blooper_df
Hammer Bro_df
Iggy_df
Impostor Bowser_df
Jr. Troopa_df
Kammy Koopa_df
King Bob-omb_df
King Boo_df
King K. Rool_df
Lakilester_df
Lakitu_df
Larry_df
Lemmy_df
Link_df
Ludwig_df
Magikoopa_df
Mechakoopa_df
Mega Dry Bones_df
Mega Goomba_df
Mega Rocky Wrench_df
Meowser_df
Metal Mario_df
Monty Mole_df
Morton_df
Mouser_df
Mr. Blizzard_df
Petey Pir

In [71]:
# Get a specific boss dataframe
bosses['Wiggler_df']

Unnamed: 0,Super Mario series,Mario Kart series,Mario Party series,Mario vs. Donkey Kong series,Mario & Luigi series,Super Smash Bros. series,Luigi's Mansion series,Yoshi series,Paper Mario series,Donkey Kong series,Mario Bros. series,Wrecking Crew series,Mario Golf series,Mario Tennis series,Dr. Mario series,Super Mario Stadium series,Mario vs Donkey Kong series,Mario & Sonic series
0,Super Mario World / Super Mario World: Super M...,Mario Kart: Double Dash!!,Mario Party 2,,Mario & Luigi: Superstar Saga / Mario & Luigi:...,,,,,,,,,,,,,
1,Super Mario 64 / Super Mario 64 DS,Mario Kart DS,Mario Party 3,,Mario & Luigi: Partners in Time,,,,,,,,,,,,,
2,Super Mario Sunshine,Mario Kart Wii,Mario Party 5,,Mario & Luigi: Bowser's Inside Story / Mario &...,,,,,,,,,,,,,
3,Super Mario Advance 4: Super Mario Bros. 3,Mario Kart 7,Mario Party 6,,Mario & Luigi: Dream Team,,,,,,,,,,,,,
4,New Super Mario Bros.,Mario Kart 8 / Mario Kart 8 Deluxe,Mario Party 7,,Mario & Luigi: Paper Jam,,,,,,,,,,,,,
5,Super Mario Galaxy,Mario Kart Tour,Mario Party 8,,Mario & Luigi: Brothership,,,,,,,,,,,,,
6,New Super Mario Bros. Wii,Mario Kart World,Mario Party DS,,,,,,,,,,,,,,,
7,Super Mario Galaxy 2,,Mario Party 9,,,,,,,,,,,,,,,
8,New Super Mario Bros. 2,,Mario Party: Island Tour,,,,,,,,,,,,,,,
9,New Super Mario Bros. U / New Super Luigi U / ...,,Mario Party 10,,,,,,,,,,,,,,,


In [72]:
# Function to reshape a DataFrame with multiple columns into a single 'Title' column
def reshape_dataframe(df):
    return pd.DataFrame({'Title': df.values.flatten()}).dropna().reset_index(drop = True)

# Apply the reshape function to each boss DataFrame
bosses_transformed = {key: reshape_dataframe(df) for key, df in bosses.items()}

In [73]:
# Get a specific boss dataframe
bosses_transformed['Wiggler_df']

Unnamed: 0,Title
0,Super Mario World / Super Mario World: Super M...
1,Mario Kart: Double Dash!!
2,Mario Party 2
3,Mario & Luigi: Superstar Saga / Mario & Luigi:...
4,Super Mario 64 / Super Mario 64 DS
5,Mario Kart DS
6,Mario Party 3
7,Mario & Luigi: Partners in Time
8,Super Mario Sunshine
9,Mario Kart Wii


In [74]:
# List to store the rows for the final DataFrame
rows = []

for boss_key in bosses_transformed:
    if boss_key.endswith('_df'):
        boss_name = boss_key.replace('_df', '')
        games = bosses_transformed[boss_key]['Title'].tolist() 
        
        for game in games:
            rows.append({
                'Boss': boss_name,
                'Game': game,
                'Relation': 'BOSS_IN'
            })

# Create the final DataFrame
bosses_df = pd.DataFrame(rows)

### Extracting the Species

In [None]:
# --- Function to extract Species from a character page ---
def extract_species_from_infobox(character_url, session):
    """
    Finds the infobox on a character page and robustly extracts a list of species,
    handling multiple entries and cleaning the text.
    """
    try:
        response = session.get(character_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        infobox = soup.find('table', class_='infobox')
        if not infobox:
            infobox = soup.find('table', style=lambda s: s and 'infobox' in s.lower())
        if not infobox:
            return None

        # Find the header row for 'Species', 'Type', or 'Race'
        species_header = infobox.find(lambda tag: tag.name in ['th', 'td'] and re.search(r'^(Species|Type|Race)$', tag.get_text(strip=True), re.IGNORECASE))
        
        if not species_header:
            return None

        # The species data is in the next table cell (td)
        species_cell = species_header.find_next_sibling('td')
        if not species_cell:
            return None
        
        # Replace <br> tags with a comma to handle line breaks as separators
        for br in species_cell.find_all("br"):
            br.replace_with(", ")

        # Get all text, which now includes commas for line breaks
        full_text = species_cell.get_text(separator=' ').strip()
        
        # Clean the text: remove parentheticals and citation brackets (e.g., [1])
        cleaned_text = re.sub(r'\s*\([^)]*\)', '', full_text)
        cleaned_text = re.sub(r'\s*\[[^\]]*\]', '', cleaned_text)
        
        # Split by common delimiters (comma, slash) and clean up each item
        species_list = [s.strip() for s in re.split(r',|/', cleaned_text) if s.strip()]
        
        return species_list if species_list else None

    except Exception:
        return None

# --- Prepare character URLs ---
base_url = "https://www.mariowiki.com"
unique_characters = bosses_df['Boss'].unique()
character_urls = [
    f"{base_url}/{name.replace(' ', '_').replace('&', '%26')}"
    for name in unique_characters
]

# --- Parallel scraping using ThreadPoolExecutor ---
def get_species_for_character(character_url):
    with requests.Session() as session:
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
        })
        # The function now returns a list of species or None
        return extract_species_from_infobox(character_url, session)

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # The results will be a list of lists (e.g., [['Human'], ['Koopa', 'Dragon'], None, ...])
    results = list(tqdm(
        executor.map(get_species_for_character, character_urls),
        total=len(character_urls),
        desc="Extracting Species"
    ))


# --- Add the Species column to the DataFrame ---
# For Neo4j: it is a common way to store this as a semicolon-delimited string in the CSV.
# We handle None cases by turning them into an empty string.
# Create a dictionary mapping each unique character to their species
species_map = {name: ';'.join(s) if s is not None else '' for name, s in zip(unique_characters, results)}

# Use the map to efficiently add the 'Species' column to the original DataFrame
bosses_df['Species'] = bosses_df['Boss'].map(species_map)

Extracting Species:   0%|          | 0/93 [00:00<?, ?it/s]

Extracting Species: 100%|██████████| 93/93 [00:19<00:00,  4.82it/s]


In [None]:
# Save the DataFrame to a CSV file
bosses_df.to_csv('data/bosses_df.csv', index=False)
print("✅ Data saved to 'bosses_df.csv'")
# Display the final DataFrame
bosses_df

✅ Data saved to 'bosses_df.csv'


Unnamed: 0,Boss,Game,Relation,Species
0,Antasma,Mario & Luigi: Dream Team,BOSS_IN,Bat
1,Antasma,Mario & Luigi: Paper Jam,BOSS_IN,Bat
2,Black Shy Guy,Mario Kart DS,BOSS_IN,
3,Black Shy Guy,Paper Mario,BOSS_IN,
4,Black Shy Guy,Mario Kart 8 / Mario Kart 8 Deluxe,BOSS_IN,
...,...,...,...,...
1317,Wiggler,Mario Party: The Top 100,BOSS_IN,
1318,Wiggler,Super Mario Maker 2,BOSS_IN,
1319,Wiggler,Super Mario Party,BOSS_IN,
1320,Wiggler,Super Mario Bros. Wonder,BOSS_IN,
