# Web Scraping of General Enemies and Cleaning

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import concurrent.futures
import re

In [None]:
# URL della pagina da cui fare scraping
url = "https://www.mariowiki.com/List_of_enemies"

# Scaricare la pagina
response = requests.get(url)

# Controllare se la richiesta è andata a buon fine
if response.status_code == 200:
    # Analizzare l'HTML della pagina
    soup = BeautifulSoup(response.text, 'html.parser')

    # Cercare i personaggi nella lista
    items = soup.select("div.mw-parser-output ul li")

    # Creare le liste per il dataset
    enemies = []
    first_appearance = []

    # Estrarre i dati
    for item in items:
        text = item.text
        if '–' in text:  # Controlla la presenza del trattino
            name, appearance = text.split('–', 1)  # Dividi in due parti
            enemies.append(name.strip())  # Nome del personaggio
            first_appearance.append(appearance.strip())  # Prima apparizione

    # Creare un DataFrame
    df_enemies_first_appearance = pd.DataFrame({
        'Enemy': enemies,
        'First Appearance': first_appearance
    })


In [2]:
# Stampa dataset
df_enemies_first_appearance

Unnamed: 0,Enemy,First Appearance
0,Accordion Goombas,Paper Mario: Color Splash
1,Accordion Guy,Paper Mario: Sticker Star
2,Ack,Donkey Kong Country Returns
3,AckStack,Donkey Kong Country Returns
4,Aero,Super Mario RPG: Legend of the Seven Stars
...,...,...
2327,Zombie,Wario Land II
2328,Zombie Debuho,Yoshi's Crafted World
2329,Zombie Guy,Yoshi's Crafted World
2330,Zombie Shroom,Super Paper Mario


In [None]:
# URL of the page to scrape: complete list of enemies
base_url = 'https://www.mariowiki.com/List_of_enemies'

# Final dictionary to hold all datasets
enemies = {}

# Sections of interest
sections = [
    ('Super Mario series', '#Super_Mario_series'),
    ('Mario Kart series', '#Mario_Kart_series'),
    ('Mario Party series', '#Mario_Party_series'),
    ('Mario vs. Donkey Kong series', '#Mario_vs._Donkey_Kong_series'),
    ('Mario & Luigi series', '#Mario_.26_Luigi_series'),
    ('Super Smash Bros. series', '#Super_Smash_Bros._series'),
    ("Luigi's Mansion series", "#Luigi.27s_Mansion_series"),
    ('Yoshi series', '#Yoshi_series'),
    ('Paper Mario series', '#Paper_Mario_series'),
    # New sections added
    ('Donkey Kong series', '#Donkey_Kong_series'),
    ('Mario Bros. series', '#Mario_Bros._series'),
    ('Wrecking Crew series', '#Wrecking_Crew_series'),
    ('Mario Golf series', '#Mario_Golf_series'),
    ('Mario Tennis series', '#Mario_Tennis_series'),
    ('Dr. Mario series', '#Dr._Mario_series'),
    ('Super Mario Stadium series', '#Super_Mario_Stadium_series'),
    ('Mario vs Donkey Kong series', '#Mario_vs._Donkey_Kong_series'),
    ('Mario & Sonic series', '#Mario_.26_Sonic_series'),
    # Add other series if needed, matching the href from the ToC on enemy pages
]

# Headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Filter for irrelevant links
def is_enemy_link(href, title):
    if not href.startswith('/'):
        return False
    
    # Explicit exclusions of irrelevant page types
    exclusions = ['/File:', '/Category:', '/Help:', '/Special:', 
                 '/Template:', '/List_of_', '/Main_Page']
    
    for exclusion in exclusions:
        if href.startswith(exclusion):
            return False
    
    # Exclude links that are lists or categories based on the title
    if "List of" in title or "Category:" in title:
        return False

    # Title should not be the title of the current page
    if title == 'List of enemies':
        return False
        
    return True

# Function to get links to individual enemies from the list
def get_enemy_links(session, base_url):
    response = session.get(base_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
        
    # Limit search to the main content section
    main_content = soup.select_one('div.mw-parser-output')
    if not main_content:
        return []
    
    # Find all links to individual enemies
    enemy_links = []
    seen_links = set() # To avoid duplicates
    
    for link in main_content.find_all('a', href=True, title=True):
        href = link['href']
        title = link['title']
        
        if is_enemy_link(href, title) and href not in seen_links:
            full_url = 'https://www.mariowiki.com' + href
            enemy_links.append(full_url)
            seen_links.add(href)
    
    if not enemy_links:
        print(f"Warning: No enemies found on {base_url}.")
    else:
        print(f"🔎 Scraping for {len(enemy_links)} enemies from {base_url}.")

    return enemy_links

# Function that processes a single enemy page
def process_enemy_page(session, url):
    try:
        # Short pause to avoid overwhelming the server and being blocked for too many requests
        time.sleep(0.1)
        
        # Request the enemy page
        response = session.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None, None
            
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the page title (enemy name) with robust fallbacks
        # First try to find the main title element
        title_element = soup.find('span', class_='mw-page-title-main')
        page_title = title_element.text.strip() if title_element else None
        # If the main title is not found, try to find an h1 tag with specific id
        if not page_title:
            h1_tag = soup.find('h1', id='firstHeading')
            page_title = h1_tag.text.strip() if h1_tag else None
        # If still no title, try to extract from the title tag
        if not page_title and soup.title:
            page_title = soup.title.string.replace(" - Super Mario Wiki, the Mario encyclopedia", "").strip()

        # If no title is found or if it is a search results page, return None
        # This is a common case where the page might not exist or is a search result
        if not page_title or page_title == "Search results":
            return None, None
            
        # Dictionary to collect game titles for each section
        games_dict = {section_name: [] for section_name, _ in sections}
        
        # Extract the table of contents once
        toc = soup.find('div', {'id': 'toc'})
        if not toc:
            return page_title, pd.DataFrame(games_dict)
            
        # Loop to extract titles from the various sections more efficiently
        for section_name, section_link in sections:
            # Find the section in the index using the section link
            toc_anchor = toc.find('a', class_='tocsectionlink', href=section_link)
            # If the anchor is not found, try to find it by href
            if not toc_anchor:
                all_toc_links = toc.select('a')
                for link in all_toc_links:
                    if link.get('href') == section_link:
                        toc_anchor = link
                        break
                    
            # If the anchor is found, proceed to extract the games
            if toc_anchor:
                # Find the parent list item to get the section
                section_li = toc_anchor.find_parent('li')
                if section_li:
                    # Extract games subsections
                    game_entries = section_li.find_all('li', class_='toclevel-3')
                    for entry in game_entries:
                        game_title_element = entry.find('span', class_='toctext')
                        if game_title_element:
                            games_dict[section_name].append(game_title_element.text.strip())

        # Create a DataFrame
        df = pd.DataFrame({k: pd.Series(v) for k, v in games_dict.items()})
        
        return page_title, df
        
    except Exception as e:
        # In case of error, return None
        return None, None

# General function with parallel execution
def scrape_enemies():
    
    print("Obtaining links to enemy pages...")
    # Use a session to maintain connection and headers
    with requests.Session() as session:
        enemy_links = get_enemy_links(session, base_url)
        print(f"{len(enemy_links)} links to potential enemy pages found.")
        
        results = {}
        
        # Using parallel execution to speed up the process
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            # Submit all jobs
            future_to_url = {executor.submit(process_enemy_page, session, url): url for url in enemy_links}
            
            # Process the results with a progress bar
            for future in tqdm(concurrent.futures.as_completed(future_to_url), 
                              total=len(enemy_links)):
                url = future_to_url[future]
                try:
                    page_title, df = future.result()
                    if page_title and df is not None:
                        results[f"{page_title}_df"] = df
                except Exception as e:
                    tqdm.write(f"Error with {url}: {str(e)[:100]}")
    
    # Remove empty dataframes
    enemies = {k: df for k, df in results.items() if not df.empty}
    
    print("\n--- Scraping Complete ---")
    print(f"✅ Successfully processed {len(enemies)}/{len(results)} enemies ({len(enemies) / len(results):.1%})")

    return enemies

# Run the scraping function
enemies = scrape_enemies()

Obtaining links to enemy pages...
🔎 Scraping for 2403 enemies from https://www.mariowiki.com/List_of_enemies.
2403 links to potential enemy pages found.


100%|██████████| 2403/2403 [02:26<00:00, 16.44it/s]


--- Scraping Complete ---
✅ Successfully processed 252/2333 enemies (10.8%)





In [4]:
# Create a list of keys to remove
keys_to_remove = []

# Iterate through the enemies dictionary to find empty dfs
for key, df in enemies.items():
    if df.empty:
        keys_to_remove.append(key)

# Remove empty dataframes from the dictionary
for key in keys_to_remove:
    del enemies[key]

# Print the sorted list of enemy names
print("\n".join(sorted(enemies.keys())))

Albatoss_df
Amazing Flyin' Hammer Brother_df
Amp_df
Angry Sun_df
Ant Trooper_df
Bandit_df
Banzai Bill Cannon_df
Bat (Super Mario Galaxy)_df
Bee_df
Biddybud_df
Big Blooper_df
Big Boo_df
Big Buzzy Beetle_df
Big Chain Chomp_df
Big Cheep Cheep_df
Big Fire Piranha_df
Big Koopa Paratroopa_df
Big Koopa Troopa_df
Big Lava Bubble_df
Big Monty Mole_df
Big Paragoomba_df
Big Piranha Plant_df
Big Pokey_df
Big Spiny_df
Big Tail Goomba_df
Big Thwomp_df
Bill Blaster_df
Bird (Donkey Kong series)_df
Black Shy Guy_df
Block Boo_df
Blooper Baby_df
Blooper Nanny_df
Blooper_df
Blurp_df
Bob-omb Car_df
Bob-omb_df
Bomb Boo_df
Bomber Bill_df
Bomp_df
Bone Goomba_df
Bone Piranha Plant_df
Bony Beetle_df
Boo Buddies_df
Boo_df
Boomerang Bro_df
Boulder_df
Bramball_df
Broozer_df
Broque Monsieur_df
Bulber_df
Bull's-Eye Banzai_df
Bull's-Eye Bill_df
Bull's-Eye Blaster_df
Bullet Bill_df
Bully_df
Bumpty_df
Butterfly_df
Buzzy Beetle_df
Candle_df
Cataquack_df
Chain Chomp_df
Chargin' Chuck_df
Charvaargh_df
Cheep Cheep_df
Cheep

In [5]:
# Get a specific enemy dataframe
enemies['Piranha Plant_df']

Unnamed: 0,Super Mario series,Mario Kart series,Mario Party series,Mario vs. Donkey Kong series,Mario & Luigi series,Super Smash Bros. series,Luigi's Mansion series,Yoshi series,Paper Mario series,Donkey Kong series,Mario Bros. series,Wrecking Crew series,Mario Golf series,Mario Tennis series,Dr. Mario series,Super Mario Stadium series,Mario vs Donkey Kong series,Mario & Sonic series
0,Super Mario Bros.,Super Mario Kart,Mario Party,,Mario & Luigi: Superstar Saga / Mario & Luigi:...,Super Smash Bros. Ultimate,,Super Mario World 2: Yoshi's Island / Yoshi's ...,Paper Mario,,,,,,,,,
1,Super Mario Bros.: The Lost Levels,Mario Kart 64,Mario Party 2,,Mario & Luigi: Partners in Time,,,Yoshi's Story,Paper Mario: The Thousand-Year Door,,,,,,,,,
2,Super Mario Bros. 3 / Super Mario Advance 4: S...,Mario Kart: Super Circuit,Mario Party 3,,Mario & Luigi: Bowser's Inside Story / Mario &...,,,Yoshi's Island DS,Super Paper Mario,,,,,,,,,
3,Super Mario Land,Mario Kart: Double Dash!!,Mario Party 4,,Mario & Luigi: Dream Team,,,Yoshi's New Island,Paper Mario: Sticker Star,,,,,,,,,
4,Super Mario World,Mario Kart Arcade GP series,Mario Party 5,,Mario & Luigi: Paper Jam,,,Yoshi's Woolly World / Poochy & Yoshi's Woolly...,Paper Mario: Color Splash,,,,,,,,,
5,Super Mario Land 2: 6 Golden Coins,Mario Kart DS,Mario Party 6,,Mario & Luigi: Brothership,,,Yoshi's Crafted World,Paper Mario: The Origami King,,,,,,,,,
6,Super Mario 64 / Super Mario 64 DS,Mario Kart Wii,Mario Party Advance,,,,,,,,,,,,,,,
7,Super Mario Sunshine,Mario Kart 7,Mario Party 7,,,,,,,,,,,,,,,
8,New Super Mario Bros.,Mario Kart 8 / Mario Kart 8 Deluxe,Mario Party 8,,,,,,,,,,,,,,,
9,Super Mario Galaxy,Mario Kart Tour,Mario Party DS,,,,,,,,,,,,,,,


In [6]:
# Function to reshape a DataFrame with multiple columns into a single 'Title' column
def reshape_dataframe(df):
    """Trasforma un DataFrame con più colonne in una sola colonna 'Title'."""
    return pd.DataFrame({'Title': df.values.flatten()}).dropna().reset_index(drop = True)

# Apply the reshape function to each enemy DataFrame
enemies_transformed = {key: reshape_dataframe(df) for key, df in enemies.items()}

In [7]:
# Get a specific enemy dataframe
enemies_transformed['Piranha Plant_df']

Unnamed: 0,Title
0,Super Mario Bros.
1,Super Mario Kart
2,Mario Party
3,Mario & Luigi: Superstar Saga / Mario & Luigi:...
4,Super Smash Bros. Ultimate
...,...
62,Mario Party Superstars
63,Super Mario Maker / Super Mario Maker for Nint...
64,Super Mario Run
65,Super Mario Odyssey


In [8]:
# List to store the rows for the final DataFrame
rows = []

for enemie_key in enemies_transformed:
    if enemie_key.endswith('_df'):
        enemie_name = enemie_key.replace('_df', '')
        games = enemies_transformed[enemie_key]['Title'].tolist()  
        
        for game in games:
            rows.append({
                'Enemy': enemie_name,
                'Game': game,
                'Relation': 'ENEMY_IN'
            })

# Create the final DataFrame
enemies_df = pd.DataFrame(rows)

In [9]:
# Get unique enemies
pd.DataFrame(enemies_df['Enemy'].unique(), columns=['Enemy'])

Unnamed: 0,Enemy
0,Albatoss
1,Amazing Flyin' Hammer Brother
2,Amp
3,Angry Sun
4,Ant Trooper
...,...
247,Wild Ptooie Piranha
248,Whomp
249,X-Naut
250,Wiggler


In [None]:
# Save the final DataFrame to a CSV file
enemies_df.to_csv('data/enemies_df.csv', index=False)
print("✅ Data saved to 'enemies_df.csv'")
# Display the final DataFrame
enemies_df

✅ Data saved to 'enemies_df.csv'


Unnamed: 0,Enemy,Game,Relation
0,Albatoss,Yume Kōjō: Doki Doki Panic / Super Mario Bros. 2,ENEMY_IN
1,Albatoss,Super Mario All-Stars / Super Mario Advance 4:...,ENEMY_IN
2,Amazing Flyin' Hammer Brother,Super Mario World / Super Mario World: Super M...,ENEMY_IN
3,Amazing Flyin' Hammer Brother,Super Mario Advance 4: Super Mario Bros. 3,ENEMY_IN
4,Amp,Super Mario 64 / Super Mario 64 DS,ENEMY_IN
...,...,...,...
2657,Wiggler,Super Mario Bros. Wonder,ENEMY_IN
2658,Wiggler,Super Mario Party Jamboree,ENEMY_IN
2659,Zinger,Super Smash Bros. Brawl,ENEMY_IN
2660,Zinger,Super Smash Bros. for Wii U,ENEMY_IN
