# Web Scraping of Characters and Games and Cleaning

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import concurrent.futures
import re

## Characters

In [None]:
# URL of the page to scrape
url = "https://www.mariowiki.com/List_of_characters"

# Lists for the dataset
character_list = []
first_appearances = []

try:
    # Download the page
    response = requests.get(url)
    
    # Check if the request was successful
    response.raise_for_status()
    
    # Parse the HTML of the page
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the main section with the characters (more specific)
    content = soup.select_one("div.mw-parser-output")
    
    if content:
        # Find the list items
        items = content.select("ul li")
        
        # Extract the data
        for item in items:
            text = item.text.strip()
            match = text.split('–', 1)
            
            if len(match) == 2:
                name = match[0].strip()
                appearance = match[1].strip()
                
                # Filter for valid elements (optional)
                if name and appearance and len(name) < 100:  # Avoid elements that are too long
                    character_list.append(name)
                    first_appearances.append(appearance)
    
    # Create a DataFrame
    df_character_first_appearance = pd.DataFrame({
        'Character': character_list,
        'First Appearance': first_appearances
    })
    
    # Show a preview of the results
    print(f"✅ {len(df_character_first_appearance)} characters found.")
    
except requests.exceptions.RequestException as e:
    print(f"❌ Error during HTTP request: {e}")
except Exception as e:
    print(f"❌ An error occurred: {e}")

✅ 2817 characters found.


In [2]:
df_character_first_appearance

Unnamed: 0,Character,First Appearance
0,? Card Toad,Mario & Luigi: Paper Jam
1,13-Amp,WarioWare Gold
2,18-Volt,WarioWare: Twisted!
3,4.1,WarioWare: Touched!
4,4.2,WarioWare: Touched!
...,...,...
2812,Zero,The Super Mario Bros. Super Show!
2813,Zess T.,Paper Mario: The Thousand-Year Door
2814,Zip Toad,Paper Mario: The Thousand-Year Door
2815,Zokket,Mario & Luigi: Brothership


In [None]:
from urllib.parse import urljoin

# It's good practice to keep the session and headers global or pass them around
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
})

def find_game_appearances_table(soup):
    """
    Implements a multi-layered strategy to find the game appearances table.
    """
    # 1. Primary Method: Find by specific, known heading IDs. This is the most reliable.
    heading_ids = ["List_of_appearances", "List_of_appearances_by_date", "List_of_game_appearances_by_date", "List_of_game_appearances"]
    for hid in heading_ids:
        heading = soup.find('span', id=hid)
        if heading:
            # Find the next sibling that is a table. This is robust.
            table = heading.find_next(('table'), class_='wikitable')
            if table:
                return table

    # 2. Secondary Method: Find by headline text if IDs fail.
    # More flexible than the original: searches h2/h3/h4 and uses a more general regex.
    search_terms = re.compile(r'\blist of\s(Super Mario\s)?(game\s)?appearance(s)?(\s?by date)?\b', re.IGNORECASE)
    for header in soup.find_all(['h2', 'h3', 'h4']):
        if header.span and search_terms.search(header.span.get_text()):
            table = header.find_next_sibling('table', class_='wikitable')
            if table:
                return table

    for table in soup.find_all('table', class_='wikitable'):
        headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
        # If a table has these columns, it's very likely the one we want.
        if 'title' in headers and ('system' in headers or 'console' in headers or 'format' in headers ) \
            and ('year' in headers or 'release' in headers or 'date' in headers):
            return table
            
    return None

def parse_game_table(table, character_name):
    """
    Manually parses the BeautifulSoup table object into a list of dictionaries.
    This is more robust than pd.read_html.
    """
    try:
        headers_raw = [th.get_text(strip=True) for th in table.find_all('th')]
        
        # --- Flexible Column Mapping ---
        # Map potential header names to our desired canonical names.
        header_map = {}
        for h in headers_raw:
            h_lower = h.lower()
            if 'title' in h_lower: header_map[h] = 'Title'
            elif 'system' in h_lower or 'console' in h_lower: header_map[h] = 'Console'
            elif 'year' in h_lower or 'release' in h_lower or 'date' in h_lower: header_map[h] = 'Year'
            elif 'description' in h_lower or 'role' in h_lower: header_map[h] = 'Role'
        
        # If we didn't find essential columns, the table is probably wrong.
        if 'Title' not in header_map.values():
            return []
            
        game_data = []
        # Find the first row of data (sometimes it's not immediately after the header)
        tbody = table.find('tbody')
        if not tbody:
            return []

        for row in tbody.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if len(cells) >= len(header_map): # Ensure row has enough cells
                row_data = {
                    headers_raw[i]: re.sub(r'\s*,\s*', ', ', cell.get_text(separator=" ", strip=True))
                    for i, cell in enumerate(cells) if i < len(headers_raw)
                }

                # Check if the row contains a field equal to the header 'Title' and skip it if it does
                if 'Title' in row_data and row_data['Title'] == 'Title':
                    continue
                
                # Build a clean dictionary using our canonical names
                game_info = {
                    'Character': character_name,
                    'Title': row_data.get(next((k for k,v in header_map.items() if v == 'Title'), ''), 'N/A'),
                    'Console': row_data.get(next((k for k,v in header_map.items() if v == 'Console'), ''), 'N/A'),
                    'Year': row_data.get(next((k for k,v in header_map.items() if v == 'Year'), ''), 'N/A'),
                    'Role': row_data.get(next((k for k,v in header_map.items() if v == 'Role'), ''), 'N/A')
                }
                # Only add rows that have a title
                if game_info['Title'] != 'N/A' and game_info['Title']:
                    game_data.append(game_info)
                    
        return game_data
    except Exception as e:
        tqdm.write(f"⚠️  Error parsing table for {character_name}: {e}")
        return []

def process_character(character_info, base_url):
    """
    Worker function for each thread. Fetches page, finds table, and parses it.
    """
    character_name = character_info['name']
    character_url = urljoin(base_url, character_info['url'])
    
    try:
        time.sleep(0.1) # Gentle rate-limiting
        response = session.get(character_url, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Use the function to find the table
        games_table = find_game_appearances_table(soup)
        
        if games_table:
            # Use the new robust parsing function
            return parse_game_table(games_table, character_name)
        return None
    except requests.exceptions.RequestException as e:
        #tqdm.write(f"❌ Request error for {character_name} ({character_url}): {e}")
        return None
    except Exception as e:
        tqdm.write(f"❌ Unexpected error for {character_name} ({character_url}): {e}")
        return None


def scrape_mario_wiki():
    """
    Main orchestrator function.
    """
    list_url = "https://www.mariowiki.com/List_of_characters"
    
    # --- Step 1: Get the list of characters more reliably ---
    try:
        response = session.get(list_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        content = soup.find('div', class_='mw-parser-output')
        if not content:
            print(f"FATAL: Could not find main content div 'mw-parser-output' on {list_url}.")
            return None, None

        characters_to_scrape = []
        seen_urls = set()

        # Iterate over list items (<li>) within unordered lists (<ul>)
        # as character entries are typically in this structure.
        # The " – " separator is key to identifying character entries.
        list_items = content.select('ul li') 

        if not list_items:
            print(f"Warning: No <li> elements found within <ul> in 'mw-parser-output' on {list_url}. Check page structure or selectors.")

        for item_li in list_items:
            # Criterion 1: The list item's text should contain " – "
            # This helps filter for "Name – Description" style entries.
            if '–' not in item_li.get_text():
                continue 

            # Criterion 2: Find the primary link within this list item.
            # This is assumed to be the character's own page link.
            # We look for an <a> tag with an href starting with '/', a title attribute,
            # and ensure it's likely the main subject of the list item.
            link_tag = item_li.find('a', href=re.compile(r'^/'), title=True) 

            if link_tag:
                href = link_tag['href']
                # Use the link's text as the character name for consistency
                char_name = link_tag.get_text(strip=True) 
                
                # Avoid administrative links, ensure the name is not empty, and avoid duplicates
                # Added more common wiki administrative prefixes
                admin_prefixes = ('/Category:', '/File:', '/Help:', '/Special:', '/Template:', '/Mario_Wiki:', '/Talk:')
                if char_name and href not in seen_urls and not href.startswith(admin_prefixes):
                    characters_to_scrape.append({'name': char_name, 'url': href})
                    seen_urls.add(href)
        
        if not characters_to_scrape:
            print(f"Warning: No characters found matching all criteria on {list_url}. Review parsing logic or page structure.")
        else:
            print(f"🔎 Scraping for {len(characters_to_scrape)} characters from {list_url}.")
            
    except requests.exceptions.RequestException as e:
        print(f"FATAL: Could not fetch main character list from {list_url}: {e}")
        return None, None
    except Exception as e:
        print(f"FATAL: An unexpected error occurred during character list scraping: {e}")
        return None, None


    # --- Step 2: Concurrently scrape each character page ---
    all_games = []
    failed_characters = []
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        # Map futures to characters to track results
        future_to_char = {executor.submit(process_character, char_info, list_url): char_info for char_info in characters_to_scrape}
        
        progress = tqdm(concurrent.futures.as_completed(future_to_char), total=len(characters_to_scrape))
        
        for future in progress:
            char_info = future_to_char[future]
            result = future.result()
            
            if result:
                all_games.extend(result)
                #tqdm.write(f"✅ Found {len(result)} games for {char_info['name']}")
            else:
                failed_characters.append(char_info['name'])
                # tqdm.write(f"❌ No valid table found for {char_info['name']}")

    # --- Step 3: Final report and DataFrame creation ---
    total_chars = len(characters_to_scrape)
    success_count = total_chars - len(failed_characters)
    
    print("\n--- Scraping Complete ---")
    print(f"✅ Successfully processed {success_count}/{total_chars} characters ({success_count/total_chars:.1%})")
    #if failed_characters:
    #    print(f"❌ Failed to find tables for: {', '.join(failed_characters[:10])}...")

    df = pd.DataFrame(all_games)

    # Create a DataFrame for failed characters
    failed_df = pd.DataFrame(failed_characters, columns=['Failed Characters'])

    return df, failed_df


In [4]:
# --- Execute the scraping ---
characters_df, failed = scrape_mario_wiki()

🔎 Scraping for 2739 characters from https://www.mariowiki.com/List_of_characters.


100%|██████████| 2739/2739 [02:26<00:00, 18.72it/s]


--- Scraping Complete ---
✅ Successfully processed 111/2739 characters (4.1%)





In [5]:
# Rename 'Title' column to 'Game' and add column 'Relation' equal to 'CHARACTER_IN'
characters_df.rename(columns={'Title': 'Game'}, inplace=True)
characters_df['Relation'] = 'CHARACTER_IN'
characters_df = characters_df[['Character', 'Game', 'Relation']].reset_index(drop=True)
characters_df

Unnamed: 0,Character,Game,Relation
0,18-Volt,WarioWare: Twisted!,CHARACTER_IN
1,18-Volt,WarioWare: Twisted!,CHARACTER_IN
2,18-Volt,WarioWare: Smooth Moves,CHARACTER_IN
3,18-Volt,Super Smash Bros. Brawl,CHARACTER_IN
4,18-Volt,WarioWare: D.I.Y. Showcase,CHARACTER_IN
...,...,...,...
3323,Yoshi,Super Mario Bros. Wonder,CHARACTER_IN
3324,Yoshi,WarioWare: Move It!,CHARACTER_IN
3325,Yoshi,Super Mario RPG,CHARACTER_IN
3326,Yoshi,Super Mario Party Jamboree,CHARACTER_IN


### Extracting the Species

In [None]:
# --- Function to extract Species from a character page ---
def extract_species_from_infobox(character_url, session):
    """
    Finds the infobox on a character page and robustly extracts a list of species,
    handling multiple entries and cleaning the text.
    """
    try:
        response = session.get(character_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        infobox = soup.find('table', class_='infobox')
        if not infobox:
            infobox = soup.find('table', style=lambda s: s and 'infobox' in s.lower())
        if not infobox:
            return None

        # Find the header row for 'Species', 'Type', or 'Race'
        species_header = infobox.find(lambda tag: tag.name in ['th', 'td'] and re.search(r'^(Species|Type|Race)$', tag.get_text(strip=True), re.IGNORECASE))
        
        if not species_header:
            return None

        # The species data is in the next table cell (td)
        species_cell = species_header.find_next_sibling('td')
        if not species_cell:
            return None
        
        # Replace <br> tags with a comma to handle line breaks as separators
        for br in species_cell.find_all("br"):
            br.replace_with(", ")

        # Get all text, which now includes commas for line breaks
        full_text = species_cell.get_text(separator=' ').strip()
        
        # Clean the text: remove parentheticals and citation brackets (e.g., [1])
        cleaned_text = re.sub(r'\s*\([^)]*\)', '', full_text)
        cleaned_text = re.sub(r'\s*\[[^\]]*\]', '', cleaned_text)
        
        # Split by common delimiters (comma, slash) and clean up each item
        species_list = [s.strip() for s in re.split(r',|/', cleaned_text) if s.strip()]
        
        return species_list if species_list else None

    except Exception:
        return None

# --- Prepare character URLs ---
base_url = "https://www.mariowiki.com"
unique_characters = characters_df['Character'].unique()
character_urls = [
    f"{base_url}/{name.replace(' ', '_').replace('&', '%26')}"
    for name in unique_characters
]

# --- Parallel scraping using ThreadPoolExecutor ---
def get_species_for_character(character_url):
    with requests.Session() as session:
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
        })
        # The function now returns a list of species or None
        return extract_species_from_infobox(character_url, session)

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # The results will be a list of lists (e.g., [['Human'], ['Koopa', 'Dragon'], None, ...])
    results = list(tqdm(
        executor.map(get_species_for_character, character_urls),
        total=len(character_urls),
        desc="Extracting Species"
    ))


# --- Add the Species column to your DataFrame ---
# For Neo4j: it is a common way to store this as a semicolon-delimited string in the CSV.
# We handle None cases by turning them into an empty string.
# Create a dictionary mapping each unique character to their species
species_map = {name: ';'.join(s) if s is not None else '' for name, s in zip(unique_characters, results)}

# Use the map to efficiently add the 'Species' column to the original DataFrame
characters_df['Species'] = characters_df['Character'].map(species_map)

Extracting Species:   0%|          | 0/111 [00:00<?, ?it/s]

Extracting Species: 100%|██████████| 111/111 [00:21<00:00,  5.19it/s]


In [48]:
characters_df

Unnamed: 0,Character,Game,Relation,Species
0,5-Volt,"WarioWare, Inc.: Mega Microgame$!",CHARACTER_IN,Human
1,5-Volt,WarioWare: Twisted!,CHARACTER_IN,Human
2,5-Volt,WarioWare: Touched!,CHARACTER_IN,Human
3,5-Volt,Game & Wario,CHARACTER_IN,Human
4,5-Volt,Super Smash Bros. for Wii U,CHARACTER_IN,Human
...,...,...,...,...
3323,Yoshi,Super Mario Bros. Wonder,CHARACTER_IN,Yoshi
3324,Yoshi,WarioWare: Move It!,CHARACTER_IN,Yoshi
3325,Yoshi,Super Mario RPG,CHARACTER_IN,Yoshi
3326,Yoshi,Super Mario Party Jamboree,CHARACTER_IN,Yoshi


In [None]:
# Save the DataFrame to a CSV file
characters_df.to_csv('data/characters_df.csv', index=False)
print("✅ Data saved to 'characters_df.csv'")
# Display the final DataFrame
characters_df

✅ Data saved to 'characters_df.csv'


Unnamed: 0,Character,Game,Relation,Species
0,5-Volt,"WarioWare, Inc.: Mega Microgame$!",CHARACTER_IN,Human
1,5-Volt,WarioWare: Twisted!,CHARACTER_IN,Human
2,5-Volt,WarioWare: Touched!,CHARACTER_IN,Human
3,5-Volt,Game & Wario,CHARACTER_IN,Human
4,5-Volt,Super Smash Bros. for Wii U,CHARACTER_IN,Human
...,...,...,...,...
3323,Yoshi,Super Mario Bros. Wonder,CHARACTER_IN,Yoshi
3324,Yoshi,WarioWare: Move It!,CHARACTER_IN,Yoshi
3325,Yoshi,Super Mario RPG,CHARACTER_IN,Yoshi
3326,Yoshi,Super Mario Party Jamboree,CHARACTER_IN,Yoshi


## Games

In [None]:
def scrape_mario_games_by_date():
    """
    Scrape the list of Mario games by date from the Mario Wiki
    
    Returns:
        DataFrame: Contains game titles, release dates, and systems/consoles
    """
    # URL for the list of games by date
    url = "https://www.mariowiki.com/List_of_games_by_date"
    
    # User agent to mimic a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }
    
    # Create a session for better performance
    session = requests.Session()
    session.headers.update(headers)
    
    # Lists to store data
    game_titles = []
    release_dates = []
    systems = []
    
    try:
        # Fetch the page content
        print(f"🔎 Scraping games data from {url}")
        response = session.get(url, timeout=10)
        response.raise_for_status()
        
        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all h2 tags (which contain the release dates)
        h2_tags = soup.select('h2')
        
        # Process each date section
        for h2 in tqdm(h2_tags, desc="Processing date sections"):
            # Skip non-date headers
            if not h2.span or not h2.span.get('id'):
                continue
                
            # Extract the date from the section heading
            date_id = h2.span.get('id')
            
            # Skip non-date sections like "References"
            if date_id in ["References", "Navigation"]:
                continue
                
            # Try to parse the date
            release_date = date_id
            
            # Find the unordered list following this heading
            game_list = h2.find_next('ul')
            
            if not game_list:
                continue
                
            # Process each game in the list
            for item in game_list.find_all('li'):
                text = item.get_text().strip()
                
                # Skip empty items
                if not text:
                    continue
                    
                # Split the game title and system/console
                #parts = re.split(r'\s+-\s+', text, maxsplit=1)
                parts = text.rsplit(' - ', 1)

                if len(parts) == 2:
                    # Remove any extra parentheses from the game title
                    game_title = re.sub(r'\s*\(.*?\)\s*', '', parts[0].strip())
                    system = parts[1].strip()

                    if system == "Nintendo Switch 2 (UPCOMING)":
                        system = "Nintendo Switch 2"
                    
                    # Add data to our lists
                    game_titles.append(game_title)
                    release_dates.append(release_date)
                    systems.append(system)
                elif len(parts) == 1 and text:
                    # Some entries might not have a system specified
                    game_titles.append(text)
                    release_dates.append(release_date)
                    systems.append("N/A")
            
            # Add a small delay between processing date sections to be nice to the server
            time.sleep(0.2)
        
        # Create a DataFrame
        df_games = pd.DataFrame({
            'Game': game_titles,
            'Year': release_dates,
            'Console': systems
        })
        
        # Sort by release date (chronologically)
        df_games['Parsed Date'] = pd.to_datetime(df_games['Year'], errors='coerce')
        df_games = df_games.sort_values('Parsed Date').reset_index(drop=True)
        # Drop the temporary 'Parsed Date' column
        df_games = df_games.drop('Parsed Date', axis=1)
        
        print(f"✅ Successfully obtained {len(df_games)} games.")
        return df_games
        
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching page: {e}")
        return pd.DataFrame()
    except Exception as e:
        print(f"❌ Error processing data: {e}")
        return pd.DataFrame()

# Execute the scraping function
games_df = scrape_mario_games_by_date()

🔎 Scraping games data from https://www.mariowiki.com/List_of_games_by_date


Processing date sections: 100%|██████████| 46/46 [00:09<00:00,  5.07it/s]

✅ Successfully obtained 428 games.





In [None]:

# Save the DataFrame to a CSV file
games_df.to_csv('data/games_data.csv', index=False)
print("✅ Data saved to 'games_data.csv'")
# Print the DataFrame
games_df

Unnamed: 0,Game,Year,Console
0,Donkey Kong,1981,Arcade
1,Crazy Kong,1981,Arcade
2,Crazy Kong Part II,1981,Arcade
3,Green House,1982,Game & Watch
4,Donkey Kong,1982,Intellivision
...,...,...,...
423,Super Mario Party Jamboree,2024,Nintendo Switch
424,Mario & Luigi: Brothership,2024,Nintendo Switch
425,Mario Kart World,2025,Nintendo Switch 2
426,Donkey Kong Country Returns HD,2025,Nintendo Switch


## Video Game Sales dataset from Kaggle

In [None]:
# I want to extract the game titles from the df_mario_games dataframe and use them to check if they are available on "data/Video_Games.csv", which is a dataset containing various video games.
# Then, if they are available, I want to get them with all their information from the Video_Games dataset.
# The column is "Game" in the df_mario_games dataframe and "Name" in the Video_Games dataset.
def check_games_in_video_games_sales_dataset(df_mario_games):
    """
    Check if Mario games are available in the Video_Games dataset and return their information.
    
    Args:
        df_mario_games (DataFrame): DataFrame containing Mario games.
        
    Returns:
        DataFrame: Contains information of Mario games found in the Video_Games dataset.
    """
    # Load the Video_Games dataset
    try:
        df_video_games = pd.read_csv("data/Video_Games.csv")
        # Nice emoji print for the message
        print(f"📊 {len(df_video_games)} entries loaded successfully from the Video_Games dataset.")
    except FileNotFoundError:
        print("❌ Video_Games dataset not found.")
        return pd.DataFrame()
    
    # Define a list of keywords
    keywords = ["Mario", "Donkey Kong", "Donkey Konga", "Luigi", "Wario", "WarioWare", "Yoshi",
                "Toad", "Wrecking Crew", "Game & Watch", "Peach", "Super Smash Bros"]
    # Create a regex pattern by joining keywords with the OR operator '|'.
    # It also ensures that each keyword is matched as a whole word.
    # re.escape is used to ensure special characters in keywords (like '&') are treated literally
    pattern = '|'.join(rf'\b{re.escape(keyword)}\b' for keyword in keywords)
    
    # Check for matches in the 'Name' column and for pattern matches as extra check
    print("🔎 Checking for matching games in the Video_Games dataset...")
    matched_games = df_video_games[(df_video_games['Name'].isin(df_mario_games['Game'])) |
                                   (df_video_games['Name'].str.contains(pattern, case=False, na=False, regex=True))]
    
    if matched_games.empty:
        print("❌ No matching games found in the Video_Games dataset.")
    else:
        print(f"✅ {len(matched_games)} matching games found.")
    
    # Remove "index" column
    if 'index' in matched_games.columns:
        matched_games = matched_games.drop(columns=['index'])

    # Return sorted DataFrame with relevant columns
    return matched_games[['Name', 'Platform', 'Year_of_Release', 'Global_Sales']].reset_index(drop=True)


sales_data = check_games_in_video_games_sales_dataset(games_df)
sales_data

📊 16928 entries loaded successfully from the Video_Games dataset.
🔎 Checking for matching games in the Video_Games dataset...
✅ 223 matching games found.


Unnamed: 0,Name,Platform,Year_of_Release,Global_Sales
0,Super Mario Bros.,NES,1985.0,40.24
1,Mario Kart Wii,Wii,2008.0,35.52
2,Tetris,GB,1989.0,30.26
3,New Super Mario Bros.,DS,2006.0,29.80
4,New Super Mario Bros. Wii,Wii,2009.0,28.32
...,...,...,...,...
218,Super Smash Bros. Melee,GC,2001.0,7.07
219,Mario Kart: Double Dash!!,GC,2003.0,6.95
220,Donkey Kong Barrel Blast,Wii,2007.0,0.62
221,Mario's Picross,GB,1995.0,0.62


In [None]:
# Rename columns, clean up data, and convert types
sales_data = sales_data.rename(columns={
    'Global_Sales': 'Sales', 
    'Year_of_Release': 'Year', 
    'Name': 'Game'}).assign(
    Sales=lambda x: pd.to_numeric(x['Sales'], errors='coerce').round(2))

In [None]:
# Save the DataFrame to a CSV file
sales_data.to_csv('data/sales_data.csv', index=False)
print("✅ Data saved to 'sales_data.csv'")
# Display the cleaned DataFrame
sales_data

Unnamed: 0,Game,Platform,Year,Sales
0,Super Mario Bros.,NES,1985.0,40.24
1,Mario Kart Wii,Wii,2008.0,35.52
2,Tetris,GB,1989.0,30.26
3,New Super Mario Bros.,DS,2006.0,29.80
4,New Super Mario Bros. Wii,Wii,2009.0,28.32
...,...,...,...,...
218,Super Smash Bros. Melee,GC,2001.0,7.07
219,Mario Kart: Double Dash!!,GC,2003.0,6.95
220,Donkey Kong Barrel Blast,Wii,2007.0,0.62
221,Mario's Picross,GB,1995.0,0.62
