

# Part 1: Data Collection through Web Scraping


In [None]:
import requests
import time
from bs4 import BeautifulSoup
import json

In [None]:
# Fetch robots.txt so the crawl rules (disallowed paths, crawl delay) can be
# read before any scraping starts.
# The timeout (seconds) stops this cell from hanging indefinitely if the
# server accepts the connection but never answers.
response = requests.get("https://pokemondb.net/robots.txt", timeout=30)
print(response.text)


User-agent: *
Disallow: /pokebase/search?
Disallow: /pokebase/revisions
Disallow: /pokebase/meta/search?
Disallow: /pokebase/meta/revisions
Disallow: /pokebase/rmt/search?
Disallow: /pokebase/rmt/revisions
Crawl-delay: 2

User-agent: Yandex
Crawl-delay: 30

User-agent: SindiceBot
Crawl-delay: 30

User-agent: CCBot
Crawl-Delay: 30

User-agent: wget
Disallow: /

User-agent: WebReaper
Disallow: /

User-agent: AhrefsBot
Disallow: /

Sitemap: https://pokemondb.net/static/sitemaps/pokemondb.xml
Sitemap: https://pokemondb.net/static/sitemaps/pokebase.xml
Sitemap: https://pokemondb.net/static/sitemaps/images.xml



In [None]:
# Get the main Pokémon data page
url = "https://pokemondb.net/pokedex/all"
# Without a timeout, requests waits forever on a stalled connection; 30 s is
# long enough for a slow response while still letting the cell fail.
page = requests.get(url, timeout=30)

# Ensure the request was successful
if page.status_code == 200:
    print("Page fetched successfully!")
else:
    print("Failed to retrieve the page.")


Page fetched successfully!


In [None]:
# Parse the downloaded index page and collect one (name, detail-page URL)
# pair per table row into `pokemon_urls`.
soup = BeautifulSoup(page.content, 'html.parser')

# Find the table with Pokémon information
table = soup.find("table", {"id": "pokedex"})
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("Could not find a <table id='pokedex'> in the fetched page.")
pokemon_rows = table.find_all("tr")[1:]  # Skip header row

# Extract Pokémon names and individual URLs
pokemon_urls = []
for row in pokemon_rows:
    name_cell = row.find("td", {"class": "cell-name"})
    if name_cell is None:
        continue

    link = name_cell.find("a")
    if link is None or not link.get("href"):
        # A name cell without a usable link cannot be scraped later; skip it
        # rather than crash on link["href"].
        continue

    pokemon_name = name_cell.text.strip()
    pokemon_url = link["href"]
    full_url = f"https://pokemondb.net{pokemon_url}"
    pokemon_urls.append((pokemon_name, full_url))

In [6]:
# Scrape the detail page of every (name, url) pair in `pokemon_urls` (first
# 500 entries) and collect the extracted fields in `all_pokemon_data`, keyed
# by Pokémon name. The accumulated result is written to `output_file`.
all_pokemon_data = {}
# (name, url) pairs whose page could not be downloaded, so they can be retried.
failed_pokemon = []
output_file = "all_pokemon_data.json"

# Class attribute values of the <div> containers searched for vitals tables.
target_classes = [
    "grid-col span-md-6 span-lg-4",
    "grid-col span-md-12 span-lg-4",
    "grid-col span-md-12 span-lg-8",
    "grid-col span-md-6 span-lg-12"
]

# loop for fetching individual Pokémon pages
for name, url in pokemon_urls[:500]:
    try:
        # The timeout prevents one stalled connection from blocking the whole
        # crawl; raise_for_status() keeps HTTP error pages from being parsed
        # as if they were Pokémon data.
        pokemon_page = requests.get(url, timeout=30)
        pokemon_page.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {name} ({url}): {exc}")
        failed_pokemon.append((name, url))
        time.sleep(4)  # still pause before the next request
        continue

    pokemon_soup = BeautifulSoup(pokemon_page.content, 'html.parser')

    # Add Pokémon name to the data
    pokemon_data = {"name": name}

    # Extract specific containers containing vital tables (only the first
    # vitals table inside each matching container is used).
    vital_tables = []
    for cls in target_classes:
        for container in pokemon_soup.find_all('div', class_=cls):
            vitals_table = container.find('table', class_='vitals-table')
            if vitals_table is not None:
                vital_tables.append(vitals_table)

    for vitals_table in vital_tables:
        # Identify the table by its preceding <h2>
        heading = vitals_table.find_previous('h2')
        header = heading.text.strip() if heading is not None else None

        if header == "Pokédex data":
            for row in vitals_table.find_all('tr'):
                th_cell = row.find('th')
                td_cell = row.find('td')
                if th_cell is None or td_cell is None:
                    # Rows without both a label and a value carry no data.
                    continue
                th = th_cell.text.strip()
                td = td_cell.text.strip()
                if "Abilities" in th:
                    # Abilities are stored as a list of the link texts.
                    pokemon_data["Abilities"] = [a.text.strip() for a in row.find_all('a')]
                elif "Local №" in th:
                    continue
                else:
                    pokemon_data[th] = td

        elif header in ("Training", "Breeding"):
            # Both sections are plain label -> value tables.
            for row in vitals_table.find_all('tr'):
                th_cell = row.find('th')
                td_cell = row.find('td')
                if th_cell is None or td_cell is None:
                    continue
                pokemon_data[th_cell.text.strip()] = td_cell.text.strip()

        elif header == "Base stats":
            for row in vitals_table.find_all('tr'):
                th_cell = row.find('th')
                stats = row.find_all('td', class_='cell-num')
                if th_cell is None or not stats:
                    # No label or no numeric cells: nothing to record, and
                    # stats[0] below would raise IndexError.
                    continue
                th = th_cell.text.strip()
                if len(stats) >= 3:
                    # NOTE(review): the three numeric cells are stored as
                    # lv1/min/max in page order; confirm the first one really
                    # is a level-1 value rather than the base stat.
                    pokemon_data[f"{th}_lv1"] = stats[0].text.strip()
                    pokemon_data[f"{th}_min"] = stats[1].text.strip()
                    pokemon_data[f"{th}_max"] = stats[2].text.strip()
                else:
                    pokemon_data[th] = stats[0].text.strip()

    # Extract evolution path
    evo_section = pokemon_soup.find('div', class_='infocard-list-evo')
    if evo_section is not None:
        pokemon_data["evo_path"] = [
            evo.text.strip() for evo in evo_section.find_all('a', class_='ent-name')
        ]

    # Extract moves
    # NOTE(review): the tab id is hard-coded; a page that has no element with
    # id "tab-moves-21" ends up with an empty move list.
    moves = []
    for section in pokemon_soup.find_all('div', id='tab-moves-21'):
        moves_table = section.find('table', class_='data-table')
        if moves_table is None:
            continue
        for row in moves_table.find_all('tr'):
            move_link = row.find('a')
            if move_link is not None:
                moves.append(move_link.text.strip())

    pokemon_data["moves"] = moves

    # Extract pixel images
    sprite_section = pokemon_soup.find_all('a', class_='sprite-share-link')
    pokemon_data["pixel_image_urls"] = [a['href'] for a in sprite_section if 'href' in a.attrs]

    # Add Pokémon data to the master dictionary
    all_pokemon_data[name] = pokemon_data

    # Rewrite the whole file after every Pokémon so the data gathered so far
    # is on disk if the loop is interrupted.
    with open(output_file, "w") as json_file:
        json.dump(all_pokemon_data, json_file, indent=4)

    # Pause between requests; the site's robots.txt asks for a crawl delay of 2 s.
    time.sleep(4)