# Config

## Selenium + BS4

In [1]:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import pandas as pd

import os

In [2]:
# Get the ChromeDriver path from your environment variable
chrome_driver_path = os.getenv('chrome_driver_path')

# Setup WebDriver
service = Service(chrome_driver_path)  # Use the path from environment variable

# Pokedex

## Scrape the main page

In [3]:
all_pokedex_url = "https://pokemondb.net/pokedex/all"

In [4]:
try:
    driver = webdriver.Chrome(service=service)
    driver.get(all_pokedex_url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')

except TimeoutException:
    print("Timed out waiting for cookie pop-up or other elements")
    
finally:
    # Close the browser
    driver.quit()

## Extract from Soup object

In [13]:
# Find the table body containing all Pokémon rows
table_body = soup.find('tbody')
rows = table_body.find_all('tr')  # Each row corresponds to one Pokémon

# List to store the extracted data
pokemon_data = []

# Iterate through each row
for row in rows:
    # Extract the columns
    cols = row.find_all('td')
    
    # Extract individual data points
    number = cols[0].find('span', class_='infocard-cell-data').text.strip()
    image_url = cols[0].find('img')['src']
    name = cols[1].find('a', class_='ent-name').text.strip()

    # Check for subtitle
    subtitle_tag = cols[1].find('small', class_='text-muted')
    subtitle = subtitle_tag.text.strip() if subtitle_tag else ""  # Extract text if present

    types = [t.text for t in cols[2].find_all('a')]  # Multiple types
    total = cols[3].text.strip()
    hp = cols[4].text.strip()
    attack = cols[5].text.strip()
    defense = cols[6].text.strip()
    sp_atk = cols[7].text.strip()
    sp_def = cols[8].text.strip()
    speed = cols[9].text.strip()
    
    # Append the data as a dictionary
    pokemon_data.append({
        "#": number,
        "Image URL": image_url,
        "Name": name,
        "Subtitle": subtitle,
        "Type": types,
        "Total": total,
        "HP": hp,
        "Attack": attack,
        "Defense": defense,
        "Sp. Atk": sp_atk,
        "Sp. Def": sp_def,
        "Speed": speed
    })

In [None]:
# Convert to pandas df
df = pd.DataFrame(pokemon_data)

In [20]:
# Define the relative path to the data folder
relative_path = os.path.join("data", "pokemon_main_stats_data.csv")

In [21]:
# Save the DataFrame to the relative path
df.to_csv(relative_path, index=False)

# Move List

## Scrape the move page

In [3]:
all_moves_url = "https://pokemondb.net/move/all"

In [None]:
try:
    driver = webdriver.Chrome(service=service)
    driver.get(all_moves_url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')

except TimeoutException:
    print("Timed out waiting for cookie pop-up or other elements")
    
finally:
    # Close the browser
    driver.quit()

In [None]:
# driver = webdriver.Chrome(service=service)
# driver.get(all_moves_url)

# soup = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
# driver.quit()

In [8]:
# Assuming `soup` contains the BeautifulSoup object of the webpage
moves = []

# Find all rows of the move table
rows = soup.find_all("tr")

for row in rows:
    # Extract move name
    move_name = row.find("td", class_="cell-name")
    if not move_name:  # Skip rows without a move name
        continue
    move_name = move_name.get_text(strip=True)
    
    # Extract type
    move_type = row.find("td", class_="cell-icon")
    move_type = move_type.a.get_text(strip=True) if move_type else None

    # Extract category
    category = row.find("td", class_="cell-icon text-center")
    category = category.img["title"] if category and category.img else None

    # Extract power
    power = row.find_all("td", class_="cell-num")[0].get_text(strip=True)
    power = None if power == "—" else int(power)

    # Extract accuracy
    accuracy = row.find_all("td", class_="cell-num")[1].get_text(strip=True)
    accuracy = None if accuracy == "—" else str(accuracy)

    # Extract PP
    pp = row.find_all("td", class_="cell-num")[2].get_text(strip=True)
    pp = int(pp) if pp.isdigit() else None

    # Extract effect
    effect = row.find("td", class_="cell-long-text")
    effect = effect.get_text(strip=True) if effect else None

    # Extract probability
    probability = row.find_all("td", class_="cell-num")[-1].get_text(strip=True)
    probability = None if probability == "—" else int(probability)

    # Append the extracted data
    moves.append({
        "Move Name": move_name,
        "Type": move_type,
        "Category": category,
        "Power": power,
        "Accuracy": accuracy,
        "PP": pp,
        "Effect": effect,
        "Probability (%)": probability
    })

In [9]:
# Convert to a DataFrame for easier handling
df_moves = pd.DataFrame(moves)

In [14]:
# Define the relative path to the data folder
relative_path = os.path.join("data", "pokemon_moves_data.csv")

In [15]:
# Save the DataFrame to the relative path
df_moves.to_csv(relative_path, index=False)

## Moves per Pokemon

### Test out the moves page for bulbasaur

In [3]:
bulbasaur_moves_url = "https://pokemondb.net/pokedex/bulbasaur"

In [None]:
try:
    driver = webdriver.Chrome(service=service)
    driver.get(bulbasaur_moves_url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')

except TimeoutException:
    print("Timed out waiting for cookie pop-up or other elements")
    
finally:
    # Close the browser
    driver.quit()

In [4]:
driver = webdriver.Chrome(service=service)
driver.get(bulbasaur_moves_url)

In [5]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [6]:
driver.quit()

In [10]:
# Find the table containing moves learnt by level up
table = soup.find('table', class_='data-table')

# Extract moves
moves_level_up = [row.find('td', class_='cell-name').get_text(strip=True)
         for row in table.find_all('tr')
         if row.find('td', class_='cell-name')]

In [12]:
moves_level_up

['Growl',
 'Tackle',
 'Vine Whip',
 'Growth',
 'Leech Seed',
 'Razor Leaf',
 'Poison Powder',
 'Sleep Powder',
 'Seed Bomb',
 'Take Down',
 'Sweet Scent',
 'Synthesis',
 'Worry Seed',
 'Power Whip',
 'Solar Beam']

In [16]:
# Extract moves from TM table
moves_tm = []
for row in soup.select('table.data-table tbody tr'):
    move_cell = row.select_one('.cell-name .ent-name')
    if move_cell:
        moves_tm.append(move_cell.text.strip())

In [17]:
moves_tm

['Growl',
 'Tackle',
 'Vine Whip',
 'Growth',
 'Leech Seed',
 'Razor Leaf',
 'Poison Powder',
 'Sleep Powder',
 'Seed Bomb',
 'Take Down',
 'Sweet Scent',
 'Synthesis',
 'Worry Seed',
 'Power Whip',
 'Solar Beam',
 'Curse',
 'Ingrain',
 'Petal Dance',
 'Toxic',
 'Take Down',
 'Charm',
 'Protect',
 'Acid Spray',
 'Trailblaze',
 'Facade',
 'Magical Leaf',
 'Venoshock',
 'Endure',
 'Sunny Day',
 'Bullet Seed',
 'False Swipe',
 'Body Slam',
 'Sleep Talk',
 'Seed Bomb',
 'Grass Knot',
 'Rest',
 'Swords Dance',
 'Substitute',
 'Giga Drain',
 'Energy Ball',
 'Helping Hand',
 'Grassy Terrain',
 'Grass Pledge',
 'Sludge Bomb',
 'Leaf Storm',
 'Solar Beam',
 'Tera Blast',
 'Toxic',
 'Knock Off',
 'Weather Ball',
 'Grassy Glide',
 'Double-Edge',
 'Curse',
 'Growl',
 'Tackle',
 'Vine Whip',
 'Growth',
 'Leech Seed',
 'Razor Leaf',
 'Poison Powder',
 'Sleep Powder',
 'Seed Bomb',
 'Take Down',
 'Sweet Scent',
 'Synthesis',
 'Worry Seed',
 'Double-Edge',
 'Solar Beam',
 'Amnesia',
 'Charm',
 'Curse'

In [21]:
# Find the <h2> tag related to moves learned
moves_header = soup.find('h2', string=lambda text: text and "Moves learned" in text)

# Navigate to the correct <ul> tag after the header
correct_list = moves_header.find_next('ul', class_='list-nav panel panel-nav')

# Extract the first generation
first_generation = correct_list.find('a').text.strip()

In [22]:
print(f"The first generation is: {first_generation}")

The first generation is: 1


### Test out pulling the first appeared gen for a random pokemon

In [29]:
# Try this strat for another random pokemon
intelon_url = "https://pokemondb.net/pokedex/inteleon"

In [30]:
driver = webdriver.Chrome(service=service)
driver.get(intelon_url)

In [31]:
soup_2 = BeautifulSoup(driver.page_source, 'html.parser')

In [32]:
driver.quit()

In [33]:
# Find the <h2> tag related to moves learned
moves_header = soup_2.find('h2', string=lambda text: text and "Moves learned" in text)

# Navigate to the correct <ul> tag after the header
correct_list = moves_header.find_next('ul', class_='list-nav panel panel-nav')

# Extract the first generation
first_generation = correct_list.find('a').text.strip()

In [34]:
print(f"The first generation is: {first_generation}")

The first generation is: 8
