## utils

In [9]:
import csv
import os
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup
from enum import Enum
from typing import List, Optional


class EffectType(Enum):
    ATTACK = "attack"
    ABILITY = "ability"
    TRAINER = "trainer_effect"  # one effect, no name, no cost, no damage

class Effect:
    def __init__(
        self,
        effect_name: str,
        effect_type: Optional[EffectType] = None,
        effect_description: Optional[str] = None,
        effect_cost: Optional[List[str]] = None,
        effect_damage: int = 0
    ):
        self.effect_name = effect_name
        self.effect_type = effect_type
        self.effect_description = effect_description
        self.effect_cost = effect_cost
        self.effect_damage = effect_damage

    def __repr__(self):
        return (
            f"Effect(effect_name={self.effect_name!r}, "
            f"effect_type={self.effect_type.name if self.effect_type else None}, "
            f"effect_cost={self.effect_cost}, "
            f"effect_description={self.effect_description!r}, "
            f"effect_damage={self.effect_damage})"
        )
    def to_dict(self):
        return {
            "effect_name": self.effect_name,
            "effect_type": self.effect_type.value if self.effect_type else None,
            "effect_description": self.effect_description,
            "effect_cost": self.effect_cost,
            "effect_damage": self.effect_damage
        }
    
    @staticmethod
    def from_dict(data):
        return Effect(
            effect_name=data.get("effect_name"),
            effect_type=EffectType(data["effect_type"]) if data.get("effect_type") else None,
            effect_description=data.get("effect_description"),
            effect_cost=data.get("effect_cost"),
            effect_damage=data.get("effect_damage", 0)
        )


In [2]:
def get_existing_cards(filepath):
    result = set()
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                result.add(row["card_code"])
    except FileNotFoundError:
        pass  # File doesn't exist yet — no entries to check against    
    return result

In [3]:
def append_to_csvfile(filepath, row_to_append):
    with open(filepath, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=row_to_append.keys())
        if csvfile.tell() == 0:
            writer.writeheader()
        writer.writerow(row_to_append)

## get card links

In [None]:
def get_card_links(dex_url, card_links_filepath):

    # Navigate to dex table
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    response = requests.get(dex_url, headers=HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    scroll_wrapper = soup.select_one(
        "div.p-archiveBody__container > "
        "div.p-archiveBody__main > "
        "div.p-archiveContent__container > "
        "div.p-archiveContent__main > "
        "div.archive-style-wrapper > "
        "div.scroll--table.table-header--fixed"
    )
    table = scroll_wrapper.find("table")
    rows = table.find_all("tr")[1:]  # Skip header

    # Load existing card codes to avoid duplicates
    existing_codes = get_existing_cards(card_links_filepath)
    for row in rows:
        cols = row.find_all("td")[1:]
        if cols[1] and cols[1].div:
            card_code = cols[0].text.strip()
            if card_code in existing_codes:
                continue  # Skip duplicate entry
            
            card ={
                "card_code": cols[0].text.strip(),
                "card_name": cols[1].a.text.strip(),
                "url_detail": cols[1].a.get('href'),
                "url_img": cols[1].div.get('data-image-url')
            }
            append_to_csvfile(card_links_filepath, card)
            existing_codes.add(card_code)


get_card_links("https://game8.co/games/Pokemon-TCG-Pocket/archives/482685", "/project/data/card_links.csv")

## get card details from links

In [4]:
def scrape_card_details_from_url(card, card_url):
    # Find the table containing the card info and get columns
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    response = requests.get(card_url, headers=HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    scroll_wrapper = soup.select_one(
        "div.p-archiveBody__container > "
        "div.p-archiveBody__main > "
        "div.p-archiveContent__container > "
        "div.p-archiveContent__main > "
        "div.archive-style-wrapper"
    )
    cols = scroll_wrapper.find("h3", string=lambda text: text and "card info" in text.lower()).find_next("table").find_all(["tr"])

    # Save common field value: card_code, card_name, rating, pack, generation, illustrator
    card["rating"] = cols[1].img.get("alt").strip() if cols[1].img else None
    card["pack"] = cols[3].text.strip()
    card["generation"] = cols[5].text.strip()
    card["illustrator"] = cols[7].text.strip()

    # Save common field value: type
    cols9 = cols[9].find_all("td")
    cols9_val0 = cols9[0].text.strip().lower()
    cols9_val1 = [tag_a.img.get("alt").lower() for tag_a in cols9[1].find_all("a")]
    cols9_val2 = [tag_a.img.get("alt").lower() for tag_a in cols9[2].find_all("a")]
    card["type"] = cols9_val1

    # Parse Trainer - item, supporter, or Pokémon tool
    if card['type'][0].lower() in ("item", "supporter", "pokemon tool"):
        # Save field value: category, rarity
        card["category"] = cols9_val0
        card["rarity"] = cols9_val2
        # Save field value: effect
        card['effect'] = []
        trainer_effect_in_build = Effect("trainer_effect")
        trainer_effect_in_build.effect_type=EffectType.TRAINER
        trainer_effect_in_build.effect_description= cols[11].text.strip()
        card['effect'].append(trainer_effect_in_build)

    # Parse Pokémon
    else:
        # Save field value: stage, weakness
        card["stage"] = cols9_val0
        card["weakness"] = cols9_val2
        # Save field value: hp, retreat_cost, rarity
        cols11 = cols[11].find_all("td")
        cols11_val0 = int(cols11[0].text.strip().lower()) if cols11 and cols11[0].text.strip().isdigit() else -99
        cols11_val1 = [tag_a.img.get("alt").lower() for tag_a in cols11[1].find_all("a")]
        cols11_val2 = cols11[2].img.get("alt").lower()
        card["hp"] = cols11_val0
        card["retreat_cost"] = cols11_val1
        card["rarity"] = cols11_val2
        # Save field value: effect
        card["effect"] = []
        elems = scroll_wrapper.find("h3", string=lambda text: text and "moves and abilities" in text.lower()).find_next("table").find_all(["th", "td"])
        effect_in_process = None
        for elem in elems:
            if elem.name == "th":
                effect_name = elem.text.strip()
                effect_icon = [tag_a.img.get("alt").lower() for tag_a in elem.find_all("a")]
                effect_type = EffectType.ABILITY if "ability" in effect_icon else EffectType.ATTACK
                effect_cost = effect_icon if "ability" not in effect_icon else None
                if effect_in_process: card['effect'].append(effect_in_process)
                effect_in_process = Effect(effect_name=effect_name, effect_type=effect_type, effect_cost=effect_cost)
            elif elem.name == "td" and elem.b:
                for b in elem.find_all("b"):
                    label = b.text.strip().lower()
                    next_text = b.next_sibling.split(":")[1].strip() if b.next_sibling else ""
                    # print(f"Processing label: {label}, text: {next_text}")
                    if label == "damage": 
                        effect_in_process.effect_damage = int(next_text) if next_text.isdigit() else 0
                    elif label == "effect": 
                        effect_in_process.effect_description = next_text
            elif elem.name == "td" and not elem.b:
                next_text = elem.text.strip()
                if next_text:
                    if effect_in_process.effect_description:
                        effect_in_process.effect_description += " " + next_text
                    else:
                        effect_in_process.effect_description = next_text
        if effect_in_process:
            card['effect'].append(effect_in_process)
    return card

In [5]:
def get_card_details(card_links_filepath, card_details_pokemon_filepath, card_details_trainer_filepath, failed_links_filepath):
    # Load existing card codes to avoid duplicates
    existing_codes = get_existing_cards(card_details_pokemon_filepath)
    existing_codes.update(get_existing_cards(card_details_trainer_filepath))
    
    with open(card_links_filepath, "r", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        # Iterate through each card link in the CSV file
        for row in reader:
            # Skip existing cards
            if(row['card_code'] in existing_codes):
                continue
            # SUCCESS - save card details
            try:
                card = {'card_code': row['card_code'],
                        'card_name': row['card_name']}
                scrape_card_details_from_url(card, row["url_detail"])
                effect_serilized = json.dumps([e.to_dict() for e in card["effect"]])
                card["effect"] = effect_serilized
                # Persist Trainer
                if card['type'][0].lower() in ("item", "supporter", "pokemon tool"):
                    append_to_csvfile(card_details_trainer_filepath, card)
                # Persist Pokemon
                else:
                    append_to_csvfile(card_details_pokemon_filepath, card)
            # FAILURE - save card links for investigation
            except Exception as e:
                print(f"Error processing card {row['card_name']}: {e}")
                card = {'card_code': row['card_code'],
                        'card_name': row['card_name'],
                        'url_detail': row['url_detail']
                       }
                append_to_csvfile(failed_links_filepath, card)
            # Include card in the parsed pool
            existing_codes.add(row["card_code"])

get_card_details("/project/data/card_links.csv", 
                 "/project/data/card_details_pokemon.csv", 
                 "/project/data/card_details_trainer.csv", 
                 "/project/data/card_details_failed.csv")


Error processing card Floatzel: 'NoneType' object has no attribute 'find_next'
Error processing card Ekans: 'NoneType' object has no attribute 'find_next'


## get card images from links

In [13]:
def get_card_images(card_links_filepath, image_folderpath, failed_links_filepath):

    # Load existing card codes to avoid duplicates
    existing_codes = set()
    pattern = re.compile(r"^(.+?)_.*?_490x683\.png$")
    for filename in os.listdir(image_folderpath):
        match = pattern.match(filename)
        if match:
            card_code = match.group(1)
            existing_codes.add(card_code)
        
    # Iterate through image links
    with open(card_links_filepath, "r", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        for row in reader:
            card_code = row["card_code"]
            card_name = row["card_name"]
            url_img = row["url_img"]
            # Skip downloaded card
            if(card_code in existing_codes):
                continue

            # SUCCESS - save image
            try:
                response = requests.get(url_img)
                response.raise_for_status()
                image_path = f"{image_folderpath}/{card_code}_{card_name.lower()}_490x683.png"
                with open(image_path, "wb") as img_file:
                    img_file.write(response.content)

            # FAILURE - save link for further investigation
            except Exception as e:
                print(f"Failed to download image for card {card_code}: {e}")
                card = {'card_name': card_name, 
                        'card_code': card_code, 
                        'url_img': url_img
                       }
                append_to_csvfile(failed_links_filepath, card)
            # Include current card in the downloaded card
            existing_codes.add(card_code)

os.makedirs("/project/data/image", exist_ok=True)
get_card_images("/project/data/card_links.csv", 
                "/project/data/image", 
                "/project/data/card_image_failed.csv")

## scratch

In [None]:
import webbrowser
from IPython.display import display, HTML

index_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/482685"
item_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476271"
supporter_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476272"
pokemon_tool_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/496832"
pokemon_ability_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476021"
pokemon_two_attack_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476048"
pokemon_descrip_damage_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476039"
pokemon_promo_url1 = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476295"
pokemon_promo_url2 = "https://game8.co/games/Pokemon-TCG-Pocket/archives/519246"

urls = [
    index_url,
    item_url,
    supporter_url,
    pokemon_tool_url,
    pokemon_ability_url,
    pokemon_two_attack_url,
    pokemon_descrip_damage_url,
    pokemon_promo_url1,
    pokemon_promo_url2
]

for url in urls:
    display(HTML(f'<a href="{url}" target="_blank">{url}</a>'))