## utils

In [7]:
import csv
import os
import requests
from bs4 import BeautifulSoup
from enum import Enum
from typing import List, Optional


class EffectType(Enum):
    """Category of an effect attached to a scraped card."""

    # Assigned by the scraper to a Pokémon effect whose header icons do not
    # include "ability".
    ATTACK = "attack"
    # Assigned by the scraper when the effect header carries an "ability" icon.
    ABILITY = "ability"
    # Used for item / supporter / Pokémon tool cards.
    TRAINER = "trainer_effect"  # one effect, no name, no cost, no damage

class Effect:
    """One effect of a card (an attack, an ability, or a trainer effect).

    This is a plain attribute container: every constructor argument is stored
    unchanged under the attribute of the same name, with no validation.
    """

    def __init__(
        self,
        effect_name: str,
        effect_type: Optional[EffectType] = None,
        effect_description: Optional[str] = None,
        effect_cost: Optional[List[str]] = None,
        effect_damage: int = 0
    ):
        # Keep the arguments exactly as given; callers may also overwrite
        # these attributes after construction.
        self.effect_name, self.effect_type = effect_name, effect_type
        self.effect_description, self.effect_cost = effect_description, effect_cost
        self.effect_damage = effect_damage

    def __repr__(self):
        # Show the enum member's name only (e.g. ATTACK), or None when unset.
        type_label = None
        if self.effect_type:
            type_label = self.effect_type.name
        parts = (
            f"effect_name={self.effect_name!r}",
            f"effect_type={type_label}",
            f"effect_cost={self.effect_cost}",
            f"effect_description={self.effect_description!r}",
            f"effect_damage={self.effect_damage}",
        )
        return "Effect(" + ", ".join(parts) + ")"


## get card links

In [11]:
def get_card_links(dex_url, card_links_file_path, timeout=30):
    """Scrape the card index table at ``dex_url`` and append new cards to a CSV.

    Each table row yields a ``card_code``, ``card_name``, ``url_detail`` and
    ``url_img``. Cards whose code is already present in the CSV are skipped,
    so the function can be re-run against the same file.

    Args:
        dex_url: URL of the page holding the card table.
        card_links_file_path: CSV file to append to; it is created (with a
            header row) on first write. Nothing is written when no new card
            is found.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
        RuntimeError: If the expected card table is not found in the page.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    # Without a timeout requests can block indefinitely on a stalled server.
    response = requests.get(dex_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    scroll_wrapper = soup.select_one(
        "div.p-archiveBody__container > "
        "div.p-archiveBody__main > "
        "div.p-archiveContent__container > "
        "div.p-archiveContent__main > "
        "div.archive-style-wrapper > "
        "div.scroll--table.table-header--fixed"
    )
    table = scroll_wrapper.find("table") if scroll_wrapper is not None else None
    if table is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError(f"Card table not found at {dex_url}")
    rows = table.find_all("tr")[1:]  # Skip header

    # Load existing card codes to avoid duplicates
    existing_codes = set()
    try:
        with open(card_links_file_path, "r", newline="", encoding="utf-8") as f:
            for existing in csv.DictReader(f):
                existing_codes.add(existing["card_code"])
    except FileNotFoundError:
        pass  # File doesn't exist yet — no entries to check against

    new_cards = []
    for row in rows:
        cols = row.find_all("td")[1:]  # the first <td> of each row is ignored
        if len(cols) < 2:
            continue  # Not a card row (too few cells)
        code_cell, name_cell = cols[0], cols[1]
        link, image_holder = name_cell.a, name_cell.div
        if link is None or image_holder is None:
            continue  # Row lacks the detail link or the image container

        card_code = code_cell.text.strip()
        if card_code in existing_codes:
            continue  # Skip duplicate entry

        new_cards.append({
            "card_code": card_code,
            "card_name": link.text.strip(),
            "url_detail": link.get('href'),
            "url_img": image_holder.get('data-image-url'),
        })
        # Also guards against the same code appearing twice on the page.
        existing_codes.add(card_code)

    if not new_cards:
        return

    # Open the output once rather than once per row.
    fieldnames = ["card_code", "card_name", "url_detail", "url_img"]
    with open(card_links_file_path, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if csvfile.tell() == 0:
            writer.writeheader()
        writer.writerows(new_cards)


# Scrape the card index page and append any new cards to the test links CSV.
get_card_links("https://game8.co/games/Pokemon-TCG-Pocket/archives/482685", "/project/data/card_links_test.csv")

ConnectionError: HTTPSConnectionPool(host='game8.co', port=443): Max retries exceeded with url: /games/Pokemon-TCG-Pocket/archives/482685 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0xffff51674e90>: Failed to resolve 'game8.co' ([Errno -3] Temporary failure in name resolution)"))

## get card details from links

In [None]:
def _append_csv_row(file_path, fieldnames, row):
    """Append ``row`` to the CSV at ``file_path``, writing a header if the file is empty."""
    # newline="" is required by the csv module so it controls line endings
    # itself (otherwise Windows output gets an extra carriage return per row).
    with open(file_path, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if csvfile.tell() == 0:
            writer.writeheader()
        writer.writerow(row)


def get_card_details(card_links_file_path, success_file_path, failed_file_path):
    """Scrape the detail page of every card listed in a links CSV.

    For each row of ``card_links_file_path`` the card's detail page
    (``url_detail``) is scraped. Successfully scraped cards are appended to
    ``success_file_path``; cards whose scrape raised any exception are
    reported on stdout and appended to ``failed_file_path`` with their
    ``card_code``, ``card_name`` and ``url_detail`` so they can be retried.

    Args:
        card_links_file_path: Input CSV with at least the columns
            ``card_code``, ``card_name`` and ``url_detail``.
        success_file_path: CSV receiving one row per scraped card.
        failed_file_path: CSV receiving one row per card that failed.
    """
    with open(card_links_file_path, "r", newline="", encoding="utf-8") as file:
        # Iterate through each card link in the CSV file
        for row in csv.DictReader(file):
            # Save the successfully scraped card details
            try:
                card = {
                    'card_code': row['card_code'],
                    'card_name': row['card_name']
                }
                scrape_card_details_from_url(card, row['url_detail'])
                # NOTE(review): each row is written with that card's own keys,
                # but the header is only written for the first card; cards
                # with a different key set will not line up with the header.
                _append_csv_row(success_file_path, card.keys(), card)

            # Save the failed card details to a separate file
            except Exception as e:
                # Deliberately broad: one bad card must not stop the batch.
                print(f"Error processing card {row['card_name']}: {e}")
                _append_csv_row(
                    failed_file_path,
                    ['card_code', 'card_name', 'url_detail'],
                    {
                        'card_code': row['card_code'],
                        'card_name': row['card_name'],
                        'url_detail': row['url_detail']
                    },
                )



def _alt_texts(cell):
    """Return the lower-cased ``alt`` text of the image inside each link of ``cell``."""
    return [tag_a.img.get("alt").lower() for tag_a in cell.find_all("a")]


def _parse_effect_cells(cells):
    """Turn the ``th``/``td`` cells of the moves table into a list of Effect objects."""
    effects = []
    effect_in_process = None
    for cell in cells:
        if cell.name == "th":
            # A header cell starts a new effect; flush the one being built.
            effect_name = cell.text.strip()
            effect_icon = _alt_texts(cell)
            is_ability = "ability" in effect_icon
            if effect_in_process:
                effects.append(effect_in_process)
            effect_in_process = Effect(
                effect_name=effect_name,
                effect_type=EffectType.ABILITY if is_ability else EffectType.ATTACK,
                # Abilities get no cost; for attacks the icon alts are the cost.
                effect_cost=None if is_ability else effect_icon,
            )
        elif cell.b:
            # Data cell with bold labels, read as "<b>Label</b>: value".
            for b in cell.find_all("b"):
                label = b.text.strip().lower()
                # Keep everything after the FIRST colon so a value that itself
                # contains a colon is not truncated; with no colon at all the
                # whole sibling text is used instead of raising IndexError.
                next_text = b.next_sibling.split(":", 1)[-1].strip() if b.next_sibling else ""
                if label == "damage":
                    effect_in_process.effect_damage = int(next_text) if next_text.isdigit() else 0
                elif label == "effect":
                    effect_in_process.effect_description = next_text
        else:
            # Unlabelled data cell: treat its text as (more) description.
            next_text = cell.text.strip()
            if next_text:
                if effect_in_process.effect_description:
                    effect_in_process.effect_description += " " + next_text
                else:
                    effect_in_process.effect_description = next_text
    if effect_in_process:
        effects.append(effect_in_process)
    return effects


def scrape_card_details_from_url(card, card_url, timeout=30):
    """Fetch a card detail page and fill ``card`` in place with what it shows.

    Attribute names are taken from the page's own label cells (lower-cased),
    so the keys added to ``card`` depend on the page. An ``'effect'`` key
    holding a list of ``Effect`` objects is always added on success.

    Args:
        card: Dict to populate; mutated in place.
        card_url: URL of the card's detail page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        RuntimeError: On any failure (network, HTTP status, or unexpected
            page structure); the original exception is attached as the cause.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
        # Without a timeout requests can block indefinitely on a stalled server.
        response = requests.get(card_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract card details
        scroll_wrapper = soup.select_one(
            "div.p-archiveBody__container > "
            "div.p-archiveBody__main > "
            "div.p-archiveContent__container > "
            "div.p-archiveContent__main > "
            "div.archive-style-wrapper"
        )

        # Find the table containing the card info
        info_heading = scroll_wrapper.find("h3", string=lambda text: text and "card info" in text.lower())
        cols = info_heading.find_next("table").find_all(["tr"])

        # Rows 0-7 alternate label row / value row; the first value is read
        # from an image's alt text, the other three from the row text.
        attr1_name = cols[0].text.strip().lower()
        attr1_value = cols[1].img.get("alt").strip() if cols[1].img else None
        attr2_name = cols[2].text.strip().lower()
        attr2_value = cols[3].text.strip()
        attr3_name = cols[4].text.strip().lower()
        attr3_value = cols[5].text.strip()
        attr4_name = cols[6].text.strip().lower()
        attr4_value = cols[7].text.strip()

        # Row 8 holds three labels, row 9 the three matching values.
        attr5_name, attr6_name, attr7_name = [th.text.lower() for th in cols[8].find_all("th")]
        cols9 = cols[9].find_all("td")
        attr5_value = cols9[0].text.strip().lower()
        attr6_value = _alt_texts(cols9[1])
        attr7_value = _alt_texts(cols9[2])

        card[attr1_name] = attr1_value
        card[attr2_name] = attr2_value
        card[attr3_name] = attr3_value
        card[attr4_name] = attr4_value
        card[attr5_name] = attr5_value
        card[attr6_name] = attr6_value
        card[attr7_name] = attr7_value
        card['effect'] = []

        # NOTE(review): assumes one of the scraped labels above is exactly
        # 'type' and that its value is non-empty — confirm against the page.
        if card['type'][0].lower() in ("item", "supporter", "pokemon tool"):
            # Handle item, supporter, or Pokémon tool: one unnamed effect whose
            # description is the text of row 11.
            trainer_effect_in_build = Effect("trainer_effect")
            trainer_effect_in_build.effect_type = EffectType.TRAINER
            trainer_effect_in_build.effect_description = cols[11].text.strip()
            card['effect'].append(trainer_effect_in_build)

        else:
            # Handle Pokémon
            ## Extract additional attributes for Pokémon cards
            attr8_name, attr9_name, attr10_name = [th.text.lower() for th in cols[10].find_all("th")]
            cols11 = cols[11].find_all("td")
            # -99 marks a first value that is missing or not purely digits.
            attr8_value = int(cols11[0].text.strip().lower()) if cols11 and cols11[0].text.strip().isdigit() else -99
            attr9_value = _alt_texts(cols11[1])
            attr10_value = cols11[2].img.get("alt").lower()

            card[attr8_name] = attr8_value
            card[attr9_name] = attr9_value
            card[attr10_name] = attr10_value

            ## Extract effects for Pokémon cards
            moves_heading = scroll_wrapper.find("h3", string=lambda text: text and "moves and abilities" in text.lower())
            elems = moves_heading.find_next("table").find_all(["th", "td"])
            card['effect'].extend(_parse_effect_cells(elems))
    except Exception as e:
        # Chain explicitly so the original traceback stays attached as the cause.
        raise RuntimeError(f"Error scraping card details from {card_url}: {e}") from e

    # Not populated: 'isEx' (would default to False) and 'isShiny' (would
    # default to False) — neither is present in the DOM snippet.

# Scrape details for every card in the links CSV; cards that fail are written to a separate CSV.
get_card_details("data/card_links.csv", "data/card_details.csv", "data/card_details_failed.csv")


## get card images from links

In [None]:
def get_card_images(card_links_file_path, image_folder_path, failed_file_path, timeout=30):
    """Download the image of every card listed in a links CSV.

    Each image is saved as
    ``<image_folder_path>/<card_code>_<card_name lower-cased>_490x683.png``.
    Rows without an image URL are reported on stdout and skipped. Any failure
    while downloading or saving is reported on stdout and the card is
    appended to ``failed_file_path`` so it can be retried.

    Args:
        card_links_file_path: Input CSV with the columns ``card_code``,
            ``card_name`` and ``url_img``.
        image_folder_path: Existing folder the images are written into.
        failed_file_path: CSV receiving one row per failed download.
        timeout: Seconds to wait for each HTTP response before giving up.
    """
    failed_fields = ['card_name', 'card_code', 'url_img']
    with open(card_links_file_path, "r", newline="", encoding="utf-8") as file:
        for row in csv.DictReader(file):
            card_code = row['card_code']
            card_name = row['card_name']
            image_url = row['url_img']
            if not image_url:
                print(f"No image URL for card {card_code}, skipping.")
                continue

            try:
                # Without a timeout requests can block indefinitely on a
                # stalled server and freeze the whole batch.
                response = requests.get(image_url, timeout=timeout)
                response.raise_for_status()

                # Save the image
                image_path = f"{image_folder_path}/{card_code}_{card_name.lower()}_490x683.png"
                with open(image_path, "wb") as img_file:
                    img_file.write(response.content)

            except Exception as e:
                # Deliberately broad: one bad image must not stop the batch.
                print(f"Failed to download image for card {card_code}: {e}")
                # newline="" lets the csv module control line endings itself
                # (otherwise Windows output gets an extra carriage return).
                with open(failed_file_path, "a", newline="", encoding="utf-8") as failed_csvfile:
                    writer = csv.DictWriter(failed_csvfile, fieldnames=failed_fields)
                    if failed_csvfile.tell() == 0:
                        writer.writeheader()
                    writer.writerow({'card_name': card_name, 'card_code': card_code, 'url_img': image_url})

# Make sure the output folder exists before images are written into it.
os.makedirs("images", exist_ok=True)
# Download the image of every card in the links CSV; failed downloads are written to a separate CSV.
get_card_images("data/card_links.csv", "images", "data/card_images_failed.csv")

## scratch

In [None]:
# item_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476271"
# supporter_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476272"
# pokemon_tool_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/496832"
# pokemon_ability_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476021"
# pokemon_two_attack_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476048"
# pokemon_descrip_damage_url = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476039"
# pokemon_promo_url1 = "https://game8.co/games/Pokemon-TCG-Pocket/archives/476295"
# pokemon_promo_url2 = "https://game8.co/games/Pokemon-TCG-Pocket/archives/519246"
# card={}

# HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# response = requests.get(pokemon_two_attack_url, headers=HEADERS)
# response.raise_for_status()
# soup = BeautifulSoup(response.text, "html.parser")

# # Extract card details
# scroll_wrapper = soup.select_one(
#     "div.p-archiveBody__container > "
#     "div.p-archiveBody__main > "
#     "div.p-archiveContent__container > "
#     "div.p-archiveContent__main > "
#     "div.archive-style-wrapper"
# )

# # Find the table containing the card info
# cols = scroll_wrapper.find("h3", string=lambda text: text and "card info" in text.lower()).find_next("table").find_all(["tr"])

# attr1_name = cols[0].text.strip().lower()
# attr1_value = cols[1].img.get("alt").strip() if cols[1].img else None
# attr2_name = cols[2].text.strip().lower()
# attr2_value = cols[3].text.strip()
# attr3_name = cols[4].text.strip().lower()
# attr3_value = cols[5].text.strip()
# attr4_name = cols[6].text.strip().lower()
# attr4_value = cols[7].text.strip()

# attr5_name, attr6_name, attr7_name = [th.text.lower() for th in cols[8].find_all("th")]
# cols9 = cols[9].find_all("td")
# attr5_value = cols9[0].text.strip().lower()
# attr6_value = [tag_a.img.get("alt").lower() for tag_a in cols9[1].find_all("a")]
# attr7_value = [tag_a.img.get("alt").lower() for tag_a in cols9[2].find_all("a")]


# card[attr1_name] = attr1_value
# card[attr2_name] = attr2_value
# card[attr3_name] = attr3_value
# card[attr4_name] = attr4_value
# card[attr5_name] = attr5_value
# card[attr6_name] = attr6_value
# card[attr7_name] = attr7_value
# card['effect'] = []

# if card['type'][0].lower() in ("item", "supporter", "pokemon tool"):
#     # Handle item, supporter, or Pokémon tool
#     print(f"Handling {card['type']} card type")
#     trainer_effect_in_build = Effect("trainer_effect")
#     trainer_effect_in_build.effect_type=EffectType.TRAINER
#     trainer_effect_in_build.effect_description= cols[11].text.strip()
#     card['effect'].append(trainer_effect_in_build)

# else:
#     # Handle Pokémon
#     ## Extract additional attributes for Pokémon cards
#     attr8_name, attr9_name, attr10_name = [th.text.lower() for th in cols[10].find_all("th")]
#     cols11 = cols[11].find_all("td")
#     attr8_value = int(cols11[0].text.strip().lower()) if cols11 and cols11[0].text.strip().isdigit() else -99
#     attr9_value = [tag_a.img.get("alt").lower() for tag_a in cols11[1].find_all("a")]
#     attr10_value = cols11[2].img.get("alt").lower()

#     card[attr8_name] = attr8_value
#     card[attr9_name] = attr9_value
#     card[attr10_name] = attr10_value
#     ## Extract effects for Pokémon cards
#     elems = scroll_wrapper.find("h3", string=lambda text: text and "moves and abilities" in text.lower()).find_next("table").find_all(["th", "td"])
#     effect_in_process = None
#     for elem in elems:
#         if elem.name == "th":
#             effect_name = elem.text.strip()
#             effect_icon = [tag_a.img.get("alt").lower() for tag_a in elem.find_all("a")]
#             effect_type = EffectType.ABILITY if "ability" in effect_icon else EffectType.ATTACK
#             effect_cost = effect_icon if "ability" not in effect_icon else None
#             if effect_in_process: card['effect'].append(effect_in_process)
#             effect_in_process = Effect(effect_name=effect_name, effect_type=effect_type, effect_cost=effect_cost)
#         elif elem.name == "td" and elem.b:
#             for b in elem.find_all("b"):
#                 label = b.text.strip().lower()
#                 next_text = b.next_sibling.split(":")[1].strip() if b.next_sibling else ""
#                 # print(f"Processing label: {label}, text: {next_text}")
#                 if label == "damage": 
#                     effect_in_process.effect_damage = int(next_text) if next_text.isdigit() else 0
#                 elif label == "effect": 
#                     effect_in_process.effect_description = next_text
#         elif elem.name == "td" and not elem.b:
#             next_text = elem.text.strip()
#             if next_text:
#                 if effect_in_process.effect_description:
#                     effect_in_process.effect_description += " " + next_text
#                 else:
#                     effect_in_process.effect_description = next_text
#     if effect_in_process:
#         card['effect'].append(effect_in_process)
    

# # for effect in card['effect']:
# #     print("*******************")
# #     print(effect)

