### Preamble (required)

In [41]:
from bs4 import BeautifulSoup
import requests
from IPython.display import display, Image
import tempfile
import os
import json
from dotenv import load_dotenv
from pymongo import MongoClient

# Load environment variables from ../.env (path is relative to the working directory).
load_dotenv("../.env")

DUMP_DIR = "./out"  # directory that receives the scraped JSON dump
OUTPUT_DIR = "./txt"  # directory holding the shop / shopless event URL lists

### Create scraper with sanity test

In [None]:
### TODO: scrape every event shop.
###       Only keep the events with a viable shop (debug: search for the keyword "item shop" to check for compatibility).
###       Put bad entries in a list txt file and skip them in the future.
###       Serverless function? Python (reality: just rewrite in JS)
# https://azurlane.koumakan.jp/wiki/Events#Limited_Event_List


_WIKI_BASE_URL = "https://azurlane.koumakan.jp"
_REQUEST_TIMEOUT_S = 30  # seconds; without a timeout a stalled request blocks forever


def _extract_period(rows, header_index):
    """Return (start, end) text from the two rows after a server header row, or None."""
    if header_index + 2 >= len(rows):
        return None
    start_cells = rows[header_index + 1].find_all("td")
    end_cells = rows[header_index + 2].find_all("td")
    # The date is read from the second <td> of each row; bail out if the row is shorter.
    if len(start_cells) < 2 or len(end_cells) < 2:
        return None
    return start_cells[1].text.strip(), end_cells[1].text.strip()


def _parse_shop_item(frame):
    """Turn one ``item-frame`` div into a dict, or None if an expected element is missing."""
    name_div = frame.find("div", class_="item-name")
    stock_div = frame.find("div", class_="item-stock")
    price_div = frame.find("div", class_="item-price")
    image_frame = frame.find("div", class_="item-image-frame")
    img = image_frame.find("img") if image_frame is not None else None
    if any(part is None for part in (name_div, stock_div, price_div, img)):
        return None

    image_url = img.get("src")
    if not image_url:
        return None
    # Relative image paths are resolved against the wiki host.
    if not image_url.startswith("http"):
        image_url = _WIKI_BASE_URL + image_url

    return {
        "name": name_div.text.strip(),
        "stock": stock_div.text.strip().replace("Available:", "").strip(),
        "price": price_div.text.strip(),
        "image_url": image_url,
    }


def al_scraper(url, show=True):
    """Scrape one event page for its event dates and shop items.

    Args:
        url: Address of the event page to fetch.
        show: When True, print the dates and every shop item and display each
            item's image inline (downloads the image via ``download_image``).

    Returns:
        ``False`` when the page contains no parsable shop items; otherwise a dict
        ``{"dates": {...}, "shop_items": [...]}`` where ``dates`` may hold
        ``"en_server"`` / ``"all_server"`` entries with ``"start"`` and ``"end"``
        strings, and each shop item has ``"name"``, ``"stock"``, ``"price"`` and
        ``"image_url"``.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_S)
    soup = BeautifulSoup(response.text, "html.parser")

    # Locate the "EN Servers" / "All Servers" header rows in the event infobox;
    # the start and end dates live in the two rows that follow each header.
    periods = {}
    event_table = soup.find("table", class_="event-infobox")
    if event_table:
        rows = event_table.find_all("tr")
        for i, row in enumerate(rows):
            th = row.find("th")
            if not th:
                continue
            header_text = th.text.lower()
            if "en server" in header_text:
                key, label = "en_server", "EN Servers Date"
            elif "all server" in header_text:
                key, label = "all_server", "All Servers Date"
            else:
                continue
            period = _extract_period(rows, i)
            if period is None:
                continue
            periods[key] = period
            if show:
                print(label)
                print(f"Event period: {period[0]} - {period[1]}")

    # Parse every shop item exactly once; frames missing an expected element are skipped.
    shop_items = []
    for frame in soup.find_all("div", class_="item-frame"):
        parsed = _parse_shop_item(frame)
        if parsed is not None:
            shop_items.append(parsed)

    if not shop_items:
        return False

    if show:
        print(f"Total item (entries) in shop: {len(shop_items)}")
        for item in shop_items:
            print(f"Item: '{item['name']}'")
            print(f"Stock: '{item['stock']}'")
            print(f"Price: '{item['price']}'")
            img = download_image(item["image_url"], item["name"])
            display(Image(img))
            print("--")

    # Build the result only after ALL items were handled (the previous version
    # returned from inside the item loop, so only the first item was ever shown).
    dates = {}
    for key in ("en_server", "all_server"):
        if key in periods and periods[key][0]:
            dates[key] = {"start": periods[key][0], "end": periods[key][1]}

    return {"dates": dates, "shop_items": shop_items}


def download_image(url, name):
    """Download ``url`` into the system temp directory and return the saved path.

    Args:
        url: Address of the image to fetch.
        name: Label used to build the file name. Characters that are not valid
            in file names (path separators, ``:``, ``*``, ...) are replaced with
            ``_`` so that item names containing ``/`` do not point into a
            non-existent sub-directory.

    Returns:
        Path of the written ``<name>.jpg`` file inside ``tempfile.gettempdir()``.
    """
    forbidden = '\\/:*?"<>|'
    safe_name = "".join(
        "_" if ch in forbidden or ord(ch) < 32 else ch for ch in str(name)
    ).strip()
    if not safe_name:
        safe_name = "image"

    temp_path = os.path.join(tempfile.gettempdir(), f"{safe_name}.jpg")
    with open(temp_path, "wb") as f:
        # A timeout keeps a stalled download from hanging the notebook forever.
        f.write(requests.get(url, timeout=30).content)
    return temp_path


def picsum_demo():
    """Fetch a random Lorem Picsum image into the temp directory and display it.

    Serves as a comparison image: the file is saved as ``lorem_comparison.jpg``
    in ``tempfile.gettempdir()``, its path is printed, and it is rendered inline.
    """
    lorem_url = "https://picsum.photos/200/300"
    lorem_path = os.path.join(tempfile.gettempdir(), "lorem_comparison.jpg")

    with open(lorem_path, "wb") as out_file:
        out_file.write(requests.get(lorem_url).content)

    print(f"Lorem Picsum comparison saved to: {lorem_path}")
    display(Image(lorem_path))


# Sanity test: run the scraper once against a single event page (show defaults to True).
# NOTE(review): "Inventons" in the URL looks like a misspelling of "Inventions" -- confirm the
# page slug; al_scraper returns False when the fetched page has no shop items.
res = al_scraper(url="https://azurlane.koumakan.jp/wiki/Dangerous_Inventons_Incoming!")
# if res:
#     print(res)
# al_scraper(url = 'https://azurlane.koumakan.jp/wiki/Violet_Tempest,_Blooming_Lycoris')
# al_scraper(url = 'https://azurlane.koumakan.jp/wiki/Visitors_Dyed_in_Red_Rerun')

### Look for new URLs and parse existing ones (concurrent)

In [5]:
# Get all event URLs from the events page
event_list_url = "https://azurlane.koumakan.jp/wiki/Events#Limited_Event_List"
response = requests.get(event_list_url, timeout=30)
soup = BeautifulSoup(response.text, "html.parser")

# Find the Limited Event List section and get all table headers with links.
# Fail with a clear message if the page layout changed instead of an opaque
# AttributeError on None.
section_anchor = soup.find("span", {"id": "Limited_Event_List"})
section_heading = section_anchor.find_parent("h2") if section_anchor is not None else None
event_table = section_heading.find_next("table") if section_heading is not None else None
if event_table is None:
    raise ValueError("Could not locate the Limited Event List table on the events page")
event_links = event_table.find_all("th")

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Files that collect events without / with a shop. Both stay open because
# process_event appends to them.
shopless_events = open(f"{OUTPUT_DIR}/shopless_event_urls.txt", "a+")
shop_events = open(f"{OUTPUT_DIR}/shop_event_urls.txt", "a+")

# Read existing shopless URLs and remove duplicates.
# An "a+" handle starts positioned at the end of the file, so rewind before
# reading; otherwise nothing is read and the truncate below wipes the list.
# Lines are stripped so that membership tests against bare URLs match.
shopless_events.seek(0)
existing_urls = {line.strip() for line in shopless_events if line.strip()}
shopless_events.seek(0)
shopless_events.truncate()
shopless_events.writelines(f"{url}\n" for url in sorted(existing_urls))
shopless_events.flush()


from concurrent.futures import ThreadPoolExecutor


def process_event(link):
    """Scrape the event behind one link and record its URL in the matching list.

    Links without an ``href`` are ignored, as are URLs already contained in
    ``existing_urls``. Otherwise the page is scraped quietly and the URL is
    appended to ``shop_events`` when the scraper returned data, or to
    ``shopless_events`` when it did not.
    """
    href = link.get("href") if link else None
    if not href:
        return

    event_url = "https://azurlane.koumakan.jp" + href
    if event_url in existing_urls:
        return

    print(f"Event URL: {event_url}")
    scraped = al_scraper(event_url, show=False)
    # Pick the destination list based on whether any shop data came back.
    destination = shop_events if scraped else shopless_events
    destination.write(event_url + "\n")


# Collect every <a> tag found in the event table headers.
event_anchors = [a for header in event_links for a in header.find_all("a")]

try:
    with ThreadPoolExecutor(max_workers=5) as executor:
        # submit() instead of map(): map's results were never consumed, so any
        # exception raised inside a worker disappeared without a trace.
        pending = [(a, executor.submit(process_event, a)) for a in event_anchors]
        for a, future in pending:
            error = future.exception()  # waits for the task to finish
            if error is not None:
                print(f"Failed to process {a.get('href')}: {error!r}")
finally:
    # Close (and thereby flush) both URL lists so the following cells read
    # complete files.
    shopless_events.close()
    shop_events.close()

### Process Good URLs into JSON dump

In [38]:
# Read shop event URLs and scrape each one
with open(f"{OUTPUT_DIR}/shop_event_urls.txt", "r") as f:
    shop_urls = f.readlines()

# Clean URLs: strip whitespace/newlines, drop blank lines, and de-duplicate while
# keeping file order (the URL list is opened in append mode, so re-runs of the
# discovery cell can add the same URL more than once).
shop_urls_clean = list(dict.fromkeys(url.strip() for url in shop_urls if url.strip()))

if not shop_urls_clean:
    raise ValueError("No shop URLs found in shop_event_urls.txt")

os.makedirs(DUMP_DIR, exist_ok=True)

# Process each URL
db_j = {}


# def process_shop_url(url):
#     print(f"Processing {url}")
#     res = al_scraper(url=url, show=False)
#     return url, res


# with ThreadPoolExecutor(max_workers=5) as executor:
#     # Submit all tasks and get futures
#     futures = [executor.submit(process_shop_url, url) for url in shop_urls_clean[:10]]

#     # Process results as they complete
#     for future in futures:
#         url, res = future.result()
#         print(f"Done: {url}")
#         db_j[url] = res

# Process URLs sequentially
num_items = 10  # Set to None for all items, or a number for limited processing
urls_to_process = shop_urls_clean[:num_items]
ni_string = "All" if num_items is None else len(urls_to_process)
print(f"Processing {ni_string} of {len(shop_urls_clean)} URLs")

for url in urls_to_process:
    print(f"Processing {url}")
    res = al_scraper(url=url, show=False)
    if not res:
        # al_scraper returns False when the page has no shop items; storing that
        # would break the cells that expect a dict per event.
        print(f"  Skipping {url}: no shop items found")
        continue
    db_j[url] = res

# Write the dump only after scraping finished, and close the file so the
# following cells read complete JSON (opening it up front truncated the previous
# dump even if scraping failed, and the handle was never closed).
with open(f"{DUMP_DIR}/soon-to-be-db.json", "w") as db_tmp:
    json.dump(db_j, db_tmp)

print("done!")

Processing 90 URLs
Processing https://azurlane.koumakan.jp/wiki/Divergent_Chessboard
Processing https://azurlane.koumakan.jp/wiki/Visitors_Dyed_in_Red_Rerun
Processing https://azurlane.koumakan.jp/wiki/Opposite-Colored_Rerun
Processing https://azurlane.koumakan.jp/wiki/Ink-Stained_Steel_Sakura
Processing https://azurlane.koumakan.jp/wiki/Iris_of_Light_and_Dark
Processing https://azurlane.koumakan.jp/wiki/Crimson_Echoes
Processing https://azurlane.koumakan.jp/wiki/Hibiscus-scented_Idol
Processing https://azurlane.koumakan.jp/wiki/Ink-Stained_Steel_Sakura_Rerun
Processing https://azurlane.koumakan.jp/wiki/Scherzo_of_Iron_and_Blood
Processing https://azurlane.koumakan.jp/wiki/Scherzo_of_Iron_and_Blood#Science_Rules.21
done!


### MongoDB Sanity Check

In [None]:
# Read and parse the JSON file
with open(f"{DUMP_DIR}/soon-to-be-db.json", "r") as f:
    db_data = json.load(f)

# Print basic stats
print(f"Number of events scraped: {len(db_data)}")

# Print data for all events
for event_url, event_data in db_data.items():
    print("\nEvent data:")
    print(f"URL: {event_url}")
    # The scraper returns False for pages without shop items, so a dump may hold
    # non-dict entries; report them instead of crashing on the subscript below.
    if not isinstance(event_data, dict):
        print("  No shop data recorded for this event")
        continue
    print(f"Number of shop items: {len(event_data.get('shop_items', []))}")
    if "dates" in event_data:
        print("Event dates:")
        for server, dates in event_data["dates"].items():
            print(f"  {server}: {dates}")

### Upload new data to mongodb collection (name set, ignores env table)

In [42]:
load_dotenv("../.env")
db_url = os.getenv("DB_URL")
client = MongoClient(db_url)

# Test connection. Only the ping is guarded: previously the whole cell sat in one
# try block, so file, JSON and insert errors were all reported as
# "Failed to connect to MongoDB".
try:
    client.admin.command("ping")
except Exception as e:
    print(f"Failed to connect to MongoDB: {e}")
else:
    print("Successfully connected to MongoDB!")
    with open(f"{DUMP_DIR}/soon-to-be-db.json", "r") as f:
        db_data = json.load(f)

    # Get the database and collection
    db = client["azurlane"]
    events_collection = db["events"]

    # Convert the data into a list of documents, adding the URL as a field.
    # Non-dict entries (the scraper yields False for pages without a shop) are
    # skipped because they cannot be stored as documents.
    documents = [
        {**event_data, "url": url}
        for url, event_data in db_data.items()
        if isinstance(event_data, dict)
    ]

    # Insert the documents
    # NOTE: insert_many always adds new documents; re-running this cell stores
    # the same events again.
    if documents:
        result = events_collection.insert_many(documents)
        print(
            f"Successfully inserted {len(result.inserted_ids)} documents into MongoDB"
        )
    else:
        print("No documents to insert")

Successfully connected to MongoDB!
Successfully inserted 63 documents into MongoDB


### Sanity check read (specify url)

In [49]:
# Look up a single event document by its URL and pretty-print it.
event_url = "https://azurlane.koumakan.jp/wiki/Opposite-Colored_Rerun"
event = events_collection.find_one({"url": event_url})

if not event:
    print(f"No event found with URL: {event_url}")
else:
    print(f"\nFound event data for {event_url}:")
    # default=str renders values json cannot serialise natively (such as the _id).
    print(json.dumps(event, indent=4, default=str))


Found event data for https://azurlane.koumakan.jp/wiki/Opposite-Colored_Rerun:
{
    "_id": "674886835a170b03e520d00a",
    "dates": {
        "en_server": {
            "start": "October 31st, 2019",
            "end": "November 13th, 2019"
        }
    },
    "shop_items": [
        {
            "name": "Tirpitz",
            "stock": "1",
            "price": "8000",
            "image_url": "https://azurlane.netojuu.com/images/thumb/5/5e/TirpitzIcon.png/96px-TirpitzIcon.png"
        },
        {
            "name": "Gneisenau",
            "stock": "1",
            "price": "4000",
            "image_url": "https://azurlane.netojuu.com/images/thumb/4/49/GneisenauIcon.png/96px-GneisenauIcon.png"
        },
        {
            "name": "533mm Quintuple Homing Torpedo Mount",
            "stock": "1",
            "price": "6000",
            "image_url": "https://azurlane.netojuu.com/images/thumb/7/7b/45200.png/96px-45200.png"
        },
        {
            "name": "Twin 380mm (