# Scrape Mangareader

## Set-up for a single page
### Selenium to access JS components

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import requests
import time

In [5]:
# Find the number of chapters in the manga "Vagabond Colored Edition".
# This cell leaves `driver` (the open browser session) and `data_number`
# (the chapter count) behind for the cells that follow.

# Start a Chrome session; the reader page relies on JavaScript, so the page
# is driven through a real browser.
driver = webdriver.Chrome()

# Open chapter 1 of the reader
driver.get('https://mangareader.to/read/vagabond-colored-edition-55963/en/chapter-1')

# Click the "Reject All" button if it is present. find_elements() returns an
# empty list instead of raising when nothing matches, so the cell still works
# when the button is not shown. The click is issued through JavaScript.
reject_buttons = driver.find_elements(By.XPATH, "//span[text()='Reject All']")
if reject_buttons:
    driver.execute_script("arguments[0].click();", reject_buttons[0])

# Select the "horizontal" reading mode (the <a> with class
# "dropdown-item mode-item" and data-value "horizontal"), again via JavaScript.
element = driver.find_element(By.XPATH, "//a[@class='dropdown-item mode-item' and @data-value='horizontal']")
driver.execute_script("arguments[0].click();", element)

# Give the page 5 seconds to finish rendering before reading its HTML
time.sleep(5)

# Parse the rendered HTML
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Take the first <li> with class "item reading-item chapter-item".
# NOTE(review): this assumes the first entry in the chapter list carries the
# highest chapter number (i.e. the list is newest-first) - confirm.
li_element = soup.find('li', class_='item reading-item chapter-item')
if li_element is None:
    # Fail with a readable message instead of a TypeError on None below
    raise RuntimeError("Could not find a chapter list item on the page; cannot determine the chapter count")

# The data-number attribute of that entry is used as the number of chapters
data_number = int(li_element['data-number'])

In [6]:
# Download every chapter (1..data_number) and save each one as
# "vagabond_<chapter>.pdf" in the current working directory.
for chapter in range(1, data_number + 1):
    # Chapter 1 is expected to be open already from the set-up cell;
    # every other chapter has to be navigated to.
    if chapter != 1:
        driver.get(f"https://mangareader.to/read/vagabond-colored-edition-55963/en/chapter-{chapter}")
    # NOTE(review): unlike the set-up cell, nothing waits for the page to
    # finish rendering after navigation - confirm the image containers are
    # already present in page_source at this point.

    # Parse the HTML of the page that is currently loaded
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Image containers: first try the "shuffled" variants ...
    divs = soup.find_all('div', class_=['ds-image shuffled loaded', 'ds-image shuffled'])

    # ... and fall back to the plain "ds-image loaded" containers
    if not divs:
        divs = soup.find_all('div', class_='ds-image loaded')

    if not divs:
        print(f"Chapter {chapter}: No images found")
        continue  # Skip to the next chapter

    # Collect the image URLs, dropping containers that have no data-url
    # attribute (div.get returns None for those).
    data_urls = [url for url in (div.get('data-url') for div in divs) if url]

    # Download and decode the images. A failure on one image is reported and
    # skipped so that it cannot abort the whole multi-chapter run.
    images = []
    for url in data_urls:
        try:
            # The timeout keeps a stalled connection from hanging the run
            response = requests.get(url, timeout=30)
            # Turn HTTP error responses into exceptions instead of feeding
            # an error page to the image decoder
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            # Image.open is lazy; decode now so corrupt data is caught here
            # rather than later while writing the PDF
            img.load()
        except requests.exceptions.RequestException as e:
            print(f"Chapter {chapter}: Error downloading image: {e}")
        except OSError as e:
            # Covers PIL.UnidentifiedImageError and truncated image data
            print(f"Chapter {chapter}: Error decoding image: {e}")
        else:
            # Normalise to RGB: some Pillow versions refuse to write certain
            # modes (e.g. RGBA) to PDF.
            images.append(img if img.mode == "RGB" else img.convert("RGB"))

    if not images:
        print(f"Chapter {chapter}: Failed to download any images")
        continue  # Skip to the next chapter

    # Write a multi-page PDF: the first image is page 1, the rest are appended
    images[0].save(f"vagabond_{chapter}.pdf", save_all=True, append_images=images[1:])

In [7]:
# Shut down the browser.
# quit() closes every window and ends the WebDriver session, whereas close()
# only closes the current window and can leave the driver process running.
driver.quit()