# Scrape Mangareader

## Set-up for a single page
### Selenium to access JS components

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os
import requests
import time

In [6]:
# Find the number of chapters in the manga "Dorohedoro" 
# setup webdriver
driver = webdriver.Chrome()

# navigate to the webpage
driver.get('https://mangareader.to/read/dorohedoro-57/en/chapter-1')

# Find the button with the text "Reject All" and click it using JavaScript
button = driver.find_element(By.XPATH, "//span[text()='Reject All']")
driver.execute_script("arguments[0].click();", button)

# Find the element with the class "dropdown-item mode-item" and data-value "horizontal" and click it using JavaScript
element = driver.find_element(By.XPATH, "//a[@class='dropdown-item mode-item' and @data-value='horizontal']")
driver.execute_script("arguments[0].click();", element)

# make the driver wait for 5 seconds
time.sleep(5)

# use BeautifulSoup to parse the HTML
soup = BeautifulSoup(driver.page_source, 'html.parser')

# find the first li element with the class "item reading-item chapter-item"
li_element = soup.find('li', class_='item reading-item chapter-item')

# extract the data-number attribute and convert it to an integer
data_number = float(li_element['data-number'])

In [7]:
# Loop through all the chapters
for chapter in range(1, 2):
    driver.get(f"https://mangareader.to/read/dorohedoro-57/en/chapter-{chapter}")

    # Update the soup object with the new page content
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all div elements with class "ds-image shuffled loaded" and "ds-image shuffled"
    divs = soup.find_all('div', class_=['ds-image shuffled loaded', 'ds-image shuffled'])

    # If no divs were found, try finding divs with class "ds-image loaded"
    if not divs:
        divs = soup.find_all('div', class_='ds-image loaded')

    # Check if any data URLs were found
    if not divs:
        print(f"Chapter {chapter}: No images found")
        continue  # Skip to the next chapter

    # Extract data-url from each div and store in a list
    data_urls = [div.get('data-url') for div in divs]

    # Download images
    images = []
    for url in data_urls:
        try:
            response = requests.get(url)
            img = Image.open(BytesIO(response.content))
            images.append(img)
        except requests.exceptions.RequestException as e:
            print(f"Chapter {chapter}: Error downloading image: {e}")

    # Check if any images were downloaded
    if not images:
        print(f"Chapter {chapter}: Failed to download any images")
        continue  # Skip to the next chapter

    # Convert images to JPEG format
    images = [img.convert('RGB') for img in images]

    os.makedirs(f"Chapter_{chapter}", exist_ok=True)

    # Save images in the chapter subfolder  
    for i, img in enumerate(images):
        img.save(f"Chapter_{chapter}/Image_{i+1}.jpeg")

    print(f"Chapter {chapter}: Downloaded {len(images)} images")

Chapter 1: Downloaded 24 images


In [11]:
def divide_image(image_path, square_size):
    with Image.open(image_path) as img:
        width, height = img.size

        for i in range(0, width, square_size):
            for j in range(0, height, square_size):
                box = (i, j, i+square_size, j+square_size)
                cropped_img = img.crop(box)
                cropped_img.save(f"{os.path.splitext(image_path)[0]}_{i}_{j}.jpeg")

# Usage
divide_image("\OLD\Chapter_1\Image_1.jpeg", 200)

  divide_image("\OLD\Chapter_1\Image_1.jpeg", 200)
  divide_image("\OLD\Chapter_1\Image_1.jpeg", 200)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\OLD\\Chapter_1\\Image_1.jpeg'

## Find bonus chapters

In [8]:
valid_urls = []  # Create an empty list to store valid URLs

for chapter in range(1, int(data_number) + 2):

    # Check for a decimal chapter
    bonus_chapter = chapter + 0.5
    bonus_url = f"https://mangareader.to/read/dorohedoro-57/en/chapter-{bonus_chapter}"

    response = requests.get(bonus_url)

    if response.status_code == 200:
        valid_urls.append(bonus_url)