# EXTRAÇÃO DE MANGA PELO SITE MANGALIVRE

## Setup

### Dependências do SO
É necessário instalar as seguintes dependências no SO Linux (Ubuntu):
1) Instale o Google Chrome
```sh
sudo apt-get install -y curl unzip xvfb libxi6 libgconf-2-4 && \
    wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
    sudo apt install ./google-chrome-stable_current_amd64.deb -y && \
    google-chrome --version
```
2) Instale o ChromeDriver correspondente à versão instalada do Google Chrome. No momento em que este documento foi escrito, a versão era a `108.0.5359.71`:
```sh
wget https://chromedriver.storage.googleapis.com/108.0.5359.71/chromedriver_linux64.zip && \
    unzip chromedriver_linux64.zip && \
    sudo mv chromedriver /usr/bin/chromedriver && \
    sudo chown root:root /usr/bin/chromedriver && \
    sudo chmod +x /usr/bin/chromedriver
```
> - [https://chromedriver.chromium.org/](https://chromedriver.chromium.org/)

### Dependências do Python
```sh
poetry add pydash selenium-wire
```

## WebDriver + MangaLivre

In [None]:
from __future__ import annotations

import traceback
import base64
from uuid import uuid4
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from seleniumwire.webdriver import Chrome
from seleniumwire.request import Request
from selenium.webdriver.common.by import By
from pathlib import Path
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from pydash import py_

FILE_PATH = Path("./files")


def salvar_tela(webdriver: WebDriver):
    """Save a screenshot of the browser's current page under ``FILE_PATH/prints``.

    The image gets a random UUID file name. After the screenshot call
    returns, the current URL and the destination path are printed.

    Args:
        webdriver: Driver exposing ``save_screenshot`` and ``current_url``.
    """
    prints_dir = FILE_PATH.resolve() / "prints"
    # The screenshot cannot be written into a directory that does not exist,
    # so create it (and any missing parents) on demand.
    prints_dir.mkdir(parents=True, exist_ok=True)
    file_path = str(prints_dir / f"{uuid4()!s}.png")
    webdriver.save_screenshot(file_path)
    print(f">> print {webdriver.current_url=} / {file_path=}")


class ChromeWebDriver:
    """Lazily created headless Chrome driver (selenium-wire), usable as a context manager.

    The browser process is only started the first time ``webdriver`` is read.
    Leaving the ``with`` block (or calling ``quit``) closes the browser and
    forgets it, so the same instance can be entered again and will start a
    fresh browser.

    Args:
        implicitly_wait: Seconds passed to ``implicitly_wait`` on the driver;
            ``0`` leaves the driver's setting untouched.
        headers: Extra HTTP headers forced onto every outgoing request by the
            request interceptor. Ignored unless it is a non-empty ``dict``.
    """

    WINDOW_SIZE = "1920x1080"  # "1366x768"

    def __init__(self, implicitly_wait: int = 0, headers: dict[str, str] | None = None):
        self._webdriver: WebDriver | None = None
        self._implicitly_wait = implicitly_wait
        # Copy into a private dict so later changes to the caller's dict
        # do not affect the interceptor.
        self._headers: dict[str, str] = {}
        if headers and isinstance(headers, dict):
            self._headers.update(headers)
        self._options = Options()
        prefs = {
            "download.default_directory": str(FILE_PATH.resolve()),
            "download.prompt_for_download": False,
            "directory_upgrade": True,
            "safebrowsing.enabled": True,
        }
        self._options.add_experimental_option("prefs", prefs)
        self._options.add_experimental_option(
            "excludeSwitches", ["load-extension", "enable-automation"]
        )
        self._options.add_argument("--disable-extensions")
        self._options.add_argument("--headless")
        self._options.add_argument("--no-sandbox")
        # self._options.add_argument("--disable-dev-shm-usage")
        # WINDOW_SIZE is "WIDTHxHEIGHT"; the argument is built as "WIDTH,HEIGHT".
        self._options.add_argument("window-size=" + self.WINDOW_SIZE.replace("x", ","))

    def __enter__(self) -> WebDriver:
        """Return the (lazily started) driver for use inside the ``with`` block."""
        return self.webdriver

    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
        """Print the traceback of any in-flight exception, then close the browser.

        Returns ``None``, so the exception (if any) still propagates.
        """
        if exc_type:
            print("".join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
        self.quit()

    def __del__(self) -> None:
        """Do nothing; kept only so explicit ``__del__()`` calls keep working."""

    def _interceptor(self, request: Request):
        """Overwrite each configured header on an outgoing request."""
        for hk, hv in self._headers.items():
            # Remove any existing value first (even an empty one) so the
            # assignment below replaces the header instead of duplicating it.
            if hk in request.headers:
                del request.headers[hk]
            request.headers[hk] = hv

    @property
    def webdriver(self) -> WebDriver:
        """The Chrome driver, created on first access and reused afterwards."""
        if self._webdriver is None:
            self._webdriver = Chrome("chromedriver", options=self._options)
            self._webdriver.request_interceptor = self._interceptor
            if self._implicitly_wait:
                self._webdriver.implicitly_wait(self._implicitly_wait)
        return self._webdriver

    def quit(self):
        """Close the browser if one was started; safe to call repeatedly."""
        if self._webdriver is None:
            return
        try:
            self._webdriver.quit()
        finally:
            # Forget the closed driver so the next access to ``webdriver``
            # starts a new browser instead of returning a dead session.
            self._webdriver = None


class MangaPagina:
    """One page of a chapter: its number, raw image bytes and file extension.

    ``id`` is the object's ``id()`` taken at construction time and
    ``manga_capitulo_id`` is whatever identifier the caller passed in.
    """

    def __init__(self, manga_capitulo_id: int, numero: int, imagem: bytes, extensao: str):
        self.numero, self.imagem, self.extensao = numero, imagem, extensao
        self.manga_capitulo_id = manga_capitulo_id
        self.id = id(self)

    def __str__(self) -> str:
        return "<MangaPagina id='" + str(self.id) + "'>"

    def __repr__(self) -> str:
        # Same text as ``str()``.
        return f"{self!s}"


class MangaCapitulo:
    """A manga chapter: title, chapter label and the pages collected for it.

    Args:
        nome: Manga title.
        capitulo: Chapter label (used, with ``nome``, to build output file names).
        qtde_paginas: Page count supplied by the caller; it is stored as given
            and is not updated when pages are added.
    """

    def __init__(self, nome: str, capitulo: str, qtde_paginas: int = 0):
        self.nome = nome
        self.capitulo = capitulo
        self.qtde_paginas = qtde_paginas
        self.id = id(self)
        self.paginas: list[MangaPagina] = []

    def __str__(self) -> str:
        return f"<MangaCapitulo id='{self.id!s}'>"

    def __repr__(self) -> str:
        return str(self)

    def adicionar_pagina(self, pagina: MangaPagina):
        """Append ``pagina`` to the end of the chapter's page list."""
        self.paginas.append(pagina)

    def pagina_em_html(self, numero_pagina: int = 1) -> str:
        """Return an ``<img>`` tag embedding a page as a base64 data URI.

        Args:
            numero_pagina: 1-based position of the page in ``self.paginas``.

        Raises:
            IndexError: If ``numero_pagina`` is not between 1 and the number
                of stored pages.
        """
        # Reject 0 and negatives explicitly: with plain list indexing they
        # would wrap around and silently return a page from the end.
        if not 1 <= numero_pagina <= len(self.paginas):
            raise IndexError(
                f"numero_pagina={numero_pagina} out of range 1..{len(self.paginas)}"
            )
        pagina = self.paginas[numero_pagina - 1]
        b64 = base64.b64encode(pagina.imagem).decode("utf-8")
        return f'<img src="data:image/{pagina.extensao};base64,{b64}"/>'

    @property
    def _nome_arquivo(self) -> str:
        """Base file name (no extension) derived from title and chapter."""
        return py_.camel_case(f"{self.nome} {self.capitulo}")

    @staticmethod
    def _nome_pagina(pagina: MangaPagina) -> str:
        """Return the archive entry name for a page, e.g. ``PAG-007.jpg``."""
        return "PAG-" + str(pagina.numero).zfill(3) + "." + pagina.extensao

    def gerar_cbz(self):
        """Write all pages into ``FILE_PATH/<nome_arquivo>.cbz`` (a ZIP archive).

        Pages are written straight into the archive, one entry per page in
        list order; an existing file with the same name is overwritten.
        """
        import zipfile  # local import: not provided by the module's import block

        destino = FILE_PATH.resolve()
        destino.mkdir(parents=True, exist_ok=True)
        file_path = destino / (self._nome_arquivo + ".cbz")
        # No temporary directory is needed: each page goes directly into the ZIP.
        with zipfile.ZipFile(file_path, "w") as cbz:
            for pagina in self.paginas:
                cbz.writestr(self._nome_pagina(pagina), pagina.imagem)
        print(f"Arquivo *.cbz gerado em: {file_path}")

    def gerar_pdf(self):
        """PDF export placeholder.

        The previous body was a copy of the CBZ routine that never produced a
        PDF and reported a ``*.cbz`` file without a path. Building a PDF from
        arbitrary image formats needs an imaging library this module does not
        import, so the method now fails explicitly instead of reporting a
        file that was not written.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Geração de PDF ainda não implementada; use gerar_cbz()."
        )


class MangaLivre:
    """Scraper that pages through a MangaLivre chapter reader and downloads each scan."""

    # Seconds used both as the driver's implicit wait and as the HTTP
    # timeout for each image download.
    DELAY = 10

    def __init__(self):
        self._chrome_webdriver = ChromeWebDriver(implicitly_wait=self.DELAY)

    @staticmethod
    def _extensao_da_url(img_src: str) -> str:
        """Return the file extension (without the dot) of an image URL.

        Only the URL's path is inspected, so query strings and fragments do
        not leak into the result. Returns ``""`` when the path has no suffix.
        """
        from urllib.parse import urlsplit  # local import: not in the module's import block

        return Path(urlsplit(img_src).path).suffix.lstrip(".")

    def _obter_scans(self, url: str) -> dict[str, dict[str, bytes | str]]:
        """Walk the reader starting at ``url`` and download every page image.

        Returns:
            Mapping from the zero-padded page number (``"001"``, ``"002"``...)
            to ``{"raw": <image bytes>, "ext": <file extension>}``.

        Raises:
            requests.HTTPError: If an image download returns an error status.
        """
        xpath_image = '//div[@class="manga-image"]/picture/img'
        xpath_next = '//div[@class="page-next"]'
        paginas: dict[str, dict[str, bytes | str]] = {}
        with self._chrome_webdriver as webdriver:
            webdriver.get(url)
            num_pag = 1
            # The loop ends once the browser URL contains "#comments"
            # (presumably where "next" lands after the last page — confirm on the site).
            while "#comments" not in webdriver.current_url:
                img_src = webdriver.find_element(By.XPATH, xpath_image).get_attribute("src")
                resposta = requests.get(img_src, timeout=self.DELAY)
                # Fail loudly rather than storing an error page as image bytes.
                resposta.raise_for_status()
                # Create the whole entry at once; assigning into a missing
                # key's sub-dict would raise KeyError.
                paginas[str(num_pag).zfill(3)] = {
                    "raw": resposta.content,
                    "ext": self._extensao_da_url(img_src),
                }
                webdriver.find_element(By.XPATH, xpath_next).click()
                num_pag += 1
        return paginas

    def extrair_manga(self, nome: str, url: str) -> MangaCapitulo:
        """Download the chapter at ``url`` and return it as a ``MangaCapitulo``.

        Args:
            nome: Manga title stored on the resulting chapter.
            url: Reader URL; its last path segment is used as the chapter label.
        """
        # Ignore a trailing slash so the chapter label is never empty.
        capitulo = url.rstrip("/").split("/").pop()
        pag_scans = self._obter_scans(url)
        manga = MangaCapitulo(nome=nome, capitulo=capitulo, qtde_paginas=len(pag_scans))
        for pag, img in pag_scans.items():
            pagina = MangaPagina(
                manga_capitulo_id=manga.id,
                numero=int(pag),
                imagem=img["raw"],
                extensao=img["ext"],
            )
            manga.adicionar_pagina(pagina)
        return manga


## Extração

In [None]:
# Download every page of the chapter at `url`; `nome` is the title stored on
# the resulting MangaCapitulo.
mangalivre = MangaLivre()
manga = mangalivre.extrair_manga(
    nome="One Punch Man",
    url="https://mangalivre.net/ler/one-punch-man/online/428424/211",
)
# Bare expression so the notebook shows the chapter object as the cell output.
manga

In [None]:
from IPython import display

# Render page 13 of the downloaded chapter inline as an <img> data URI.
display.HTML(manga.pagina_em_html(13))

## Desenvolvimento

In [None]:
# Raw image bytes of the sixth stored page (MangaPagina exposes `imagem`, not `imagens`).
manga.paginas[5].imagem

In [None]:
# Start a browser outside a `with` block for interactive experiments; it must
# be closed manually with `webdriver.quit()`.
chrome_webdriver = ChromeWebDriver()
webdriver = chrome_webdriver.webdriver
webdriver

In [None]:
# Open the chapter reader page in the interactive browser.
webdriver.get("https://mangalivre.net/ler/one-punch-man/online/428424/211")

In [None]:
import time  # needed for time.sleep below; the setup cell does not import it

# Seconds to wait after each scroll before measuring the page height again.
SCROLL_PAUSE_TIME = 0.5

last_height = webdriver.execute_script("return document.body.scrollHeight")

# Keep scrolling to the bottom until the document height stops changing.
while True:
    webdriver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = webdriver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height


In [None]:
# Save a screenshot of the page as currently rendered.
salvar_tela(webdriver)

In [None]:
# Collect the `src` of every page image currently present in the DOM.
xpath_image = '//div[@class="manga-image"]/picture/img'
images = []
for elem in webdriver.find_elements(By.XPATH, xpath_image):
    images.append(elem.get_attribute("src"))
    # One screenshot is saved per matched element.
    salvar_tela(webdriver)
# Bare expression so the notebook shows the collected URLs.
images

In [None]:
# Close the interactive browser started in the cells above.
webdriver.quit()