# EXTRAÇÃO DE MANGÁ PELO SITE MANGALIVRE

## Setup

### Dependências do SO
É necessário instalar as seguintes dependências do SO Linux (Ubuntu):
1) Instale o Google Chrome
```sh
sudo apt-get install -y curl unzip xvfb libxi6 libgconf-2-4 && \
    wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
    sudo apt install ./google-chrome-stable_current_amd64.deb -y && \
    google-chrome --version
```
2) Instale o Chromedriver correspondente à versão do Google Chrome instalada. No momento, a versão é a `108.0.5359.71`:
```sh
wget https://chromedriver.storage.googleapis.com/108.0.5359.71/chromedriver_linux64.zip && \
    unzip chromedriver_linux64.zip && \
    sudo mv chromedriver /usr/bin/chromedriver && \
    sudo chown root:root /usr/bin/chromedriver && \
    sudo chmod +x /usr/bin/chromedriver
```
> - [https://chromedriver.chromium.org/](https://chromedriver.chromium.org/)

### Dependências do Python
```sh
poetry add pydash selenium-wire requests pillow
```

## Encerrar o Google Chrome 
```sh
kill -9 $(ps aux | grep chrome | awk '{print $2}')
```

## WebDriver + MangaLivre

In [None]:
from __future__ import annotations

import traceback
import base64
from uuid import uuid4
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from seleniumwire.webdriver import Chrome
from seleniumwire.request import Request
from selenium.webdriver.common.by import By
from pathlib import Path
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from pydash import py_
from PIL import Image 
from io import BytesIO

# Base directory (relative to the working directory) used by this notebook for
# generated files: screenshots, PDF/HTML exports and Chrome's download directory.
FILE_PATH = Path("./files")


def salvar_tela(webdriver: WebDriver):
    """Save a screenshot of the current browser window under ``FILE_PATH/prints``.

    The file name is a random UUID with a ``.png`` suffix. After saving, the
    current URL and the destination path are printed.

    Args:
        webdriver: Selenium driver whose current window is captured.
    """
    prints_dir = FILE_PATH.resolve().joinpath("prints")
    # Create the target directory up front so the screenshot has somewhere to go.
    prints_dir.mkdir(parents=True, exist_ok=True)
    file_path = str(prints_dir.joinpath(f"{uuid4()!s}.png"))
    webdriver.save_screenshot(file_path)
    print(f">> print {webdriver.current_url=} / {file_path=}")


class ChromeWebDriver:
    """Lazily created headless Chrome (selenium-wire) driver, usable as a context manager.

    The browser is only started the first time :attr:`webdriver` is accessed.
    Entering the context returns the underlying driver; leaving it quits the
    browser, so the same ``ChromeWebDriver`` instance can be entered again and
    will start a fresh browser.

    Args:
        implicitly_wait: Implicit wait (seconds) applied to the driver; ``0``
            leaves the driver default untouched.
        headers: Optional HTTP headers forced onto every outgoing request.
    """

    WINDOW_SIZE = "1920x1080"  # "1366x768"

    def __init__(self, implicitly_wait: int = 0, headers: dict[str, str] | None = None):
        self._webdriver: WebDriver | None = None
        self._implicitly_wait = implicitly_wait
        self._headers: dict[str, str] = {}
        if headers and isinstance(headers, dict):
            self._headers.update(headers)
        self._options = Options()
        prefs = {
            "download.default_directory": str(FILE_PATH.resolve()),
            "download.prompt_for_download": False,
            "directory_upgrade": True,
            "safebrowsing.enabled": True,
        }
        self._options.add_experimental_option("prefs", prefs)
        self._options.add_experimental_option(
            "excludeSwitches", ["load-extension", "enable-automation"]
        )
        self._options.add_argument("--disable-extensions")
        self._options.add_argument("--headless")
        self._options.add_argument("--no-sandbox")
        # self._options.add_argument("--disable-dev-shm-usage")
        # Chrome expects "width,height", while WINDOW_SIZE is written as "WIDTHxHEIGHT".
        self._options.add_argument("window-size=" + ",".join(self.WINDOW_SIZE.split("x")))

    def __enter__(self) -> WebDriver:
        return self.webdriver

    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
        """Print the traceback of any in-flight exception, then quit the browser.

        Returns ``None``, so the exception (if any) still propagates to the caller.
        """
        if exc_type:
            print("".join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
        self.quit()

    def __del__(self) -> None:
        # Kept only so existing explicit calls keep working. The previous body
        # (`del self`) merely unbound a local name; use quit() to release the browser.
        pass

    def _interceptor(self, request: Request):
        """Force the configured headers onto an outgoing request."""
        for name, value in self._headers.items():
            # Drop any existing occurrence first, including an empty-valued one,
            # so that the assignment below replaces the header instead of
            # adding a second copy of it.
            if name in request.headers:
                del request.headers[name]
            request.headers[name] = value

    @property
    def webdriver(self) -> WebDriver:
        """Return the running driver, starting Chrome on first access."""
        if self._webdriver is None:
            self._webdriver = Chrome("chromedriver", options=self._options)
            self._webdriver.request_interceptor = self._interceptor
            if self._implicitly_wait:
                self._webdriver.implicitly_wait(self._implicitly_wait)
        return self._webdriver

    def quit(self):
        """Quit the browser if one is running and forget it.

        The reference is cleared before quitting so that a later access to
        :attr:`webdriver` starts a new browser instead of reusing a dead
        session, even when ``quit()`` on the driver raises.
        """
        driver, self._webdriver = self._webdriver, None
        if driver is not None:
            driver.quit()

class MangaPagina:
    """A single page of a manga chapter together with its raw image bytes."""

    def __init__(self, manga_capitulo_id: int, numero: int, imagem: bytes, extensao: str):
        # The object's own id() doubles as the page identifier.
        self.id = id(self)
        self.manga_capitulo_id = manga_capitulo_id
        self.numero = numero
        self.imagem = imagem
        self.extensao = extensao

    def __str__(self) -> str:
        ident = self.id
        return f"<MangaPagina id='{ident!s}'>"

    def __repr__(self) -> str:
        # repr and str are intentionally the same text.
        return self.__str__()

    @property
    def imagem_io(self):
        """Return a new in-memory binary stream over the image bytes, positioned at the start."""
        # A freshly built BytesIO already sits at offset 0.
        return BytesIO(self.imagem)


class MangaCapitulo:
    """A manga chapter: its metadata plus the ordered pages, exportable to PDF and HTML.

    Args:
        nome: Manga title; used to build the export file names.
        capitulo: Chapter identifier; used to build the export file names.
        qtde_paginas: Page count supplied by the caller. It is stored as given
            and is not updated by :meth:`adicionar_pagina`.
    """

    def __init__(self, nome: str, capitulo: str, qtde_paginas: int = 0):
        self.nome = nome
        self.capitulo = capitulo
        self.qtde_paginas = qtde_paginas
        self.id = id(self)
        self.paginas: list[MangaPagina] = []

    def __str__(self) -> str:
        return f"<MangaCapitulo id='{self.id!s}'>"

    def __repr__(self) -> str:
        return str(self)

    def _nome_arquivo(self) -> str:
        """Return the snake_case base name (no extension) used for exported files."""
        return py_.snake_case(f"{self.nome}_{self.capitulo}")

    @staticmethod
    def _diretorio_saida() -> Path:
        """Return the resolved output directory, creating it when it is missing."""
        destino = FILE_PATH.resolve()
        destino.mkdir(parents=True, exist_ok=True)
        return destino

    def adicionar_pagina(self, pagina: MangaPagina):
        """Append ``pagina`` to the end of the chapter."""
        self.paginas.append(pagina)

    def base64_html(self, numero_pagina: int = 1) -> str:
        """Return page ``numero_pagina`` (1-based) as a ``data:image/...;base64,`` URI.

        Raises:
            IndexError: If ``numero_pagina`` is not between 1 and the number of
                pages. Zero and negative numbers are rejected instead of
                silently selecting a page counted from the end.
        """
        total = len(self.paginas)
        if not 1 <= numero_pagina <= total:
            raise IndexError(f"numero_pagina must be between 1 and {total}, got {numero_pagina}")
        pagina = self.paginas[numero_pagina - 1]
        b64 = base64.b64encode(pagina.imagem).decode("utf-8")
        return f"data:image/{pagina.extensao};base64,{b64}"

    def gerar_pdf(self):
        """Write all pages, in order, to a single PDF under ``FILE_PATH``.

        Raises:
            IndexError: If the chapter has no pages.
        """
        file_path = self._diretorio_saida().joinpath(self._nome_arquivo() + ".pdf")
        images = [Image.open(pagina.imagem_io) for pagina in self.paginas]
        # The first image drives the save; the remaining ones are appended as extra pages.
        images[0].save(file_path, "PDF", resolution=100.0, save_all=True, append_images=images[1:])
        print(f"Arquivo *.pdf gerado em: {file_path!s}")

    def gerar_html(self):
        """Write a standalone HTML file under ``FILE_PATH`` with every page embedded as base64."""
        file_name = self._nome_arquivo()
        file_path = self._diretorio_saida().joinpath(file_name + ".html")
        content = "\n<br><br>".join(
            f'<img src="{self.base64_html(numero)}" class="rounded mx-auto d-block"/>'
            for numero in range(1, len(self.paginas) + 1)
        )
        html = f"""
<!doctype html>
<html lang="pt-BR">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>{file_name}</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-GLhlTQ8iRABdZLl6O3oVMWSktQOp6b7In1Zl3/Jr59b6EGGoI1aFkw7cmDA6j6gD" crossorigin="anonymous">
  <style>
    .selector-for-some-widget {{
        box-sizing: content-box;
    }}
  </style>
  </head>
  <body>
    <div class="container">
        <br><br>
        {content}
        <br><br>
    </div>
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.bundle.min.js" integrity="sha384-w76AqPfDkMBDXo30jS1Sgez6pr3x5MlQ1ZAGC+nuZB+EYdgRZgiwxhTBTkF7CXvN" crossorigin="anonymous"></script>
  </body>
</html>
        """
        # The document declares charset utf-8, so write it with that encoding
        # rather than whatever the platform default happens to be.
        file_path.write_text(html.strip(), encoding="utf-8")
        print(f"Arquivo *.html gerado em: {file_path!s}")


class MangaLivre:
    """Scraper that walks a MangaLivre chapter reader page by page and downloads each scan.

    Attributes:
        DELAY: Implicit wait (seconds) configured on the browser.
        VISIBILITY_TIME: Maximum time (seconds) to wait for a page image to become visible.
        REQUEST_TIMEOUT: Timeout (seconds) for each image download.
    """

    DELAY = 10
    VISIBILITY_TIME = 120
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self._chrome_webdriver = ChromeWebDriver(implicitly_wait=self.DELAY)

    @staticmethod
    def _extensao_da_url(url: str) -> str:
        """Return the file extension (without the dot) of the URL's path, or ``""`` if none.

        Only the path component is inspected, so query strings and fragments
        containing dots do not leak into the extension.
        """
        from pathlib import PurePosixPath
        from urllib.parse import urlparse

        return PurePosixPath(urlparse(url).path).suffix.lstrip(".")

    @staticmethod
    def _capitulo_da_url(url: str) -> str:
        """Return the last path segment of ``url``, ignoring any trailing slash."""
        return url.rstrip("/").rsplit("/", 1)[-1]

    def _obter_scans(self, url: str) -> dict[str, dict[str, bytes | str]]:
        """Open ``url`` in the browser and download every page image of the chapter.

        Returns:
            A dict keyed by the zero-padded page number (``"001"``, ``"002"``, ...),
            each value holding ``"raw"`` (image bytes) and ``"ext"`` (file extension).

        Raises:
            requests.HTTPError: If an image download returns an error status.
        """
        xpath_image = '//div[@class="manga-image"]/picture/img'
        xpath_next = '//div[@class="page-next"]'
        paginas: dict[str, dict[str, bytes | str]] = {}
        with self._chrome_webdriver as webdriver:
            webdriver.get(url)
            num_pag = 1
            # The loop ends once the browser URL contains "#comments"
            # (presumably where the reader lands after the last page - confirm against the site).
            while "#comments" not in webdriver.current_url:
                elem = WebDriverWait(webdriver, self.VISIBILITY_TIME).until(
                    EC.visibility_of_element_located((By.XPATH, xpath_image))
                )
                img_src = elem.get_attribute("src")
                resposta = requests.get(img_src, timeout=self.REQUEST_TIMEOUT)
                # Fail loudly instead of storing an error page as if it were an image.
                resposta.raise_for_status()
                paginas[str(num_pag).zfill(3)] = {
                    "raw": resposta.content,
                    "ext": self._extensao_da_url(img_src),
                }
                webdriver.find_element(By.XPATH, xpath_next).click()
                num_pag += 1
        return paginas

    def extrair_manga(self, nome: str, url: str) -> MangaCapitulo:
        """Scrape the chapter at ``url`` and return it as a populated ``MangaCapitulo``.

        Args:
            nome: Manga title stored on the resulting chapter.
            url: Reader URL of the chapter; its last path segment becomes the chapter id.
        """
        capitulo = self._capitulo_da_url(url)
        pag_scans = self._obter_scans(url)
        manga = MangaCapitulo(nome=nome, capitulo=capitulo, qtde_paginas=len(pag_scans))
        for pag, img in pag_scans.items():
            pagina = MangaPagina(
                manga_capitulo_id=manga.id,
                numero=int(pag),
                imagem=img["raw"],
                extensao=img["ext"],
            )
            manga.adicionar_pagina(pagina)
        return manga


## Extração

In [None]:
# Scrape one chapter from its reader URL, then export it as PDF and as HTML.
mangalivre = MangaLivre()
manga = mangalivre.extrair_manga(
    nome="One Punch Man",
    url="https://mangalivre.net/ler/one-punch-man/online/428424/211",
)
manga.gerar_pdf()
manga.gerar_html()
# Bare expression at the end of the cell: shows the chapter's repr as the cell output.
manga

In [None]:
from IPython import display

# Preview page 13 of the extracted chapter inline, embedded as a base64 data URI.
display.HTML(f'<img src="{manga.base64_html(13)}"/>')

## Desenvolvimento

In [None]:
# Development scratch cell: start a browser by hand (no implicit wait, no extra headers).
chrome_webdriver = ChromeWebDriver()
# Accessing the property is what actually launches Chrome.
webdriver = chrome_webdriver.webdriver
webdriver

In [None]:
# Load the chapter reader page in the manually started browser.
webdriver.get("https://mangalivre.net/ler/one-punch-man/online/428424/211")

In [None]:
# Close the manually started browser when done experimenting.
webdriver.quit()