# MANGALIVRE
Tem como objetivo raspar o site MangaLivre e gravar o(s) mangá(s) desejado(s) nos formatos HTML, CBZ ou PDF.

## Setup

### Dependências do SO
É necessário ter o `Google-Chrome` instalado.

A instalação do `Chrome` no `Ubuntu` de forma manual:
```sh
sudo apt-get install -y curl unzip xvfb libxi6 libgconf-2-4 && \
    wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
    sudo apt install ./google-chrome-stable_current_amd64.deb -y && \
    google-chrome --version
```

### Dependências do Python
Se tiver o `python-poetry`:
```sh
poetry install
```
Se não tiver, executar:
```sh
pip install pydash selenium-wire Pillow requests
```

### Encerrar o Google Chrome
Em caso de erro, o Google Chrome pode continuar em execução como um processo zumbi. O comando abaixo finaliza todas as instâncias ativas:
```sh
kill -9 $(ps aux | grep chrome | awk '{print $2}')
```

## WebDriver + MangaLivre

In [1]:

from __future__ import annotations

import traceback
import base64
from uuid import uuid4
import requests
from pydash import py_
from PIL import Image
from io import BytesIO
from zipfile import ZipFile
import time
from app.core.chrome_webdriver import ChromeWebDriver
from app.core.webdriver_tools import WebDriverTools
from app import config
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from pathlib import Path


class MangaPagina:
    """One scanned page of a manga chapter.

    Holds the raw image bytes together with the page number, the image
    extension and the ``id`` of the chapter it belongs to. ``self.id`` is the
    value returned by the builtin ``id()`` for this instance.
    """

    def __init__(
        self, manga_capitulo_id: int, numero: int, imagem: bytes, extensao: str
    ):
        # Identity and ownership first, then the page payload.
        self.id = id(self)
        self.manga_capitulo_id = manga_capitulo_id
        self.numero = numero
        self.extensao = extensao
        self.imagem = imagem

    def __repr__(self) -> str:
        # The debugging representation is the same text as ``str()``.
        return f"{self!s}"

    def __str__(self) -> str:
        return f"<MangaPagina id='{self.id!s}'>"

    @property
    def imagem_io(self):
        """Return a new ``BytesIO`` over the image bytes on every access.

        A freshly created ``BytesIO`` is already positioned at offset 0, so
        the stream can be read from the start without an explicit seek.
        """
        return BytesIO(self.imagem)


class MangaCapitulo:
    """A manga chapter: title, chapter label and its scanned pages.

    The chapter can be exported as a CBZ archive, a PDF or a single HTML file
    with the images embedded as ``data:`` URIs. Every export is best effort:
    failures are printed instead of raised.
    """

    def __init__(self, nome: str, capitulo: str, qtde_paginas: int = 0):
        self.nome = nome
        self.capitulo = capitulo
        self.qtde_paginas = qtde_paginas
        # Value of the builtin id() for this instance; pages reference it.
        self.id = id(self)
        self.paginas: list[MangaPagina] = []

    def __str__(self) -> str:
        return f"<MangaCapitulo id='{self.id!s}'>"

    def __repr__(self) -> str:
        return str(self)

    def adicionar_pagina(self, pagina: MangaPagina):
        """Append ``pagina`` to the end of the chapter's page list."""
        self.paginas.append(pagina)

    def base64_html(self, numero_pagina: int = 1) -> str:
        """Return page ``numero_pagina`` (1-based) as a base64 ``data:`` URI.

        Raises:
            IndexError: if ``numero_pagina`` is not between 1 and the number
                of pages currently stored.
        """
        # Reject 0 and negatives explicitly: Python's negative indexing would
        # otherwise silently return a page counted from the end of the list.
        if not 1 <= numero_pagina <= len(self.paginas):
            raise IndexError(
                f"Página {numero_pagina} inexistente (total: {len(self.paginas)})"
            )
        pagina = self.paginas[numero_pagina - 1]
        b64 = base64.b64encode(pagina.imagem).decode("utf-8")
        return f"data:image/{pagina.extensao};base64,{b64}"

    @property
    def folder_path(self) -> Path:
        """Output folder for this manga; created on access if it is missing."""
        folder_path = config.FILE_PATH.joinpath("mangalivre", py_.snake_case(self.nome))
        folder_path.mkdir(parents=True, exist_ok=True)
        return folder_path

    def _nome_arquivo(self) -> str:
        """Return the snake_case base name shared by every exported file."""
        return py_.snake_case(f"{self.nome}_{self.capitulo}")

    def gerar_cbz(self):
        """Write all pages into ``<folder_path>/<name>.cbz`` (a zip archive)."""
        try:
            file_path = self.folder_path.joinpath(self._nome_arquivo() + ".cbz")
            with ZipFile(file_path, "w") as zip_file:
                for pagina in self.paginas:
                    # Zero-padded names keep the pages in reading order.
                    zip_file.writestr(
                        f"pag_{pagina.numero:0>3}.{pagina.extensao}",
                        pagina.imagem_io.getvalue(),
                    )
            print(f"Arquivo *.cbz gerado em: {file_path!s}")
        except Exception as err:
            # Best-effort export: report the problem and keep going.
            print(str(err))

    def gerar_pdf(self):
        """Write all pages into ``<folder_path>/<name>.pdf``, one image per page."""
        try:
            if not self.paginas:
                # Gives a meaningful message instead of "list index out of range".
                raise ValueError("Nenhuma página disponível para gerar o PDF.")
            file_path = self.folder_path.joinpath(self._nome_arquivo() + ".pdf")
            images = []
            try:
                for pagina in self.paginas:
                    images.append(Image.open(pagina.imagem_io))
                # The first image drives the save; the rest are appended.
                images[0].save(
                    file_path,
                    "PDF",
                    resolution=100.0,
                    save_all=True,
                    append_images=images[1:],
                )
            finally:
                # Release every image that was opened, even if saving failed.
                for image in images:
                    image.close()
            print(f"Arquivo *.pdf gerado em: {file_path!s}")
        except Exception as err:
            # Best-effort export: report the problem and keep going.
            print(str(err))

    def gerar_html(self):
        """Write ``<folder_path>/<name>.html`` with every page embedded inline."""
        # Function-scope import so this block does not depend on a new
        # file-level import.
        from html import escape

        try:
            file_name = self._nome_arquivo()
            file_path = self.folder_path.joinpath(file_name + ".html")
            # One <img> per page, separated by line breaks; the data URI is
            # escaped because it is placed inside an attribute value.
            content = "\n<br><br>".join(
                f'<img src="{escape(self.base64_html(numero))}" class="rounded mx-auto d-block"/>'
                for numero in range(1, len(self.paginas) + 1)
            )
            # nome/capitulo come from the caller and may contain &, < or >.
            titulo = escape(f"{self.nome} {self.capitulo!s}")
            documento = f"""
<!doctype html>
<html lang="pt-BR">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>{escape(file_name)}</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-GLhlTQ8iRABdZLl6O3oVMWSktQOp6b7In1Zl3/Jr59b6EGGoI1aFkw7cmDA6j6gD" crossorigin="anonymous">
  <style>
    .selector-for-some-widget {{
        box-sizing: content-box;
    }}
  </style>
  </head>
  <body>
    <div class="container-fluid">
        <h1 class="display-2 text-center">{titulo}</h1>
        <br><br>
        {content}
        <br><br>
    </div>
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.bundle.min.js" integrity="sha384-w76AqPfDkMBDXo30jS1Sgez6pr3x5MlQ1ZAGC+nuZB+EYdgRZgiwxhTBTkF7CXvN" crossorigin="anonymous"></script>
  </body>
</html>
        """
            # The document declares charset utf-8, so it must be written as
            # utf-8 regardless of the platform's default encoding.
            file_path.write_text(documento.strip(), encoding="utf-8")
            print(f"Arquivo *.html gerado em: {file_path!s}")
        except Exception as err:
            # Best-effort export: report the problem and keep going.
            print(str(err))


class MangaLivre:
    """Scraper for MangaLivre chapter lists and chapter page scans.

    Browser automation goes through the project's ``ChromeWebDriver`` context
    manager; the page images themselves are downloaded with ``requests``.
    """

    DELAY = 5  # forwarded to ChromeWebDriver as ``implicitly_wait``
    VISIBILITY_TIME = 10  # WebDriverWait timeout, in seconds
    INFINITE_SCROLL_TIME = 5  # pause between scroll steps (time.sleep seconds)
    REQUEST_TIMEOUT = 30  # seconds allowed for each image download

    def __init__(self, **kwargs):
        """Create the scraper; ``kwargs`` are forwarded to ``ChromeWebDriver``."""
        self._chrome_webdriver = ChromeWebDriver(implicitly_wait=self.DELAY, **kwargs)

    @staticmethod
    def _extensao_da_url(url: str, padrao: str = "jpg") -> str:
        """Return the extension (without the dot) of the path part of ``url``.

        Query strings and fragments are ignored. ``padrao`` is returned when
        the path has no extension at all.
        """
        # Function-scope import so this block does not depend on a new
        # file-level import.
        from urllib.parse import urlparse

        sufixo = Path(urlparse(url).path).suffix
        return sufixo.lstrip(".") or padrao

    def _obter_scans(self, url: str) -> dict[str, dict[str, bytes | str]]:
        """Walk through a chapter in the reader and download every page image.

        Returns:
            A dict keyed by the zero-padded page number (``"001"``, ``"002"``,
            ...) whose values are ``{"raw": <image bytes>, "ext": <extension>}``.

        Raises:
            requests.HTTPError: if an image download returns an error status.
        """
        pagina = {}
        xpath_image = '//div[@class="manga-image"]/picture/img'
        xpath_next = '//div[@class="page-next"]'
        with self._chrome_webdriver as webdriver:
            webdriver.get(url)
            num_pag = 1
            # The loop ends once the current URL contains "#comments",
            # presumably the anchor the reader jumps to after the last page
            # (TODO confirm against the site).
            while "#comments" not in webdriver.current_url:
                elem = WebDriverWait(webdriver, self.VISIBILITY_TIME).until(
                    EC.visibility_of_element_located((By.XPATH, xpath_image))
                )
                img_src = elem.get_attribute("src")
                # A timeout prevents a stalled download from hanging forever,
                # and the status check keeps error pages out of the scans.
                resposta = requests.get(img_src, timeout=self.REQUEST_TIMEOUT)
                resposta.raise_for_status()
                pagina[str(num_pag).zfill(3)] = {
                    "raw": resposta.content,
                    "ext": self._extensao_da_url(img_src),
                }
                next_button = WebDriverWait(webdriver, self.VISIBILITY_TIME).until(
                    EC.visibility_of_element_located((By.XPATH, xpath_next))
                )
                next_button.click()
                num_pag += 1
        return pagina

    def listar_capitulos(self, url: str) -> list[str]:
        """Return the ``href`` of every chapter listed on the manga page ``url``.

        The page is scrolled one screen height at a time, sleeping
        ``INFINITE_SCROLL_TIME`` seconds after each step, until the scroll
        target passes the document height; only then are the links collected.
        """
        with self._chrome_webdriver as webdriver:
            webdriver.get(url)
            screen_height = webdriver.execute_script("return window.screen.height;")
            i = 1
            while True:
                webdriver.execute_script(f"window.scrollTo(0, {screen_height}*{i});")
                i += 1
                time.sleep(self.INFINITE_SCROLL_TIME)
                scroll_height = webdriver.execute_script(
                    "return document.body.scrollHeight;"
                )
                # Stop once the next scroll target would be past the bottom.
                if screen_height * i > scroll_height:
                    break
            itens = webdriver.find_elements(
                By.XPATH, '//ul[@class="full-chapters-list list-of-chapters"]/li'
            )
            # Read the hrefs while the driver context is still open.
            urls = [
                item.find_element(By.TAG_NAME, "a").get_attribute("href")
                for item in itens
            ]
        return urls

    def extrair_manga(self, nome: str, url: str, capitulo: str = "") -> MangaCapitulo:
        """Download the chapter at ``url`` and return it as a ``MangaCapitulo``.

        When ``capitulo`` is empty, the last path segment of ``url`` is used
        as the chapter label.
        """
        # rstrip keeps a trailing "/" from producing an empty chapter label.
        capitulo = capitulo or url.rstrip("/").split("/").pop()
        pag_scans = self._obter_scans(url)
        manga = MangaCapitulo(
            nome=nome, capitulo=capitulo, qtde_paginas=len(pag_scans)
        )
        for pag, img in pag_scans.items():
            manga.adicionar_pagina(
                MangaPagina(
                    manga_capitulo_id=manga.id,
                    numero=int(pag),
                    imagem=img["raw"],
                    extensao=img["ext"],
                )
            )
        return manga


## Listar Capítulos

In [3]:
# Manga page URL whose chapter list will be scraped.
url = "https://mangalivre.net/manga/one-punch-man/1036"

# No keyword arguments here, so only implicitly_wait reaches ChromeWebDriver.
mangalivre = MangaLivre()
capitulos = mangalivre.listar_capitulos(url=url)

# Bare expression: the notebook displays the list of chapter URLs.
capitulos

['https://mangalivre.net/ler/one-punch-man/online/435276/212',
 'https://mangalivre.net/ler/one-punch-man/online/428424/211',
 'https://mangalivre.net/ler/one-punch-man/online/425379/210',
 'https://mangalivre.net/ler/one-punch-man/online/422663/209',
 'https://mangalivre.net/ler/one-punch-man/online/419498/208',
 'https://mangalivre.net/ler/one-punch-man/online/416127/207',
 'https://mangalivre.net/ler/one-punch-man/online/410455/206',
 'https://mangalivre.net/ler/one-punch-man/online/402716/172',
 'https://mangalivre.net/ler/one-punch-man/online/399519/171',
 'https://mangalivre.net/ler/one-punch-man/online/396311/170',
 'https://mangalivre.net/ler/one-punch-man/online/393113/169',
 'https://mangalivre.net/ler/one-punch-man/online/390210/168-2',
 'https://mangalivre.net/ler/one-punch-man/online/387241/168',
 'https://mangalivre.net/ler/one-punch-man/online/384031/167-2',
 'https://mangalivre.net/ler/one-punch-man/online/381501/167',
 'https://mangalivre.net/ler/one-punch-man/online/3

## Extração

In [2]:
# Manga name (used to build the output folder and file names) and the reader
# URL of the single chapter to extract.
nome = "one-punch-man"
url = "https://mangalivre.net/ler/one-punch-man/online/428424/211"

# headless=False is forwarded to ChromeWebDriver; presumably it opens a
# visible browser window (TODO confirm in ChromeWebDriver).
mangalivre = MangaLivre(headless=False)
# capitulo is omitted, so the last URL segment ("211") becomes the chapter label.
manga = mangalivre.extrair_manga(nome=nome, url=url)

# Export the chapter in each supported format; each call prints the path it
# wrote, or the error message if the export failed.
manga.gerar_pdf()
manga.gerar_html()
manga.gerar_cbz()
# Bare expression: the notebook displays the chapter object.
manga

Arquivo *.pdf gerado em: files/mangalivre/one_punch_man/one_punch_man_211.pdf
Arquivo *.html gerado em: files/mangalivre/one_punch_man/one_punch_man_211.html
Arquivo *.cbz gerado em: files/mangalivre/one_punch_man/one_punch_man_211.cbz


<MangaCapitulo id='139803046911456'>

In [None]:
# Re-run only the PDF export for the chapter currently held in `manga`.
manga.gerar_pdf()


## Extração em Massa

In [None]:
# Bulk extraction: list every chapter of the manga, then download and export
# each one in all three formats.
nome = "One Punch Man"
url = "https://mangalivre.net/manga/one-punch-man/1036"

mangalivre = MangaLivre()
capitulos = mangalivre.listar_capitulos(url=url)

for url_capitulo in capitulos:
    try:
        # A new MangaLivre (and therefore a new ChromeWebDriver wrapper) is
        # created for every chapter.
        mangalivre = MangaLivre()
        manga = mangalivre.extrair_manga(nome=nome, url=url_capitulo)
        manga.gerar_pdf()
        manga.gerar_html()
        manga.gerar_cbz()
    except Exception as err:
        # One failing chapter must not stop the batch, but the cause is
        # reported so the failure can be diagnosed afterwards.
        print(
            f"Erro ao capturar a url: {url_capitulo}! ({type(err).__name__}: {err})"
        )