# EXTRAÇÃO DE MANGÁ PELO SITE MANGALIVRE

## Setup

É necessário instalar as seguintes dependências no SO Linux (Ubuntu):

In [None]:
# Refresh the apt index, then install chromium-chromedriver and openssl without recommended extras.
! sudo apt update -qq && sudo apt install --no-install-recommends -y chromium-chromedriver openssl

Se a opção acima não funcionar, execute os 2 passos a seguir:

- 1/2 - Instale o Google Chrome:

In [None]:
# Install helper packages, download Google's stable Chrome .deb, install it, and print the installed version.
! sudo apt-get install -y curl unzip xvfb libxi6 libgconf-2-4 && \
    wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
    sudo apt install ./google-chrome-stable_current_amd64.deb -y && \
    google-chrome --version

- 2/2 - Instale o Chromedriver compatível com a versão do Google Chrome instalada. Neste momento, a versão é a `108.0.5359.71`:

In [None]:
# Download chromedriver 108.0.5359.71, unzip it, and install it as a root-owned executable at /usr/bin/chromedriver.
! wget https://chromedriver.storage.googleapis.com/108.0.5359.71/chromedriver_linux64.zip && \
    unzip chromedriver_linux64.zip && \
    sudo mv chromedriver /usr/bin/chromedriver && \
    sudo chown root:root /usr/bin/chromedriver && \
    sudo chmod +x /usr/bin/chromedriver

> [https://chromedriver.chromium.org](https://chromedriver.chromium.org)

Após instalar o Chrome/Chromium e o Chromedriver, é necessária a instalação das dependências do Python abaixo:

In [None]:
# Add pydash and selenium-wire to the Poetry project.
! poetry add pydash selenium-wire

In [7]:
from __future__ import annotations
import traceback
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from seleniumwire.webdriver import Chrome
from selenium.webdriver.common.by import By
from pathlib import Path
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Absolute path of the current working directory; passed to Chrome below as
# its default download directory.
FILE_DIR = str(Path(".").resolve())

class ChromeWebDriver:
    """Lazily start a headless selenium-wire Chrome session and manage its lifetime.

    The browser is only launched the first time :attr:`webdriver` is read
    (entering the object as a context manager does that too).  Every outgoing
    request goes through :meth:`_interceptor`, which forces the configured
    headers.  After :meth:`quit` the wrapper can be used again: the next access
    to :attr:`webdriver` starts a new browser.

    Args:
        implicitly_wait: Implicit wait, in seconds, applied to a session when it
            is created.  A falsy value (the default ``0``) configures none.
        headers: Optional mapping of header name to value that is forced onto
            every request.  Anything that is not a ``dict`` is ignored.
    """

    def __init__(self, implicitly_wait: int = 0, headers: dict[str, str] | None = None):
        self._webdriver: WebDriver | None = None
        self._implicitly_wait = implicitly_wait
        self._headers: dict[str, str] = {}
        if headers and isinstance(headers, dict):
            self._headers.update(headers)
        self._options = Options()
        prefs = {
            "download.default_directory": FILE_DIR,
            "download.prompt_for_download": False,
            "directory_upgrade": True,
            "safebrowsing.enabled": True,
        }
        self._options.add_experimental_option("prefs", prefs)
        self._options.add_experimental_option(
            "excludeSwitches", ["load-extension", "enable-automation"]
        )
        self._options.add_argument("--disable-extensions")
        self._options.add_argument("--headless")
        self._options.add_argument("--no-sandbox")
        self._options.add_argument("--disable-dev-shm-usage")
        self._options.add_argument("window-size=1366,768")

    def __enter__(self) -> WebDriver:
        """Return the (lazily created) browser session."""
        return self.webdriver

    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
        """Print the traceback of a pending exception, then quit the browser.

        Returns ``None``, so the exception still propagates to the caller.
        """
        try:
            if exc_type:
                print(
                    "".join(
                        traceback.format_exception(exc_type, exc_value, exc_traceback)
                    )
                )
        finally:
            # The browser must be released even if printing fails.
            self.quit()

    def __del__(self) -> None:
        """Do nothing.

        The previous body was ``del self``, which only unbinds the local name.
        The method is kept so explicit ``obj.__del__()`` calls keep working.
        """

    def _interceptor(self, request):
        """Overwrite each configured header on an outgoing ``request``."""
        for hk, hv in self._headers.items():
            # Test membership rather than truthiness so that an existing header
            # with an empty value is also removed before the new one is set.
            if hk in request.headers:
                del request.headers[hk]
            request.headers[hk] = hv

    @property
    def webdriver(self) -> WebDriver:
        """The live browser session, started on first access."""
        if self._webdriver is None:
            # NOTE(review): the positional "chromedriver" argument is presumably
            # the driver executable path - confirm the installed Selenium
            # release still accepts it in this position.
            self._webdriver = Chrome(
                "chromedriver", options=self._options
            )
            self._webdriver.request_interceptor = self._interceptor
            if self._implicitly_wait:
                self._webdriver.implicitly_wait(self._implicitly_wait)
        return self._webdriver

    def quit(self):
        """Quit the browser if one is running; calling it again is a no-op."""
        # Clear the reference first so a closed session is never handed out
        # again by the `webdriver` property.
        driver, self._webdriver = self._webdriver, None
        if driver is not None:
            driver.quit()


class MangaPagina:
    """A single page of a chapter and the image URLs collected for it.

    Args:
        manga_capitulo_id: ``id`` of the chapter object this page belongs to.
        pagina: Page number inside the chapter.
        imagens: Image URLs found on the page; the list is stored as given,
            not copied.
    """

    def __init__(self, manga_capitulo_id: int, pagina: int, imagens: list[str]):
        # Attribute order is deliberate: the notebook prints ``__dict__``.
        self.pagina = pagina
        self.imagens: list[str] = imagens
        self.manga_capitulo_id = manga_capitulo_id
        # Identity of this very object, as reported by the built-in ``id``.
        self.id = id(self)

    def __repr__(self) -> str:
        return "<MangaPagina id='{}'>".format(self.id)


class MangaCapitulo:
    """A chapter of a manga plus the pages collected for it so far.

    Args:
        nome: Title of the manga.
        capitulo: Chapter identifier.
        qtde_paginas: Number of pages in the chapter; defaults to ``0`` and is
            not updated by :meth:`adicionar_pagina`.
    """

    def __init__(self, nome: str, capitulo: str, qtde_paginas: int = 0):
        # Attribute order is deliberate: the notebook prints ``__dict__``.
        self.nome = nome
        self.capitulo = capitulo
        self.qtde_paginas = qtde_paginas
        # Identity of this very object, as reported by the built-in ``id``.
        self.id = id(self)
        self.paginas: list[MangaPagina] = []

    def adicionar_pagina(self, pagina: MangaPagina):
        """Append ``pagina`` to the end of :attr:`paginas`."""
        self.paginas.append(pagina)

    def __repr__(self) -> str:
        return "<MangaCapitulo id='{}'>".format(self.id)


class MangaLivre:
    """Extract the page images of one chapter from the MangaLivre online reader.

    An extraction opens the chapter's reader URL in a headless browser, then
    visits one ``#/!page<n>`` fragment per remaining page and records the
    ``src`` of the page image found each time.
    """

    # Timeout, in seconds, given to every explicit wait on the reader page.
    DELAY = 120

    # XPath of the page image inside the reader.
    _XPATH_IMAGEM = '//picture/img'
    # XPath of the element whose text is parsed as the chapter's page count.
    _XPATH_TOTAL = '//em[@reader-total-pages=""]'

    def __init__(self):
        self._delay = self.DELAY
        self._chrome_webdriver = self._novo_chrome_webdriver()
        # Not read anywhere in this class; kept so the attribute still exists.
        self.test = []

    def _novo_chrome_webdriver(self) -> ChromeWebDriver:
        """Build the browser wrapper used for one extraction.

        No implicit wait is configured: every lookup below uses an explicit
        ``WebDriverWait``, and Selenium's documentation warns that mixing both
        kinds of wait gives unpredictable timeouts.
        """
        return ChromeWebDriver()

    def _aguardar_src_imagem(self, wait: WebDriverWait) -> str | None:
        """Wait for the page image to be present and return its ``src``."""
        elem = wait.until(
            EC.presence_of_element_located((By.XPATH, self._XPATH_IMAGEM))
        )
        return elem.get_attribute("src")

    def _obter_scans(self, url: str) -> dict[int, list[str]]:
        """Collect the image URL of every page of the chapter at ``url``.

        Args:
            url: Reader URL of the chapter (its first page).

        Returns:
            Mapping of 1-based page number to the list of image URLs recorded
            for that page (one URL per page).

        Raises:
            ValueError: If the page-count element's text is not an integer.
            selenium.common.exceptions.TimeoutException: If an awaited element
                does not show up within ``self._delay`` seconds.
        """
        # Leaving the ``with`` block quits the browser, so a wrapper that has
        # already served one extraction cannot be trusted for the next one.
        self._chrome_webdriver = self._novo_chrome_webdriver()
        with self._chrome_webdriver as webdriver:
            wait = WebDriverWait(webdriver, self._delay)
            pagina: dict[int, list[str]] = {}
            webdriver.get(url)
            pagina[1] = [self._aguardar_src_imagem(wait)]
            total_elem = wait.until(
                EC.presence_of_element_located((By.XPATH, self._XPATH_TOTAL))
            )
            total = int(total_elem.text)
            for n in range(1, total):
                print(f"pagina: {n}")
                # Fragment ``page<n>`` is stored as page ``n + 1``: the bare URL
                # loaded above already counted as page 1.
                webdriver.get(f"{url}#/!page{n!s}")
                # NOTE(review): only the URL fragment changes here, so the
                # previous page's image may still be in the DOM when the wait
                # returns - confirm the reader swaps it before this lookup.
                pagina[n + 1] = [self._aguardar_src_imagem(wait)]
        return pagina

    def extrair_manga(self, nome: str, url: str) -> MangaCapitulo:
        """Scrape the chapter at ``url`` and return it as a ``MangaCapitulo``.

        Args:
            nome: Title stored on the returned chapter.
            url: Reader URL of the chapter; its last path segment becomes the
                chapter identifier.

        Returns:
            The chapter with one ``MangaPagina`` per scraped page.
        """
        # Tolerate a trailing "/" so the identifier is not an empty string.
        capitulo = url.rstrip("/").split("/").pop()
        pag_scans = self._obter_scans(url)
        manga = MangaCapitulo(nome=nome, capitulo=capitulo, qtde_paginas=len(pag_scans))
        for pag, imgs in pag_scans.items():
            pagina = MangaPagina(manga_capitulo_id=manga.id, pagina=int(pag), imagens=imgs)
            manga.adicionar_pagina(pagina)
        return manga


## Extração

In [8]:
# Run a full extraction of one chapter. This drives a headless Chrome session
# and prints one "pagina: N" line per reader page it visits.
mangalivre = MangaLivre()
manga = mangalivre.extrair_manga(
    nome="One Punch Man",
    url="https://mangalivre.net/ler/one-punch-man/online/428424/211",
)
# Last expression of the cell: the notebook shows the chapter's repr.
manga

pagina: 1
pagina: 2
pagina: 3
pagina: 4
pagina: 5
pagina: 6
pagina: 7
pagina: 8
pagina: 9
pagina: 10
pagina: 11
pagina: 12
pagina: 13
pagina: 14
pagina: 15
pagina: 16
pagina: 17
pagina: 18
pagina: 19
pagina: 20
pagina: 21
pagina: 22
pagina: 23
pagina: 24
pagina: 25
pagina: 26
pagina: 27


<MangaCapitulo id='140140561422304'>

In [9]:
# Dump the chapter's attributes, then every page's attributes, one per line.
print(vars(manga))
for pagina in manga.paginas:
    print(vars(pagina))

{'nome': 'One Punch Man', 'capitulo': '211', 'qtde_paginas': 28, 'id': 140140561422304, 'paginas': [<MangaPagina id='140140561427392'>, <MangaPagina id='140140561421728'>, <MangaPagina id='140140561418560'>, <MangaPagina id='140140561427344'>, <MangaPagina id='140140561511520'>, <MangaPagina id='140140605665920'>, <MangaPagina id='140140605654976'>, <MangaPagina id='140140605653680'>, <MangaPagina id='140140563210464'>, <MangaPagina id='140140562347968'>, <MangaPagina id='140140562341776'>, <MangaPagina id='140140562338416'>, <MangaPagina id='140140562349168'>, <MangaPagina id='140140562342496'>, <MangaPagina id='140140610626000'>, <MangaPagina id='140140605626528'>, <MangaPagina id='140140605634928'>, <MangaPagina id='140140605630464'>, <MangaPagina id='140140605633824'>, <MangaPagina id='140140606055440'>, <MangaPagina id='140140606057744'>, <MangaPagina id='140140560666480'>, <MangaPagina id='140140560671184'>, <MangaPagina id='140140560666288'>, <MangaPagina id='140140560678432'>, 

In [None]:
from IPython import display
import requests

# Preview the first image of the extracted page at index 10.
paginas_alvo = manga.paginas[10:11]
urls_alvo = paginas_alvo[0].imagens[:1] if paginas_alvo else []
if not urls_alvo:
    # Fail with a clear message instead of a NameError on `content` below.
    raise ValueError("manga has no image at page index 10 to preview")

# The timeout keeps the cell from hanging on a stalled download, and
# raise_for_status avoids handing an HTTP error body to the image renderer.
resposta = requests.get(urls_alvo[0], timeout=30)
resposta.raise_for_status()
content = resposta.content

display.Image(content)

## Desenvolvimento

In [None]:
# Start a notebook-level browser session for the experiments below; reading
# `.webdriver` is what actually launches Chrome.
chrome_webdriver = ChromeWebDriver()
webdriver = chrome_webdriver.webdriver
webdriver

In [None]:
def imagens(url: str) -> list[tuple[int, str]]:
    """Collect ``(page_index, src)`` pairs for every reader image of a chapter.

    Uses the notebook-level ``webdriver`` session. The bare ``url`` is
    recorded under index ``0``; each ``#/!page<n>`` fragment, for ``n`` from 1
    up to the reader's page count (exclusive), is recorded under index ``n``.

    Args:
        url: Reader URL of the chapter.

    Returns:
        One ``(page_index, image_src)`` tuple per image element found, in
        visiting order.
    """
    xpath_image = '//div[@class="manga-image"]/picture/img'

    def _coletar(indice: int) -> list[tuple[int, str]]:
        # Images of whatever page the browser is currently showing.
        return [
            (indice, elem.get_attribute("src"))
            for elem in webdriver.find_elements(By.XPATH, xpath_image)
        ]

    webdriver.get(url)
    images = _coletar(0)

    total = int(webdriver.find_element(By.XPATH, '//em[@reader-total-pages=""]').text)

    for n in range(1, total):
        webdriver.get(f"{url}#/!page{n!s}")
        images.extend(_coletar(n))

    return images

In [None]:
# Collect every (page_index, src) pair of the chapter and show the list.
imgs = imagens("https://mangalivre.net/ler/one-punch-man/online/428424/211")
imgs

In [None]:
url = "https://mangalivre.net/ler/one-punch-man/online/428424/211#/!page4"
xpath_image = '//div[@class="manga-image"]/picture/img'
images = []
# The context manager quits its browser on exit. Binding it to its own name
# keeps the notebook-level `webdriver` pointing at a session that is still
# alive for the cells that use it afterwards.
with ChromeWebDriver() as driver:
    driver.get(url)
    elems = driver.find_elements(By.XPATH, xpath_image)
    images.extend((0, elem.get_attribute("src")) for elem in elems)


In [None]:
url = "https://mangalivre.net/ler/one-punch-man/online/428424/211#/!page4"
# The context manager quits its browser on exit. Binding it to its own name
# keeps the notebook-level `webdriver` pointing at a session that is still
# alive for the cells that use it afterwards.
with ChromeWebDriver() as driver:
    driver.get(url)
    source = driver.page_source

# Create the target folder if needed and write with an explicit encoding so
# non-ASCII page text does not depend on the platform's default.
destino = Path("./files/page.html")
destino.parent.mkdir(parents=True, exist_ok=True)
destino.write_text(source, encoding="utf-8")

In [None]:
# Show the text of the reader's total-pages element on the page currently
# loaded in `webdriver` (the same element the scraper parses as the page count).
webdriver.find_element(By.XPATH, '//em[@reader-total-pages=""]').text


In [None]:
from IPython import display

# Render the first collected image by embedding its src URL in an <img> tag.
display.HTML(f'<img src="{images[0][1]}"/>')

In [None]:
# Close the browser session currently bound to `webdriver`.
webdriver.quit()