uukanshu.py
# -*- coding: utf-8 -*-
import logging

from bs4 import Tag

from lncrawl.core.crawler import Crawler
from lncrawl.models import Chapter, Volume
from sources.zh.uukanshu_sj import UukanshuOnlineSJ

logger = logging.getLogger(__name__)
novel_search_url = "%ssearch.aspx?k=%s"


class UukanshuOnline(Crawler):
    # www. is simplified Chinese, tw. is traditional Chinese; both use the same site structure
    base_url = ["https://www.uukanshu.net/", "https://tw.uukanshu.net/"]
    encoding = "gbk"

    def initialize(self):
        # the default lxml parser cannot handle the huge gbk-encoded pages (fails after ~4.3k chapters)
        self.init_parser("html.parser")

    def read_novel_info(self) -> None:
        # tw. serves utf-8 while www. serves gbk; using the wrong encoding garbles the output
        if "tw." in self.novel_url:
            self.encoding = "utf-8"
        soup = self.get_soup(self.novel_url, encoding=self.encoding)

        info = soup.select_one("dl.jieshao")
        assert info  # if this fails, the HTML structure has fundamentally changed -> needs an update

        meta = info.select_one("dd.jieshao_content")
        img = info.select_one("dt.jieshao-img img")
        if img:
            self.novel_cover = self.absolute_url(img["src"])
        self.novel_title = meta.select_one("h1 > a").text
        self.novel_author = meta.select_one("h2 > a").text
        self.novel_synopsis = meta.select_one("h3 > p").text

        chapters = soup.select_one("ul#chapterList")
        for chapter in list(chapters.children)[::-1]:  # reverse order, as the list is newest to oldest
            # convince the type checker that we are looking at Tags and skip stray text nodes inside the ul
            if not isinstance(chapter, Tag):
                continue
            # volume headers are <li class="volume">; everything else is a chapter entry
            if chapter.has_attr("class") and "volume" in chapter["class"]:
                self.volumes.append(
                    Volume(
                        id=len(self.volumes) + 1,
                        title=chapter.text.strip(),
                    )
                )
                continue
            anchor = chapter.select_one("a")
            if not anchor:
                logger.warning("Found <li> in chapter list that is not a volume and has no link: %s", chapter)
                continue
            self.chapters.append(
                Chapter(
                    id=len(self.chapters) + 1,
                    url=self.absolute_url(anchor["href"]),
                    title=anchor.text,
                    volume=len(self.volumes),
                )
            )

    def download_chapter_body(self, chapter: Chapter) -> str:
        soup = self.get_soup(chapter.url, encoding=self.encoding)
        content = soup.select_one("div#contentbox")
        # apply the same filters already implemented for the essentially identical SJ site
        return UukanshuOnlineSJ.format_text(self.cleaner.extract_contents(content))
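

# --- Hedged usage sketch (illustration only, not part of the crawler) -------
# The lncrawl application normally discovers and drives this class itself; the
# block below only sketches, under a __main__ guard, how the hooks defined
# above fit together. The novel URL is a hypothetical placeholder, and driving
# the crawler manually like this assumes the base Crawler needs no further
# setup, which may not hold for every lncrawl version.
if __name__ == "__main__":
    crawler = UukanshuOnline()
    crawler.initialize()  # switch to html.parser, as the framework would do
    crawler.novel_url = "https://www.uukanshu.net/b/12345/"  # hypothetical URL
    crawler.read_novel_info()  # populates novel_title, volumes and chapters
    print(crawler.novel_title, "-", len(crawler.chapters), "chapters")
    if crawler.chapters:
        # fetch and clean the first chapter's body
        print(crawler.download_chapter_body(crawler.chapters[0])[:200])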