-
-
Notifications
You must be signed in to change notification settings - Fork 293
/
volarenovels.py
95 lines (79 loc) · 3.15 KB
/
volarenovels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
import logging
from urllib.parse import quote

from lncrawl.core.crawler import Crawler
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# URL template for a novel's page; the %s placeholder is filled with the
# novel's slug taken from a search result.
book_url = "https://www.volarenovels.com/novel/%s"
# URL template for the site's search API; the %s placeholder is filled with
# the search text. NOTE(review): `count=5` presumably caps the number of
# results returned by the server — confirm against the API.
search_url = "https://www.volarenovels.com/api/novels/search?query=%s&count=5"
class VolareNovelsCrawler(Crawler):
    """Crawler for novels hosted on volarenovels.com."""

    base_url = "https://www.volarenovels.com/"

    def __parse_toc(self, soup):
        """Parse the table of contents out of a novel page.

        Args:
            soup: Parsed novel page.

        Returns:
            A ``(volumes, chapters)`` tuple. Each volume is a dict with
            ``id`` and ``title``; each chapter is a dict with ``id``,
            ``volume`` (the owning volume's id), ``title`` and ``url``.
        """
        volumes = []
        chapters = []
        for panel in soup.select("#TableOfContents #accordion .panel"):
            label_tag = panel.select_one("h4.panel-title span")
            label = label_tag.text.strip() if label_tag else ""
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters (e.g. superscript digits) that int() rejects.
            # Anything non-numeric falls back to sequential numbering.
            vol_id = int(label) if label.isdecimal() else len(volumes) + 1

            title_tag = panel.select_one("h4.panel-title .title a")
            volumes.append(
                {
                    "id": vol_id,
                    "title": title_tag.text.strip() if title_tag else "",
                }
            )

            for link in panel.select(".list-chapters li a"):
                chapters.append(
                    {
                        # Chapter ids run continuously across volumes.
                        "id": len(chapters) + 1,
                        "volume": vol_id,
                        "title": link.text.strip(),
                        "url": self.absolute_url(link["href"]),
                    }
                )
        return (volumes, chapters)

    def search_novel(self, query):
        """Get a list of ``{title, url}`` dicts matching the given query.

        Args:
            query: Free-text search string.

        Returns:
            At most five result dicts; an empty list when the response
            carries no items.
        """
        # Percent-encode the query so characters such as '&', '#', '+' or
        # '%' cannot corrupt the query string.
        url = search_url % quote(str(query), safe="")
        logger.info("Visiting %s ...", url)

        payload = self.get_json(url)
        items = payload.get("items") if isinstance(payload, dict) else None
        return [
            {
                "title": item["name"],
                "url": book_url % item["slug"],
            }
            for item in (items or [])[:5]
        ]

    def read_novel_info(self):
        """Load title, author, cover, volumes and chapters of the novel.

        Reads ``self.novel_url`` and sets ``self.novel_title``,
        ``self.volumes`` and ``self.chapters``. ``self.novel_author`` and
        ``self.novel_cover`` are set only when the page provides them.

        Raises:
            AssertionError: If the page has no title element.
        """
        logger.debug("Visiting %s", self.novel_url)
        soup = self.get_soup(self.novel_url)

        possible_title = soup.select_one("#content-container h3.title")
        assert possible_title, "No novel title"
        self.novel_title = possible_title.text.strip()
        logger.info("Novel title: %s", self.novel_title)

        # Author and cover are optional: their absence is logged, never
        # raised, so a novel without them can still be read.
        info_paragraphs = soup.select("#content-container .p-tb-10-rl-30 p")
        if len(info_paragraphs) > 1:
            self.novel_author = info_paragraphs[1].text.strip()
        else:
            logger.debug("Author not found on %s", self.novel_url)
        logger.info("Novel author: %s", self.novel_author)

        cover_img = soup.select_one("#content-container .md-d-table img")
        cover_src = cover_img.get("src") if cover_img else None
        if cover_src:
            self.novel_cover = self.absolute_url(cover_src)
        else:
            logger.debug("Cover not found on %s", self.novel_url)
        logger.info("Novel cover: %s", self.novel_cover)

        # Extract volume-wise chapter entries
        self.volumes, self.chapters = self.__parse_toc(soup)

    def download_chapter_body(self, chapter):
        """Download a chapter page and return its cleaned body as HTML.

        Args:
            chapter: Chapter dict; its ``url`` entry is fetched.

        Returns:
            The visible content container serialized to a string, with
            navigation, hidden text and similar junk nodes removed. An
            empty string when the page has no visible content container.
        """
        logger.info("Visiting: %s", chapter["url"])
        soup = self.get_soup(chapter["url"])

        content = soup.select_one(".fr-view:not(.hidden)")
        if content is None:
            logger.warning("No chapter content found at %s", chapter["url"])
            return ""

        junk_selector = (
            ".chapter-nav, .hidden-text, .__cf_email__, "
            "p[data-f-id='pbf'], span[style*=\"font-size: 0\"]"
        )
        for bad in content.select(junk_selector):
            bad.extract()
        return str(content)