-
-
Notifications
You must be signed in to change notification settings - Fork 288
/
faqwiki.py
143 lines (119 loc) · 5.51 KB
/
faqwiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
import logging
from bs4.element import Tag
from lncrawl.core.crawler import Crawler
from lncrawl.models import Volume, Chapter, SearchResult
logger = logging.getLogger(__name__)
class FaqWiki(Crawler):
    """Crawler for https://faqwiki.us/ — a WordPress site hosting MTL novels.

    Scrapes novel metadata from the page header / review block, builds the
    chapter list from the `lcp_instance_0` list widget, and searches via the
    site's `?s=...&post_type=page` endpoint.
    """

    base_url = ["https://faqwiki.us/"]
    has_manga = False
    has_mtl = True

    def initialize(self) -> None:
        # There's about 4+ ads as img tags within each chapter.
        # Have not yet seen an img be part of any chapter, worst case we'll miss out on it.
        self.cleaner.bad_tags.add("img")

    def read_novel_info(self):
        """Populate title, author, cover, tags, synopsis and the chapter/volume lists."""
        soup = self.get_soup(self.novel_url)
        content = soup.select_one(".entry-content")

        entry_title = soup.select_one("h1.entry-title")
        assert isinstance(entry_title, Tag)  # this must be here, is part of normal site structure/framework
        self.novel_title = entry_title.text.strip()
        # Completed novels carry a " – All Chapters" suffix on the title; strip it.
        suffix = " – All Chapters"
        if self.novel_title.endswith(suffix):
            self.novel_title = self.novel_title[: -len(suffix)]

        # Fallback author; overwritten below when the metadata block is present.
        self.novel_author = "FaqWiki"

        cover = content.select_one(".wp-block-image img")
        # cover is missing in some rarer cases
        if cover:
            src = str(cover["src"])
            # src may be a JS-replaced data: placeholder after load; in that case
            # try to get the real image hidden in the data-ezsrc attribute.
            if src.startswith("data:"):
                try:
                    src = cover["data-ezsrc"]
                except KeyError:
                    pass
            self.novel_cover = self.absolute_url(src)
        # remove any optimized-image-size GET args from the novel cover URL
        if self.novel_cover and "?" in self.novel_cover:
            self.novel_cover = self.novel_cover[: self.novel_cover.find("?")]

        metadata_container = soup.select_one("div.book-review-block__meta-item-value")
        # Label strings as they appear, in order, inside the metadata text blob.
        keywords = {
            "desc": "Description:",
            "alt_name": "Alternate Names:",
            "genre": "Genre:",
            "author": "Author(s):",
            "status": "Status:",
            "original_pub": "Original Publisher:",
        }
        if metadata_container:
            metadata = metadata_container.text  # doesn't have line breaks anyway so not splitting here
            # For each label record where it starts (<key>_start) and where its
            # value begins (<key>); a field's value ends where the next label starts.
            # NOTE(review): if a label is missing, str.find yields -1 and the
            # slices below silently misbehave — TODO confirm labels are always present.
            pos_dict = {}
            for key, sep in keywords.items():
                pos_dict[key + "_start"] = metadata.find(sep)
                pos_dict[key] = metadata.find(sep) + len(sep)
            self.novel_synopsis = metadata[pos_dict["desc"]:pos_dict["alt_name_start"]].strip()
            self.novel_tags = metadata[pos_dict["genre"]:pos_dict["author_start"]].strip().split(" ")
            self.novel_author = metadata[pos_dict["author"]:pos_dict["status_start"]].strip()

        logger.info("Novel title: %s", self.novel_title)
        logger.info("Novel synopsis: %s", self.novel_synopsis)
        logger.info("Novel tags: %s", ",".join(self.novel_tags))
        logger.info("Novel author: %s", self.novel_author)
        logger.info("Novel cover: %s", self.novel_cover)

        chap_list = soup.select_one("#lcp_instance_0").select("li>a")
        for idx, a in enumerate(chap_list):
            # Skip non-chapter entries mixed into the list widget.
            if "chapter" not in a.text.lower():
                continue
            chap_id = 1 + idx
            # Group chapters into volumes of 100.
            # NOTE(review): vol_id derives from len(self.chapters) while the
            # volume-creation trigger derives from chap_id (idx-based); if any
            # entry is skipped above, the two can drift apart — verify lists
            # never interleave non-chapter links mid-stream.
            vol_id = 1 + len(self.chapters) // 100
            vol_title = f"Volume {vol_id}"
            if chap_id % 100 == 1:
                self.volumes.append(
                    Volume(
                        id=vol_id,
                        title=vol_title,
                    ))
            # chapter name is only (sometimes) present in chapter page, not in overview
            entry_title = f"Chapter {chap_id}"
            self.chapters.append(
                Chapter(
                    id=chap_id,
                    url=self.absolute_url(a["href"]),
                    title=entry_title,
                    volume=vol_id,
                    volume_title=vol_title,
                ),
            )

    def download_chapter_body(self, chapter):
        """Fetch a chapter page and return its cleaned entry-content HTML."""
        soup = self.get_soup(chapter.url)
        contents_html = soup.select_one("div.entry-content")
        contents_html = self.cleaner.clean_contents(contents_html)
        return self.cleaner.extract_contents(contents_html)

    def search_novel(self, query: str):
        """Search the site's page index, following pagination, and return matches."""
        novel_selector = "article > div > header > h3.entry-title > a"
        next_selector = "div.nav-links > a.next"

        soup = self.get_soup(f"https://faqwiki.us/?s={query.replace(' ','+')}&post_type=page")
        empty = "nothing found" in soup.select_one("h1.page-title").text.strip().lower()
        if empty:
            return []

        novels = soup.select(novel_selector)
        # loop over all pages via next button and get all novels
        next_page = soup.select_one(next_selector)
        while next_page:
            page_soup = self.get_soup(self.absolute_url(next_page["href"]))
            novels += page_soup.select(novel_selector)
            next_page = page_soup.select_one(next_selector)

        results = []
        for novel in novels:
            # filter out "fake" novels (links to All, completed & ongoing pages)
            # BUGFIX: was `pass`, which made this filter a no-op and let the
            # index links leak into results when they matched the query.
            if "novels" in novel.text.lower():
                continue
            # simple substring match, but at least won't taint results
            if query.lower() in novel.text.lower():
                results.append(
                    SearchResult(
                        title=novel.text,
                        url=novel["href"],
                    )
                )
        return results