-
-
Notifications
You must be signed in to change notification settings - Fork 288
/
wtrlab.py
137 lines (111 loc) · 4.94 KB
/
wtrlab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# -*- coding: utf-8 -*-
import logging
import json
import re
import requests
from lncrawl.core.crawler import Crawler
from lncrawl.models import Volume, Chapter, SearchResult
logger = logging.getLogger(__name__)
class WtrLab(Crawler):
    """
    Crawler for wtr-lab.com.

    The site hosts multilingual novels, essentially all MTL (hence
    ``has_mtl``), but the output is fairly readable for machine translation.
    The front-end build obfuscates all CSS classes, so instead of scraping
    markup we read the Next.js state JSON embedded in the stable
    ``script#__NEXT_DATA__`` tag on every page. Essentially the same
    framework as webfic, though with other keys, urls, etc.
    """

    base_url = ["https://wtr-lab.com/"]
    has_manga = False
    has_mtl = True
    host = ""  # scheme + domain without trailing slash; set in initialize()

    def initialize(self) -> None:
        # Normalize home_url so paths can be concatenated onto it safely.
        self.host = self.home_url.rstrip("/")

    def read_novel_info(self):
        """Populate novel metadata, volumes, and chapters from __NEXT_DATA__."""
        soup = self.get_soup(self.novel_url)
        script = soup.select_one('script#__NEXT_DATA__')
        assert script  # this is where we get everything so it's kinda required
        metadata = json.loads(script.text)
        assert metadata

        serie = metadata['props']['pageProps']['serie']
        serie_data = serie['serie_data']
        data = serie_data['data']

        novel_slug = serie_data['slug']
        self.novel_title = data['title']
        self.novel_cover = data['image']
        self.novel_synopsis = data['description']
        self.novel_author = data['author']
        try:
            self.novel_tags = [tag["title"] for tag in metadata['props']['pageProps']['tags']]
        except KeyError:
            # worst case we miss out on tags
            pass
        logger.debug("book metadata %s", metadata)

        # Novel URLs look like <host>/<lang>/serie-<id>/<slug>[/...],
        # so lang captures something like "/en" or "/es".
        # re.escape: the host contains dots, which are regex metacharacters.
        match = re.match(f"{re.escape(self.host)}(/?.*)/serie-\\d+/.+/?", self.novel_url)
        assert match, f"unexpected novel URL format: {self.novel_url}"
        lang = match.group(1)
        self.language = lang[1:]  # drop the leading "/"

        serie_id = serie_data['raw_id']
        for idx, chapter in enumerate(serie['chapters']):
            chap_id = 1 + idx
            # Group chapters into synthetic volumes of 100 each.
            vol_id = 1 + len(self.chapters) // 100
            vol_title = f"Volume {vol_id}"
            url = f"{self.host}{lang}/serie-{serie_id}/{novel_slug}/chapter-{chapter['slug']}"
            if chap_id % 100 == 1:
                self.volumes.append(
                    Volume(
                        id=vol_id,
                        title=vol_title,
                    )
                )
            self.chapters.append(
                Chapter(
                    id=chap_id,
                    url=url,
                    title=chapter["title"],
                    volume=vol_id,
                    volume_title=vol_title,
                )
            )

    def download_chapter_body(self, chapter):
        """Fetch one chapter and return its text as paragraph-wrapped HTML."""
        soup = self.get_soup(chapter.url)
        script = soup.select_one("script#__NEXT_DATA__")
        assert script
        chapter_json = json.loads(script.text)
        assert chapter_json
        logger.debug("chapter %s", chapter_json)

        chapter_data = chapter_json['props']['pageProps']['serie']['chapter_data']['data']
        # adjust chapter title as the one from the overview usually lacks details
        chapter.title = chapter_data['title']

        # The body is a list of plain-text lines. Wrap each in <p> ourselves
        # (copied straight outta self.cleaner.extract_contents, because we lack
        # a tag to hand it) — otherwise the output ignores the line breaks and
        # looks very mushed together.
        text_lines = chapter_data['body']
        return "".join(
            f"<p>{line.strip()}</p>"
            for line in text_lines
            if not self.cleaner.contains_bad_texts(line)
        )

    def search_novel(self, query: str):
        """Query the site's JSON search API and yield SearchResult items."""
        # NOTE(review): bypasses the crawler session; consider a timeout here.
        novels = requests.post(f"{self.host}/api/search", json={"text": query}).json()
        logger.debug("Search results: %s", novels)
        for novel in novels["data"]:
            data = novel["data"]
            meta = {
                "Chapters": novel["chapter_count"],
                "Status": self.status_idx_to_text(novel["status"]),
                "Author": data["author"],
            }
            info = " | ".join(f"{k}: {v}" for k, v in meta.items())
            yield SearchResult(
                title=data["title"],
                # default to EN; fixed a stray literal "f" that used to be
                # injected before the slug ("/fmy-novel" instead of "/my-novel")
                url=f"{self.host}/en/serie-{novel['raw_id']}/{novel['slug']}",
                info=info,
            )

    @staticmethod
    def status_idx_to_text(idx):
        """Map the API's numeric status flag to display text (0 == completed)."""
        return "Ongoing" if idx else "Completed"

    @staticmethod
    def premium_idx_to_text(idx):
        """Map the API's premium flag to display text; only value 2 is known."""
        if idx == 2:
            return "Partial Paywall"
        else:
            return "Unknown"