-
Notifications
You must be signed in to change notification settings - Fork 4
/
index_parser.py
256 lines (213 loc) · 7.03 KB
/
index_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
from enum import Enum
from urllib.parse import quote_plus

from lxml import etree
class TorrentSite(Enum):
    """Supported torrent index sites.

    Each member's value is the site's base URL (scheme and host, without a
    trailing slash).
    """

    NYAASI = "https://nyaa.si"
    SUKEBEINYAASI = "https://sukebei.nyaa.si"

    # NOTE: nyaa.pantsu.cat redirects to nyaa.net.
    NYAANET = "https://nyaa.net"
    SUKEBEINYAANET = "https://sukebei.nyaa.net"
# Nyaa category table: category id -> display name and sub-category names.
# Built once at import time instead of on every lookup.
_NYAA_CATEGORIES = {
    "1": {
        "name": "Anime",
        "sub_cats": {
            "1": "Anime Music Video",
            "2": "English-translated",
            "3": "Non-English-translated",
            "4": "Raw",
        },
    },
    "2": {
        "name": "Audio",
        "sub_cats": {
            "1": "Lossless",
            "2": "Lossy",
        },
    },
    "3": {
        "name": "Literature",
        "sub_cats": {
            "1": "English-translated",
            "2": "Non-English-translated",
            "3": "Raw",
        },
    },
    "4": {
        "name": "Live Action",
        "sub_cats": {
            "1": "English-translated",
            "2": "Idol/Promotional Video",
            "3": "Non-English-translated",
            "4": "Raw",
        },
    },
    "5": {
        "name": "Pictures",
        "sub_cats": {
            "1": "Graphics",
            "2": "Photos",
        },
    },
    "6": {
        "name": "Software",
        "sub_cats": {
            "1": "Applications",
            "2": "Games",
        },
    },
}


def nyaa_categories(b):
    """Translate a Nyaa category selector into a readable name.

    Args:
        b: Category selector such as ``"?c=1_2"``. Any ``?c=`` substring is
            removed; what remains is split on ``_`` into the category id and
            the sub-category id.

    Returns:
        ``"<category> - <sub-category>"`` (for example
        ``"Anime - English-translated"``), or ``None`` when the selector has
        no sub-category part or names an unknown category. A message is
        printed in the ``None`` case.
    """
    cats = b.replace('?c=', '').split('_')
    try:
        # IndexError: selector without "_"; KeyError: unknown id.
        cat, sub_cat = cats[0], cats[1]
        entry = _NYAA_CATEGORIES[cat]
        return f"{entry['name']} - {entry['sub_cats'][sub_cat]}"
    except (IndexError, KeyError):
        print("Unable to get Nyaa category name")
        return None
def parse_nyaa(request_text, limit, site):
    """Parse a torrent listing page into a list of torrent dicts.

    Args:
        request_text: HTML of the listing page.
        limit: Maximum number of table rows to read (used as a slice bound,
            so ``None`` reads every row).
        site: ``TorrentSite`` member the page came from. Its value is the
            base of the generated URLs and it selects which category table
            is used.

    Returns:
        A list of dicts with the keys ``id``, ``category``, ``url``,
        ``name``, ``download_url``, ``magnet``, ``size``, ``date``,
        ``seeders``, ``leechers``, ``completed_downloads`` and ``type``.
        Rows that do not yield all expected fields are skipped.

    Raises:
        ValueError: If the page contains at least one row and ``site`` is
            not one of the known ``TorrentSite`` members.
    """
    parser = etree.HTMLParser()
    tree = etree.fromstring(request_text, parser)

    # Base URL used to build the view/download links.
    uri = site.value

    # Choose the category resolver once rather than re-testing per row.
    if site in (TorrentSite.NYAASI, TorrentSite.NYAANET):
        resolve_category = nyaa_categories
    elif site in (TorrentSite.SUKEBEINYAASI, TorrentSite.SUKEBEINYAANET):
        resolve_category = sukebei_categories
    else:
        resolve_category = None

    # Number of positional fields a complete row produces (indices 0..10).
    expected_fields = 11

    torrents = []
    for tr in tree.xpath("//tbody//tr")[:limit]:
        if resolve_category is None:
            raise ValueError("Unknown TorrentSite received!")

        # Collect the row's values positionally: link targets, link texts
        # and plain cell texts, in document order.
        block = []
        for td in tr.xpath("./td"):
            for link in td.xpath("./a"):
                href = link.attrib.get("href")
                if href is None:
                    # An anchor without a target carries no usable field.
                    continue
                href = href.split('/')[-1]

                # Only caring about non-comment pages.
                if href.endswith("#comments"):
                    continue
                block.append(href)

                link_text = (link.text or "").strip()
                if link_text:
                    block.append(link_text)

            cell_text = (td.text or "").strip()
            if cell_text:
                block.append(cell_text)

        # Add type of torrent based on the row's CSS class.
        row_class = tr.attrib.get("class") or ""
        if 'danger' in row_class:
            block.append("remake")
        elif 'success' in row_class:
            block.append("trusted")
        else:
            block.append("default")

        # Skip incomplete rows explicitly instead of relying on IndexError.
        if len(block) < expected_fields:
            continue

        try:
            category = resolve_category(block[0])
        except IndexError:
            # The first field was not a "<cat>_<sub>" selector; treat it
            # like an unknown category rather than aborting the whole page.
            category = None

        torrents.append({
            'id': block[1],
            'category': category,
            'url': "{}/view/{}".format(uri, block[1]),
            'name': block[2],
            'download_url': "{}/download/{}".format(uri, block[3]),
            'magnet': block[4],
            'size': block[5],
            'date': block[6],
            'seeders': block[7],
            'leechers': block[8],
            'completed_downloads': block[9],
            'type': block[10],
        })

    return torrents
def parse_single(request_text, site):
    """Parse a single torrent's detail page into a dict.

    Args:
        request_text: HTML of the torrent detail page.
        site: ``TorrentSite`` member the page came from; its value is used
            as the base of the uploader profile URL.

    Returns:
        A dict with the keys ``title``, ``category``, ``uploader``,
        ``uploader_profile``, ``website``, ``size``, ``date``, ``seeders``,
        ``leechers``, ``completed``, ``hash``, ``files`` and
        ``description``.

    Raises:
        IndexError: If the page has no ``h3.panel-title`` heading or yields
            fewer info fields than the fixed positions read below.
    """
    parser = etree.HTMLParser()
    tree = etree.fromstring(request_text, parser)

    # Base URL used to build the uploader profile link.
    uri = site.value

    # Find basic uploader info & torrent stats: every non-blank text node
    # inside the "col-md-5" cells of the "row" divs, in document order.
    data = []
    for row in tree.xpath("//div[@class='row']"):
        for div_text in row.xpath("./div[@class='col-md-5']//text()"):
            stripped = div_text.strip()
            if stripped:
                data.append(stripped)

    # Find files, we need only text of the li element(s). Entries are kept
    # unstripped; only whitespace-only nodes are dropped.
    file_nodes = tree.xpath(
        "//div[contains(@class, 'torrent-file-list')]//li/text()"
    )
    torrent_files = [el for el in file_nodes if el.rstrip()]

    # An element's .text is None when it has no leading text, so fall back
    # to "" instead of failing on str + None.
    description = "".join(
        node.text or ""
        for node in tree.xpath("//div[@id='torrent-description']")
    )

    # NOTE(review): the positions in `data` mirror the order of the info
    # cells on the page; confirm against the live layout if it changes.
    torrent = {
        'title': tree.xpath("//h3[@class='panel-title']/text()")[0].strip(),
        'category': data[0],
        'uploader': data[4],
        'uploader_profile': "{}/user/{}".format(uri, data[4]),
        'website': data[6],
        'size': data[8],
        'date': data[3],
        'seeders': data[5],
        'leechers': data[7],
        'completed': data[9],
        'hash': data[10],
        'files': torrent_files,
        'description': description,
    }

    return torrent
# Sukebei category table: category id -> display name and sub-category names.
# Built once at import time instead of on every lookup.
_SUKEBEI_CATEGORIES = {
    "1": {
        "name": "Art",
        "sub_cats": {
            "1": "Anime",
            "2": "Doujinshi",
            "3": "Games",
            "4": "Manga",
            "5": "Pictures",
        },
    },
    "2": {
        "name": "Real Life",
        "sub_cats": {
            "1": "Photobooks & Pictures",
            "2": "Videos",
        },
    },
}


def sukebei_categories(b):
    """Translate a Sukebei category selector into a readable name.

    Args:
        b: Category selector such as ``"?c=1_2"``. Any ``?c=`` substring is
            removed; what remains is split on ``_`` into the category id and
            the sub-category id.

    Returns:
        ``"<category> - <sub-category>"`` (for example
        ``"Art - Doujinshi"``), or ``None`` when the selector has no
        sub-category part or names an unknown category. A message is
        printed in the ``None`` case.
    """
    cats = b.replace('?c=', '').split('_')
    try:
        # IndexError: selector without "_"; KeyError: unknown id.
        cat, subcat = cats[0], cats[1]
        entry = _SUKEBEI_CATEGORIES[cat]
        return f"{entry['name']} - {entry['sub_cats'][subcat]}"
    except (IndexError, KeyError):
        print("Unable to get Sukebei category name")
        return None
# Pantsu Utils
def query_builder(q, params):
    """Build the URL query string for a search request.

    The search term and every parameter value are URL-encoded, so characters
    such as ``&``, ``#`` or ``=`` in the input cannot break the query apart.
    Spaces are encoded as ``+``.

    Args:
        q: Search term.
        params: Mapping of parameter name to value. Names that are not
            recognised are ignored. Three names are rewritten:
            ``category`` (a two-item sequence) becomes ``c=<a>_<b>``,
            ``status`` becomes ``s=<value>``, and ``lang`` (an iterable)
            emits one ``lang=<value>`` pair per item. All other recognised
            names are emitted as ``<name>=<value>``.

    Returns:
        The query string, starting with ``?q=``, with parameters in the
        iteration order of ``params``.
    """
    available_params = {
        "category", "page", "limit", "userID", "fromID",
        "status", "maxage", "toDate", "fromDate",
        "dateType", "minSize", "maxSize", "sizeType",
        "sort", "order", "lang",
    }

    def encode(value):
        # Values may be ints or strings; encode their text form.
        return quote_plus(str(value))

    parts = ["?q={}".format(quote_plus(q))]

    for param, value in params.items():
        if param not in available_params:
            continue

        if param == "category":
            parts.append("&c={}_{}".format(encode(value[0]),
                                           encode(value[1])))
        elif param == "status":
            parts.append("&s={}".format(encode(value)))
        elif param == "lang":
            parts.extend("&lang={}".format(encode(lang)) for lang in value)
        else:
            parts.append("&{}={}".format(param, encode(value)))

    return "".join(parts)