/
pastebinscraper.py
177 lines (137 loc) · 6.62 KB
/
pastebinscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# -*- coding: utf-8 -*-
import json
import logging
import time
from queue import Queue, Empty
from pastepwn.core import Paste
from pastepwn.scraping import BasicScraper
from pastepwn.scraping.pastebin.exceptions import IPNotRegisteredError, EmptyBodyException
from pastepwn.util import Request, start_thread
# https://pastebin.com/doc_scraping_api#2
# Your whitelisted IP should not run into any issues as long as you don't abuse our service.
# We recommend not making more than 1 request per second, as there really is no need to do so.
# Going over 1 request per second won't get you blocked, but if we see excessive unnecessary scraping, we might take action.
class PastebinScraper(BasicScraper):
name = "PastebinScraper"
api_base_url = "https://scrape.pastebin.com"
def __init__(self, paste_queue=None, exception_event=None, api_hit_rate=None):
super().__init__(exception_event)
self.logger = logging.getLogger(__name__)
self._last_scrape_time = 0
self.paste_queue = paste_queue or Queue()
self._tmp_paste_queue = Queue()
self._known_pastes = []
self._known_pastes_limit = 1000
# The hit rate describes the interval between two requests in seconds
self._api_hit_rate = api_hit_rate or 1
def _check_error(self, body):
"""Checks if an error occurred and raises an exception if it did"""
if body is None:
raise EmptyBodyException()
if "DOES NOT HAVE ACCESS" in body:
self._exception_event.set()
raise IPNotRegisteredError(body)
def _get_recent(self, limit=100):
"""Downloads a list of the most recent pastes - the amount is limited by the <limit> parameter"""
r = Request()
endpoint = "api_scraping.php"
api_url = "{0}/{1}?limit={2}".format(self.api_base_url, endpoint, limit)
try:
response_data = r.get(api_url)
self._check_error(response_data)
pastes_dict = json.loads(response_data)
pastes = []
# Loop through the response and create objects by the data
for paste in pastes_dict:
paste_obj = Paste(key=paste.get("key"),
title=paste.get("title"),
user=paste.get("user"),
size=paste.get("size"),
date=paste.get("date"),
expire=paste.get("expire"),
syntax=paste.get("syntax"),
scrape_url=paste.get("scrape_url"),
full_url=paste.get("full_url"))
pastes.append(paste_obj)
return pastes
except Exception as e:
self.logger.error(e)
return None
def _get_paste_content(self, key):
"""Downloads the content of a certain paste"""
r = Request()
endpoint = "api_scrape_item.php"
api_url = "{0}/{1}?i={2}".format(self.api_base_url, endpoint, key)
content = ""
self.logger.debug("Downloading paste {0}".format(key))
try:
response_data = r.get(api_url)
self._check_error(response_data)
content = response_data
except Exception as e:
self.logger.error(e)
return content
def _body_downloader(self):
"""Downloads the body of pastes from pastebin, which have been put into the queue"""
while self.running:
try:
if self._tmp_paste_queue.qsize() > 0:
self.logger.debug("Queue size: {}".format(self._tmp_paste_queue.qsize()))
if self._stop_event.is_set() or self._exception_event.is_set():
self.running = False
break
paste = self._tmp_paste_queue.get(True, 1)
# if paste is not known, download the body and put it on the queue and into the list
last_body_download_time = round(time.time(), 2)
body = self._get_paste_content(paste.key)
paste.set_body(body)
self.paste_queue.put(paste)
current_time = round(time.time(), 2)
diff = round(current_time - last_body_download_time, 2)
if diff >= self._api_hit_rate:
continue
sleep_diff = round(self._api_hit_rate - diff, 3)
self.logger.debug("Sleep time is: {0}".format(sleep_diff))
time.sleep(sleep_diff)
except Empty:
continue
def start(self, paste_queue):
"""Start the scraping process and download the paste metadata"""
self.paste_queue = paste_queue
self.running = True
start_thread(self._body_downloader, "BodyDownloader", self._exception_event)
while self.running:
self._last_scrape_time = int(time.time())
pastes = self._get_recent(limit=100)
counter = 0
if pastes is not None:
for paste in pastes:
# check if paste is in list of known pastes
if paste.key in self._known_pastes:
# Do nothing, if it's already known
continue
self.logger.debug("Paste is unknown - adding ot to list {}".format(paste.key))
self._tmp_paste_queue.put(paste)
self._known_pastes.append(paste.key)
counter += 1
if self._stop_event.is_set() or self._exception_event.is_set():
self.running = False
break
self.logger.debug("{0} new pastes fetched!".format(counter))
# Delete some of the last pastes to not run into memory/performance issues
if len(self._known_pastes) > 1000:
self.logger.debug("known_pastes > 1000 - cleaning up!")
start_index = len(self._known_pastes) - self._known_pastes_limit
self._known_pastes = self._known_pastes[start_index:]
if self._stop_event.is_set() or self._exception_event.is_set():
self.logger.debug('stopping {0}'.format(self.name))
self.running = False
break
while self.running:
current_time = int(time.time())
diff = current_time - self._last_scrape_time
if diff > 60:
break
# if the last scraping happened less than 60 seconds ago,
# wait 2 seconds and check again
time.sleep(2)