Skip to content

Commit

Permalink
Merge d0ebd0a into 4cce3a0
Browse files Browse the repository at this point in the history
  • Loading branch information
d-Rickyy-b committed Sep 7, 2019
2 parents 4cce3a0 + d0ebd0a commit d9c7063
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 34 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,5 @@ venv.bak/
.idea
venv
.vscode/*

*.db
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
![Logo](documentation/pastepwn_logo.png)
![Logo](https://raw.githubusercontent.com/d-Rickyy-b/pastepwn/master/documentation/pastepwn_logo.png)



Expand Down Expand Up @@ -27,7 +27,7 @@ To use the pastepwn framework you need to follow these simple steps:

### Behind a proxy

There is 2 ways to use this tool behind a proxy:
There are 2 ways to use this tool behind a proxy:

* Define the following environment variables: `HTTP_PROXY`, `HTTPS_PROXY`, `NO_PROXY`.
* When initializing the PastePwn object, use the `proxies` argument. `proxies` is a dict as defined in [requests' documentation](http://docs.python-requests.org/en/master/user/advanced/#proxies).
Expand Down
2 changes: 1 addition & 1 deletion pastepwn/analyzers/basicanalyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ def match(self, paste):
:param paste: A :class:`pastepwn.core.paste` object which should be matched
:return: :obj:`bool` if the paste has been matched
"""
raise NotImplementedError
raise NotImplementedError("Your analyzer must implement the match method!")
6 changes: 5 additions & 1 deletion pastepwn/analyzers/wordanalyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ def _blacklist_word_found(self, text):
return False

def add_word(self, word):
    """
    Add a word to the analyzer
    :param word: Word to be added to the analyzer's word list
    :return: None
    """
    self.words.append(word)

def match(self, paste):
Expand Down
6 changes: 4 additions & 2 deletions pastepwn/scraping/pastebin/exceptions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-

from .emptybodyexception import EmptyBodyException
from .ipnotregisterederror import IPNotRegisteredError
from .pastenotreadyexception import PasteNotReadyException
from .pastedeletedexception import PasteDeletedException
from .pasteemptyexception import PasteEmptyException

__all__ = ('IPNotRegisteredError', 'EmptyBodyException')
__all__ = ('IPNotRegisteredError', 'PasteNotReadyException', 'PasteDeletedException', 'PasteEmptyException')
7 changes: 0 additions & 7 deletions pastepwn/scraping/pastebin/exceptions/emptybodyexception.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
"""Module for the PasteDeletedException"""


class PasteDeletedException(Exception):
    """Exception class indicating a paste has been deleted"""
7 changes: 7 additions & 0 deletions pastepwn/scraping/pastebin/exceptions/pasteemptyexception.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
"""Module for the PasteEmptyException"""


class PasteEmptyException(Exception):
    """Exception class indicating a paste is empty"""
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
"""Module for the PasteNotReadyException"""


class PasteNotReadyException(Exception):
    """Exception class indicating a paste is not ready for downloading yet"""
78 changes: 57 additions & 21 deletions pastepwn/scraping/pastebin/pastebinscraper.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-
import json
import logging
import re
import time
from queue import Queue, Empty

from pastepwn.core import Paste
from pastepwn.scraping import BasicScraper
from pastepwn.scraping.pastebin.exceptions import IPNotRegisteredError, EmptyBodyException
from pastepwn.scraping.pastebin.exceptions import IPNotRegisteredError, PasteDeletedException, PasteNotReadyException, PasteEmptyException
from pastepwn.util import Request, start_thread


Expand All @@ -17,6 +18,7 @@


class PastebinScraper(BasicScraper):
"""Scraper class for pastebin"""
name = "PastebinScraper"
api_base_url = "https://scrape.pastebin.com"

Expand All @@ -33,15 +35,25 @@ def __init__(self, paste_queue=None, exception_event=None, api_hit_rate=None):
# The hit rate describes the interval between two requests in seconds
self._api_hit_rate = api_hit_rate or 1

def _check_error(self, body, key=None):
    """
    Checks if an error occurred and raises an exception if it did
    :param body: Response body received from the pastebin scraping API (may be None)
    :param key: Key of the paste the body belongs to - only used for the exception messages
    :raises PasteEmptyException: If the body is None or an empty string
    :raises IPNotRegisteredError: If the body is the "IP does not have access" notice
    :raises PasteNotReadyException: If the API reports the paste is not ready for scraping yet
    :raises PasteDeletedException: If the API reports the paste cannot be found
    :return: None
    """
    # This check must come first: the checks below call len() and re.match() on the body,
    # which would fail with a TypeError for a None body
    if body is None or body == "":
        raise PasteEmptyException("The paste '{0}' or its body was set to None!".format(key))

    # The dots between the IP octets are escaped so that only literal dots match
    pattern = r"YOUR IP: \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} DOES NOT HAVE ACCESS\.\s+VISIT: https:\/\/pastebin\.com\/doc_scraping_api TO GET ACCESS!"

    # Cheap length pre-filter so the regex is only evaluated for bodies of plausible size
    if 107 >= len(body) >= 99 and re.match(pattern, body):
        self._exception_event.set()
        raise IPNotRegisteredError(body)

    if body == "File is not ready for scraping yet. Try again in 1 minute.":
        # The pastebin API was not ready yet to deliver this paste -
        # We raise an exception to re-download it again after some time has passed
        raise PasteNotReadyException("The paste '{0}' could not be fetched yet!".format(key))

    if body == "Error, we cannot find this paste.":
        # The paste has been deleted before we could download it.
        # We raise an exception to delete the paste from the queue
        raise PasteDeletedException("The paste '{0}' has been deleted!".format(key))

def _get_recent(self, limit=100):
"""Downloads a list of the most recent pastes - the amount is limited by the <limit> parameter"""
r = Request()
Expand Down Expand Up @@ -79,19 +91,17 @@ def _get_paste_content(self, key):
r = Request()
endpoint = "api_scrape_item.php"
api_url = "{0}/{1}?i={2}".format(self.api_base_url, endpoint, key)
content = ""

self.logger.debug("Downloading paste {0}".format(key))
try:
response_data = r.get(api_url)

self._check_error(response_data)

content = response_data
except Exception as e:
self.logger.error(e)
raise e

return content
self._check_error(response_data, key)

return response_data

def _body_downloader(self):
"""Downloads the body of pastes from pastebin, which have been put into the queue"""
Expand All @@ -109,23 +119,49 @@ def _body_downloader(self):

# if paste is not known, download the body and put it on the queue and into the list
last_body_download_time = round(time.time(), 2)
body = self._get_paste_content(paste.key)

try:
body = self._get_paste_content(paste.key)
except PasteNotReadyException:
self.logger.debug("Paste '{0}' is not ready for downloading yet. Enqueuing it again.".format(paste.key))
# Make sure to wait a certain time. If only one element in the queue, this can lead to loops
self._rate_limit_sleep(last_body_download_time)
self._tmp_paste_queue.put(paste)
continue
except PasteDeletedException:
# We don't add a sleep here, because this can't lead to loops
self.logger.info("Paste '{0}' has been deleted before we could download it! Skipping paste.".format(paste.key))
continue
except PasteEmptyException:
self.logger.info("Paste '{0}' is set to None! Skipping paste.".format(paste.key))
continue
except Exception as e:
self.logger.error("An exception occurred while downloading the paste '{0}'. Skipping this paste! Exception is: {1}".format(paste.key, e))
continue

paste.set_body(body)
self.paste_queue.put(paste)

current_time = round(time.time(), 2)
diff = round(current_time - last_body_download_time, 2)

if diff >= self._api_hit_rate:
continue

sleep_diff = round(self._api_hit_rate - diff, 3)
self.logger.debug("Sleep time is: {0}".format(sleep_diff))
time.sleep(sleep_diff)
self._rate_limit_sleep(last_body_download_time)
except Empty:
continue

def _rate_limit_sleep(self, last_body_download_time):
    """
    Sleeps for the remainder of the API hit rate interval to prevent hitting API rate limits
    :param last_body_download_time: The time (time.time() based timestamp) when the last paste was downloaded
    :return: None
    """
    current_time = round(time.time(), 2)
    diff = round(current_time - last_body_download_time, 2)

    # The configured interval has already elapsed since the last download - no need to wait
    if diff >= self._api_hit_rate:
        return

    sleep_diff = round(self._api_hit_rate - diff, 3)
    self.logger.debug("Sleep time is: {0}".format(sleep_diff))
    time.sleep(sleep_diff)

def start(self, paste_queue):
"""Start the scraping process and download the paste metadata"""
self.paste_queue = paste_queue
Expand Down

0 comments on commit d9c7063

Please sign in to comment.