Skip to content

Commit

Permalink
Reorganised code to allow for easy use with scrapy shell.
Browse files Browse the repository at this point in the history
  • Loading branch information
further-reading committed Jan 6, 2020
1 parent 8385f56 commit 2772826
Show file tree
Hide file tree
Showing 14 changed files with 60 additions and 35 deletions.
26 changes: 24 additions & 2 deletions README.md
Expand Up @@ -30,7 +30,7 @@ It returns results as though `selection.css('YOUR QUERY').re(r'YOUR REGEX')'` wa
### Function Box
This box lets you define additional Python code that can run on the results of your query and regex. The code can be as long and complex as you want, including adding additional functions, classes, imports, etc.

The only requirement is you must include a function called `user_fun(results)` that returns a `list`.
The only requirement is you must include a function called `user_fun(results, selector)` that returns a `list`.

### Results Box

Expand All @@ -44,4 +44,26 @@ This tab contains the html source that is used in the Tools tab. You can use the

## Notes Tab

This is just a plain text box. Content in here us not saved when you exit the app.
This is just a plain text box. Content in here is not saved when you exit the app.

# Integration with Scrapy Shell

It is possible to integrate this tool with the scrapy shell. This allows you to use it on responses that have passed through your middlewares, and to work with more complex requests and more specific selectors.

## Installation

The quickest way to integrate it is to follow these steps:

1. Copy the `utils_ui` folder into your project directory.
2. Install the requirements in `requirements-shell.txt`.

## Activation

To use it in your shell:

1. Import it into your shell with `from YOUR_PROJECT_DIRECTORY.utils_ui import scrapy_tools`
2. Use the `scrapy_tools.load_selector` function to open a window with a selector loaded in.

> For example, `scrapy_tools.load_selector(response)` will load your response into the UI.

When you run the code, a window named `Shell UI` will open that contains the `Tools`, `Source` and `Notes` tabs from the standalone window mentioned above.
Empty file added code/__init__.py
Empty file.
Empty file added code/browser_window/__init__.py
Empty file.
8 changes: 4 additions & 4 deletions code/utils_ui/browser.py → code/browser_window/browser.py
Expand Up @@ -32,17 +32,17 @@ def init_ui(self):
self.web.loadFinished.connect(self.load_finished)
self.web.load(QUrl(HOME))

back_button = BrowserButton(image=r'utils_ui/images/back.png')
back_button = BrowserButton(image=r'browser_window/images/back.png')
back_button.clicked.connect(self.web.back)
grid.addWidget(back_button, 0, 0)

forward_button = BrowserButton(image=r'utils_ui/images/forward.png')
forward_button = BrowserButton(image=r'browser_window/images/forward.png')
forward_button.clicked.connect(self.web.forward)
grid.addWidget(forward_button, 0, 1)

self.movie = MovieScreen(
movie_file=r'utils_ui/images/loader.gif',
end_file=r'utils_ui/images/empty.png',
movie_file=r'browser_window/images/loader.gif',
end_file=r'browser_window/images/empty.png',
)
self.movie.setMaximumHeight(20)
self.movie.setMaximumWidth(20)
Expand Down
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
6 changes: 4 additions & 2 deletions code/main.py
@@ -1,10 +1,11 @@
from PyQt5.QtWidgets import *

from bs4 import BeautifulSoup
from parsel import Selector
import requests

from utils_ui.text_processor import EnhancedTextViewer
from utils_ui.browser import QtBrowser
from browser_window.browser import QtBrowser
from utils_ui.scrapy_tools import Queries
import sys

Expand Down Expand Up @@ -34,7 +35,8 @@ def update_source(self, url):
# in future = look for way to grab initial html response from pyqt5
response = requests.get(url)
html = response.text
self.queries.update_source(html)
selector = Selector(text=html)
self.queries.update_source(selector)
soup = BeautifulSoup(html, 'html.parser')
html_out = soup.prettify()
self.source.setPlainText(html_out)
Expand Down
Empty file added code/utils_ui/__init__.py
Empty file.
19 changes: 9 additions & 10 deletions code/utils_ui/parser.py
@@ -1,15 +1,14 @@
from parsel import Selector
from cssselect.xpath import ExpressionError
from cssselect.parser import SelectorSyntaxError
import traceback
from utils_ui import errors
from . import errors


class Parser:
def __init__(self, html):
self.selector = Selector(text=html)
def __init__(self, selector):
self.selector = selector

def do_query(self, css, regex=None, function=None):
def do_query(self, css, selector, regex=None, function=None):
try:
results = self.selector.css(css)
except (ExpressionError, SelectorSyntaxError) as e:
Expand Down Expand Up @@ -46,7 +45,7 @@ def do_query(self, css, regex=None, function=None):
results = results.getall()

if function:
results = self.use_custom_function(results, function)
results = self.use_custom_function(results, function, selector)
if not results:
raise errors.QueryError(
title='Function Empty',
Expand All @@ -55,9 +54,9 @@ def do_query(self, css, regex=None, function=None):
)
return results

def use_custom_function(self, results, function):
if 'def user_fun(results):' not in function:
message = f'Custom function needs to be named "user_fun" and have "results" as argument'
def use_custom_function(self, results, function, selector):
if 'def user_fun(results, selector):' not in function:
message = f'Custom function needs to be named "user_fun" and have "results" and "selector" as arguments'
raise errors.QueryError(
title='Function Error',
message=message,
Expand All @@ -66,7 +65,7 @@ def use_custom_function(self, results, function):

try:
exec(function, globals())
results = user_fun(results)
results = user_fun(results, selector)
except Exception as e:
message = f'Error running custom function\n\n{type(e).__name__}: {e.args}'
message += f'\n\n{traceback.format_exc()}'
Expand Down
26 changes: 13 additions & 13 deletions code/utils_ui/scrapy_tools.py
@@ -1,10 +1,10 @@
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from utils_ui.text_processor import EnhancedTextViewer
from bs4 import BeautifulSoup

from utils_ui.parser import Parser
from utils_ui import errors
from .text_processor import EnhancedTextViewer
from .parser import Parser
from . import errors

import sys

Expand Down Expand Up @@ -60,9 +60,9 @@ def initUI(self):
self.function_section.query.setPlainText(
"""# import packages
# must have 'user_fun' function with 'results' as argument and return a list
# must have 'user_fun' function with\n'results' and 'selector' as arguments\nand return a list
def user_fun(results):
def user_fun(results, selector):
# your code
return results"""
)
Expand Down Expand Up @@ -196,27 +196,27 @@ def __init__(self, *args, **kwargs):
self.init_ui()

def init_ui(self):
self.setWindowTitle('Browser')
self.setWindowTitle('Shell UI')
tabs = QTabWidget()
self.queries = Queries(main=self)
self.source = EnhancedTextViewer()
self.notes = QPlainTextEdit()
tabs.addTab(self.browser, 'Browser')
tabs.addTab(self.queries, 'Tools')
tabs.addTab(self.source, 'Source')
tabs.addTab(self.notes, 'Notes')
self.setCentralWidget(tabs)

def add_response(self, html):
self.queries.update_source(html)
soup = BeautifulSoup(html, 'html.parser')
def add_selector(self, response):
self.queries.update_source(response)
soup = BeautifulSoup(response.text, 'html.parser')
html_out = soup.prettify()
self.source.setPlainText(html_out)


def load_response(response):
def load_selector(selector):
print('Shell UI window opened - Close window to regain use of shell')
app = QApplication(sys.argv)
main = MiniUI()
main.add_response(response.text)
main.add_selector(selector)
main.show()
sys.exit(app.exec_())
app.exec_()
4 changes: 4 additions & 0 deletions requirements-shell.txt
@@ -0,0 +1,4 @@
parsel==1.5.2
cssselect==1.1.0
beautifulsoup4==4.8.2
PyQt5==5.14.0
6 changes: 2 additions & 4 deletions requirements.txt
@@ -1,5 +1,3 @@
parsel==1.5.2
cssselect==1.1.0
-r requirements-shell.txt
requests==2.22.0
beautifulsoup4==4.8.2
PyQt5==5.14.0
PyQtWebEngine==5.14.0

0 comments on commit 2772826

Please sign in to comment.