-
Notifications
You must be signed in to change notification settings - Fork 347
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 9cb9d40
Showing
16 changed files
with
581 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
[run] | ||
omit = | ||
tests/* | ||
*__init__.py* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# Created by .ignore support plugin (hsz.mobi) | ||
### Python template | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
env/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*,cover | ||
.hypothesis/ | ||
reports/ | ||
.pytest_cache | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
|
||
# Flask instance folder | ||
instance/ | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
.tmpdocs/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# IPython Notebook | ||
.ipynb_checkpoints | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# celery beat schedule file | ||
celerybeat-schedule | ||
|
||
# dotenv | ||
.env | ||
|
||
# virtualenv | ||
.venv | ||
venv/ | ||
venv-jenkins*/ | ||
ENV/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# .idea is the directory for pycharm project files | ||
.idea | ||
|
||
# MACOS stuff | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
env: | ||
global: | ||
- CODECLIMATE_REPO_TOKEN=a47935830d841ad61a6e960be8a3b6a5e557146ac010dafa993e61bf82898472 | ||
|
||
language: python | ||
|
||
python: | ||
- 3.6 | ||
|
||
addons: | ||
firefox: "49.0.2" | ||
|
||
before_install: | ||
- wget https://github.com/mozilla/geckodriver/releases/download/v0.11.1/geckodriver-v0.11.1-linux64.tar.gz | ||
- mkdir geckodriver | ||
- tar -xzf geckodriver-v0.11.1-linux64.tar.gz -C geckodriver | ||
- export PATH=$PATH:$PWD/geckodriver | ||
|
||
install: | ||
- pip install -r requirements/requirements-test.txt | ||
|
||
before_script: | ||
- "export DISPLAY=:99.0" | ||
- "sh -e /etc/init.d/xvfb start" | ||
- sleep 3 | ||
|
||
script: | ||
- pytest --cov-config .coveragerc --cov=scrapy_selenium tests/ | ||
- codeclimate-test-reporter | ||
|
||
deploy: | ||
provider: pypi | ||
user: clemfromspace | ||
password: | ||
secure: "TNv6olOrZXQU5uXOv2pXCHn0knCxCvyoIQJCNPVn7kwqrVhsUK+A9Tp0xBWkQugdtN30KQ9dPu7VNRfizyWvjBMXxnVmBwbjG/qUdPsa2jz4cgTNvfScBoTeESE8PkFu91xBmP9KXV0XYWVahEL6IK2klFqnFRhkpnDbeRgzSB+UUBzltb0CwIBs7r1BxI1Fcz4HkvEtoqOi/jB1GV7k2F2RIaXHNwnQ4b4Et3FzOX7y5ONUhlwtlgHfIsr3mtQkmQ0cRhzV6Sub9dwC0RckDjqRGd/cV81uWr444KK1F+XSxLU4M5+8am6zO3PDApkyYblfq54FzfbrmgrNaZ2VREVoS7SryW2cxmPPTQbBaaKAu8AZ6HIDgYzDGk54Q8W8XvK0UdAj9fPvFNHuOTJw/1HPGUcLcIDJebBSdZzg5q9hPAOv2MK+fyqfyTx5AcMJnbvitSncT5qie+OX6ZPZrXphxBv29PUPNv94f4czMk1gTvuxVyOPwP3qkyDMA2thRu/SXtE+EW/q1M9lQCXAxBU+wi+QDydxCbYs8rmF0V+dCaOdZEcEtE03l73BK8/MczX4sP3HkcoAsttkD8oXoCdo8I2nxeVqx2YlI6928ayxospLzMQQlaCy4zfAYrYyE5VqEDoS84fxJkO4aHJJDRSFJ90U0BwwLkBVRsa4t8U=" | ||
on: | ||
tags: true | ||
distributions: sdist bdist_wheel | ||
repo: clemfromspace/scrapy-selenium | ||
|
||
notifications: | ||
email: false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | ||
Version 2, December 2004 | ||
|
||
Copyright (C) 2018 Clément Denoix <clement.denoix@gmail.com> | ||
|
||
Everyone is permitted to copy and distribute verbatim or modified | ||
copies of this license document, and changing it is allowed as long | ||
as the name is changed. | ||
|
||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | ||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION | ||
|
||
0. You just DO WHAT THE FUCK YOU WANT TO. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
include requirements/requirements.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# Scrapy with selenium | ||
[![PyPI](https://img.shields.io/pypi/v/scrapy-selenium.svg)](https://pypi.python.org/pypi/scrapy-selenium) [![Build Status](https://travis-ci.org/clemfromspace/scrapy-selenium.svg?branch=master)](https://travis-ci.org/clemfromspace/scrapy-selenium) [![Test Coverage](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/test_coverage)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/test_coverage) [![Maintainability](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/maintainability)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/maintainability) | ||
|
||
Scrapy middleware to handle javascript pages using selenium. | ||
|
||
## Installation | ||
``` | ||
$ pip install scrapy-selenium | ||
``` | ||
|
||
You will also need one of the Selenium [compatible browsers](http://www.seleniumhq.org/about/platforms.jsp). | ||
|
||
## Configuration | ||
1. Add the browser to use, the path to the executable, and the arguments to pass to the executable to the scrapy settings: | ||
```python | ||
from shutil import which | ||
|
||
SELENIUM_DRIVER_NAME='firefox' | ||
SELENIUM_DRIVER_EXECUTABLE_PATH=which('geckodriver') | ||
SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox | ||
``` | ||
|
||
2. Add the `SeleniumMiddleware` to the downloader middlewares: | ||
```python | ||
DOWNLOADER_MIDDLEWARES = { | ||
'scrapy_selenium.SeleniumMiddleware': 800 | ||
} | ||
``` | ||
## Usage | ||
Use the `scrapy_selenium.SeleniumRequest` instead of the scrapy built-in `Request` like below: | ||
```python | ||
from scrapy_selenium import SeleniumRequest | ||
|
||
yield SeleniumRequest(url=url, callback=self.parse_result) | ||
``` | ||
The request will be handled by selenium, and the response will have an additional `meta` key, named `driver` containing the selenium driver with the request processed. | ||
```python | ||
def parse_result(self, response): | ||
    print(response.meta['driver'].title) | ||
``` | ||
For more information about the available driver methods and attributes, refer to the [selenium python documentation](http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webdriver) | ||
|
||
The `selector` response attribute works as usual (but contains the html processed by the selenium driver). | ||
```python | ||
def parse_result(self, response): | ||
    print(response.selector.xpath('//title/text()')) | ||
``` | ||
|
||
### Additional arguments | ||
The `scrapy_selenium.SeleniumRequest` accepts 3 additional arguments: | ||
|
||
#### `wait_time` / `wait_until` | ||
|
||
When used, selenium will perform an [Explicit wait](http://selenium-python.readthedocs.io/waits.html#explicit-waits) before returning the response to the spider. | ||
```python | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.support import expected_conditions as EC | ||
|
||
yield SeleniumRequest( | ||
    url=url, | ||
    callback=self.parse_result, | ||
    wait_time=10, | ||
    wait_until=EC.element_to_be_clickable((By.ID, 'someid')) | ||
) | ||
``` | ||
|
||
#### `screenshot` | ||
When used, selenium will take a screenshot of the page and the binary data of the .png captured will be added to the response `meta`: | ||
```python | ||
yield SeleniumRequest( | ||
    url=url, | ||
    callback=self.parse_result, | ||
    screenshot=True | ||
) | ||
|
||
def parse_result(self, response): | ||
    with open('image.png', 'wb') as image_file: | ||
        image_file.write(response.meta['screenshot']) | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
-r requirements.txt | ||
|
||
pytest==3.4.0 | ||
coverage<4.4 | ||
pytest-cov==2.4.0 | ||
codeclimate-test-reporter==0.2.3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
scrapy>=1.0.0 | ||
selenium>=3.9.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .http import SeleniumRequest | ||
from .middlewares import SeleniumMiddleware |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
"""This module contains the ``SeleniumRequest`` class""" | ||
|
||
from scrapy import Request | ||
|
||
|
||
class SeleniumRequest(Request):
    """Scrapy ``Request`` carrying extra, selenium-specific options.

    The options are kept as plain attributes on the request object
    (``wait_time``, ``wait_until`` and ``screenshot``); everything else is
    forwarded untouched to the regular scrapy ``Request`` constructor.
    """

    def __init__(self, wait_time=None, wait_until=None, screenshot=False, *args, **kwargs):
        """Record the selenium options, then build the underlying request.

        Note that the three selenium parameters are declared *before*
        ``*args``: positional arguments are bound to them first, so the
        regular ``Request`` arguments (``url``, ``callback``, ...) should be
        passed by keyword.

        Parameters
        ----------
        wait_time: int
            The number of seconds to wait.
        wait_until: method
            One of the "selenium.webdriver.support.expected_conditions".
            The response is meant to be returned only once the given
            condition is fulfilled.
        screenshot: bool
            If True, a screenshot of the page is meant to be taken and its
            data returned in the response "meta" attribute.
        *args, **kwargs
            Passed through unchanged to ``Request.__init__``.
        """
        # Keep the selenium-only options on the instance, in the same order
        # as the signature, before delegating to the parent constructor.
        self.wait_time, self.wait_until, self.screenshot = (
            wait_time,
            wait_until,
            screenshot,
        )

        # The remaining arguments belong to the plain scrapy request.
        super().__init__(*args, **kwargs)
Oops, something went wrong.
9cb9d40
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
in process_request method in scrapy_selenium/middlewares.py,
why do you
driver.add_cookie()
after driver.get()
? It seems it should be the opposite.