Initial commit
clemfromspace committed Feb 11, 2018
0 parents commit 9cb9d40
Showing 16 changed files with 581 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .coveragerc
@@ -0,0 +1,4 @@
[run]
omit =
    tests/*
    *__init__.py*
100 changes: 100 additions & 0 deletions .gitignore
@@ -0,0 +1,100 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
reports/
.pytest_cache

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Flask instance folder
instance/

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
.tmpdocs/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv
venv/
venv-jenkins*/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# .idea is the directory for pycharm project files
.idea

# MACOS stuff
.DS_Store
42 changes: 42 additions & 0 deletions .travis.yml
@@ -0,0 +1,42 @@
env:
  global:
    - CODECLIMATE_REPO_TOKEN=a47935830d841ad61a6e960be8a3b6a5e557146ac010dafa993e61bf82898472

language: python

python:
  - 3.6

addons:
  firefox: "49.0.2"

before_install:
  - wget https://github.com/mozilla/geckodriver/releases/download/v0.11.1/geckodriver-v0.11.1-linux64.tar.gz
  - mkdir geckodriver
  - tar -xzf geckodriver-v0.11.1-linux64.tar.gz -C geckodriver
  - export PATH=$PATH:$PWD/geckodriver

install:
  - pip install -r requirements/requirements-test.txt

before_script:
  - "export DISPLAY=:99.0"
  - "sh -e /etc/init.d/xvfb start"
  - sleep 3

script:
  - pytest --cov-config .coveragerc --cov=scrapy_selenium tests/
  - codeclimate-test-reporter

deploy:
  provider: pypi
  user: clemfromspace
  password:
    secure: "TNv6olOrZXQU5uXOv2pXCHn0knCxCvyoIQJCNPVn7kwqrVhsUK+A9Tp0xBWkQugdtN30KQ9dPu7VNRfizyWvjBMXxnVmBwbjG/qUdPsa2jz4cgTNvfScBoTeESE8PkFu91xBmP9KXV0XYWVahEL6IK2klFqnFRhkpnDbeRgzSB+UUBzltb0CwIBs7r1BxI1Fcz4HkvEtoqOi/jB1GV7k2F2RIaXHNwnQ4b4Et3FzOX7y5ONUhlwtlgHfIsr3mtQkmQ0cRhzV6Sub9dwC0RckDjqRGd/cV81uWr444KK1F+XSxLU4M5+8am6zO3PDApkyYblfq54FzfbrmgrNaZ2VREVoS7SryW2cxmPPTQbBaaKAu8AZ6HIDgYzDGk54Q8W8XvK0UdAj9fPvFNHuOTJw/1HPGUcLcIDJebBSdZzg5q9hPAOv2MK+fyqfyTx5AcMJnbvitSncT5qie+OX6ZPZrXphxBv29PUPNv94f4czMk1gTvuxVyOPwP3qkyDMA2thRu/SXtE+EW/q1M9lQCXAxBU+wi+QDydxCbYs8rmF0V+dCaOdZEcEtE03l73BK8/MczX4sP3HkcoAsttkD8oXoCdo8I2nxeVqx2YlI6928ayxospLzMQQlaCy4zfAYrYyE5VqEDoS84fxJkO4aHJJDRSFJ90U0BwwLkBVRsa4t8U="
  on:
    tags: true
    distributions: sdist bdist_wheel
    repo: clemfromspace/scrapy-selenium

notifications:
  email: false
13 changes: 13 additions & 0 deletions LICENCE
@@ -0,0 +1,13 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2018 Clément Denoix <clement.denoix@gmail.com>

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include requirements/requirements.txt
80 changes: 80 additions & 0 deletions README.md
@@ -0,0 +1,80 @@
# Scrapy with selenium
[![PyPI](https://img.shields.io/pypi/v/scrapy-selenium.svg)](https://pypi.python.org/pypi/scrapy-selenium) [![Build Status](https://travis-ci.org/clemfromspace/scrapy-selenium.svg?branch=master)](https://travis-ci.org/clemfromspace/scrapy-selenium) [![Test Coverage](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/test_coverage)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/test_coverage) [![Maintainability](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/maintainability)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/maintainability)

Scrapy middleware to handle JavaScript pages using Selenium.

## Installation
```
$ pip install scrapy-selenium
```

You will also need one of the Selenium [compatible browsers](http://www.seleniumhq.org/about/platforms.jsp).

## Configuration
1. Add the browser to use, the path to the executable, and the arguments to pass to the executable to the scrapy settings:
```python
from shutil import which

SELENIUM_DRIVER_NAME='firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH=which('geckodriver')
SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox
```

2. Add the `SeleniumMiddleware` to the downloader middlewares:
```python
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}
```
## Usage
Use `scrapy_selenium.SeleniumRequest` instead of the Scrapy built-in `Request`, as shown below:
```python
from scrapy_selenium import SeleniumRequest

yield SeleniumRequest(url, self.parse_result)
```
The request will be handled by Selenium, and the response will have an additional `meta` key named `driver`, containing the Selenium driver that processed the request.
```python
def parse_result(self, response):
    print(response.meta['driver'].title)
```
For more information about the available driver methods and attributes, refer to the [selenium python documentation](http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webdriver).
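
As a rough illustration (the `load-more` element id and the JavaScript snippet below are hypothetical placeholders, not part of this project), a callback can use the driver directly:
```python
def parse_result(self, response):
    driver = response.meta['driver']

    # Plain WebDriver attributes and methods are available on the driver
    print(driver.current_url)

    # Interact with the rendered page (hypothetical element id)
    driver.find_element_by_id('load-more').click()

    # Run JavaScript in the page context
    print(driver.execute_script('return document.title;'))
```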

The `selector` response attribute works as usual (but contains the HTML processed by the Selenium driver).
```python
def parse_result(self, response):
    print(response.selector.xpath('//title/text()'))
```

### Additional arguments
The `scrapy_selenium.SeleniumRequest` accepts 3 additional arguments:

#### `wait_time` / `wait_until`

When used, Selenium will perform an [explicit wait](http://selenium-python.readthedocs.io/waits.html#explicit-waits) before returning the response to the spider.
```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

yield SeleniumRequest(
    url,
    self.parse_result,
    wait_time=10,
    wait_until=EC.element_to_be_clickable((By.ID, 'someid'))
)
```

#### `screenshot`
When used, Selenium will take a screenshot of the page, and the binary data of the captured .png will be added to the response `meta`:
```python
yield SeleniumRequest(
    url,
    self.parse_result,
    screenshot=True
)

def parse_result(self, response):
    with open('image.png', 'wb') as image_file:
        image_file.write(response.meta['screenshot'])
```

6 changes: 6 additions & 0 deletions requirements/requirements-test.txt
@@ -0,0 +1,6 @@
-r requirements.txt

pytest==3.4.0
coverage<4.4
pytest-cov==2.4.0
codeclimate-test-reporter==0.2.3
2 changes: 2 additions & 0 deletions requirements/requirements.txt
@@ -0,0 +1,2 @@
scrapy>=1.0.0
selenium>=3.9.0
2 changes: 2 additions & 0 deletions scrapy_selenium/__init__.py
@@ -0,0 +1,2 @@
from .http import SeleniumRequest
from .middlewares import SeleniumMiddleware
29 changes: 29 additions & 0 deletions scrapy_selenium/http.py
@@ -0,0 +1,29 @@
"""This module contains the ``SeleniumRequest`` class"""

from scrapy import Request


class SeleniumRequest(Request):
    """Scrapy ``Request`` subclass providing additional arguments"""

    def __init__(self, wait_time=None, wait_until=None, screenshot=False, *args, **kwargs):
        """Initialize a new selenium request
        Parameters
        ----------
        wait_time: int
            The number of seconds to wait.
        wait_until: method
            One of the "selenium.webdriver.support.expected_conditions". The response
            will not be returned until the given condition is fulfilled.
        screenshot: bool
            If True, a screenshot of the page will be taken and the data of the screenshot
            will be returned in the response "meta" attribute.
        """

        self.wait_time = wait_time
        self.wait_until = wait_until
        self.screenshot = screenshot

        super().__init__(*args, **kwargs)
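
For context, the downloader middleware (its `middlewares.py` is not shown in this excerpt) presumably consumes these attributes roughly as follows. This is only a sketch under that assumption, using Selenium's `WebDriverWait` and `get_screenshot_as_png()`, not the project's actual `process_request` code:
```python
from scrapy.http import HtmlResponse
from selenium.webdriver.support.ui import WebDriverWait


def handle_selenium_request(driver, request):
    """Hypothetical handling of a SeleniumRequest by a downloader middleware."""
    driver.get(request.url)

    if request.wait_until:
        # Block until the expected condition holds (wait_time is assumed to be
        # set whenever wait_until is used); raises TimeoutException otherwise.
        WebDriverWait(driver, request.wait_time).until(request.wait_until)

    if request.screenshot:
        # get_screenshot_as_png() returns the raw PNG bytes of the current page.
        request.meta['screenshot'] = driver.get_screenshot_as_png()

    return HtmlResponse(
        driver.current_url,
        body=driver.page_source,
        encoding='utf-8',
        request=request
    )
```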

1 comment on commit 9cb9d40

@bombs-kim


In the `process_request` method in scrapy_selenium/middlewares.py,
why do you call `driver.add_cookie()` after `driver.get()`? It seems it should be the opposite.
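
For what it's worth, Selenium only allows `add_cookie()` for the domain of the page that is currently loaded, which is one plausible reason for that ordering. A minimal sketch of the constraint (the URL and cookie values are made up):
```python
from selenium import webdriver

driver = webdriver.Firefox()

# add_cookie() only accepts cookies for the domain of the currently loaded page,
# so a first get() is required before cookies can be attached.
driver.get('https://example.com')
driver.add_cookie({'name': 'session', 'value': 'abc123'})

# Reload so the request is actually sent with the cookie.
driver.get('https://example.com')
print(driver.get_cookies())

driver.quit()
```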
