updated package name, README.md and patch version tag

deshrit · Jun 6, 2023 · 29c617c · 29c617c
1 parent 5df8708
commit 29c617c
Show file tree

Hide file tree

Showing 16 changed files with 707 additions and 0 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,29 @@
+name: Tests
+on:
+  push:
+    branches:
+    - main
+
+jobs:
+  tests:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: 
+        os: [ubuntu-latest, windows-latest]
+        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip setuptools
+        pip install tox tox-gh-actions
+    
+    - name: Running tests with tox
+      run: tox
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,22 @@
+.env/
+env/
+.venv/
+venv/
+
+.python-version
+
+*/__pycache__
+*.pyc
+
+.vscode
+
+build/
+dist/
+*.egg-info/
+
+
+.tox
+htmlcov/
+.coverage
+.mypy_cache
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,15 @@
+repos:
+- repo: https://github.com/psf/black
+  rev: 23.3.0
+  hooks:
+  - id: black
+
+- repo: https://github.com/PyCQA/flake8 
+  rev: 6.0.0
+  hooks:
+  - id: flake8
+
+- repo: https://github.com/pre-commit/mirrors-mypy
+  rev: v1.3.0  # Use the sha / tag you want to point at
+  hooks:
+  - id: mypy
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include *.md LISCENSE
diff --git a/README.md b/README.md
@@ -1,2 +1,88 @@
 # just-another-imgscrapper
+![](https://github.com/deshrit/just-another-imgscrapper/actions/workflows/tests.yml/badge.svg)
+
 A utility for scrapping images from a HTML doc.
+
+Uses `asyncio` for fast concurrent download.
+
+## Installation
+```bash
+$ pip install just-another-imgscrapper
+```
+## Usage
+### 1. From cli
+```bash
+$ imgscrapper -h
+```
+To fetch an HTML document, extract the image links from the `src` attribute of its `<img>` tags, and download them:
+```
+$ imgscrapper "http://foo.com/bar"
+[2023-06-06 23:22:56] imgscrapper.utils:INFO: ### Initializing Scrapping ###
+[2023-06-06 23:23:01] imgscrapper.utils:INFO: ### Downloaded 41 images out of extracted 41 links ###
+```
+Images are downloaded to the `imgs/` directory in the working directory. If the directory does not exist, it is created.
+
+### 2. From module
+```python
+>>> from imgscrapper import ImgScrapper
+>>> d = ImgScrapper()
+>>> d.download("http://foo.com/bar") 
+3
+```
+Specify path to store downloaded images.
+```python
+>>> d = ImgScrapper()
+>>> d.url = "http://foo.com/bar"
+>>> d.path = "/path/download"
+>>> d.download() # returns no. of successful downloads
+3
+```
+Some servers will block scraping. Respect `robots.txt` and only use this tool on hosts that allow it.
+
+You can add request headers.
+```python
+>>> ...
+>>> d.request_header = {
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0',
+    'DNT': '1',
+    }
+>>> ...
+```
+You can select only specific `img` tags by specifying attributes of the HTML element.
+```html
+<!-- http://helloworld.com -->
+<html>
+    <body>
+        <img src="https://foo.com/bar.png" class="apple ball">
+        <img src="/foo.jpg" class="cat bar">
+    </body>
+</html>
+```
+To select only images with `class: cat`
+```python
+>>> d = ImgScrapper()
+>>> d.url = "http://helloworld.com"
+>>> d.attrs = {
+    'class': 'cat',
+    }
+>>> d.download()  # downloads http://helloworld.com/foo.jpg
+1
+```
+The downloader gives each downloaded image a unique `uuid` file name, preserving the image's extension.
+```python
+>>> d = ImgScrapper(
+    url = "http://helloworld.com",
+    attrs = {'class': 'cat'},
+    max = 5,
+    path = "/home/images"
+)
+>>> d.download()
+5
+```
+You can limit the number of images downloaded with the `max` value.
+
+## License
+`just-another-imgscrapper` is released under the MIT license. See LISCENSE for details.
+
+## Contact
+Follow me on Twitter [@deshritbaral](https://twitter.com/deshritbaral)
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,67 @@
+[metadata]
+name = just-another-imgscrapper
+version = 0.1.0
+description = A utility for scrapping images from a HTML doc from a URL.
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/deshrit/just-another-imgscrapper
+author = Deshrit Baral
+author_email = deshritbaral@gmail.com
+license = MIT
+license_files = LICENSE
+keywords = image, scrapper, asyncio, httpx, beautifulsoup4, lxml
+classifiers =
+    Programming Language :: Python
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+
+[options]
+package_dir =
+    = src
+packages = find:
+install_requires = 
+    aiofiles >= 23.1.0
+    beautifulsoup4 >= 4.12.2
+    httpx >= 0.24.1
+    lxml >= 4.9.2
+python_requires = >=3.7
+
+[options.extras_require]
+dev = 
+    coverage
+    flake8
+    mypy
+    pre-commit
+    tox
+
+[options.packages.find]
+where = src
+
+[options.entry_points]
+console_scripts =
+    imgscrapper = imgscrapper.__main__:main
+
+[flake8]
+max-line-length = 88
+per-file-ignores = 
+    */__init__.py: F401
+
+[mypy]
+warn_return_any = True
+warn_unused_configs = True
+
+[mypy-aiofiles]
+ignore_missing_imports = True
+
+[mypy-bs4]
+ignore_missing_imports = True
+
+[mypy-httpx]
+ignore_missing_imports = True
+
+[mypy-lxml]
+ignore_missing_imports = True
diff --git a/setup.py b/setup.py
@@ -0,0 +1,4 @@
+from setuptools import setup
+
+# setup() is called without arguments: setuptools reads the declarative
+# package configuration (metadata, options, entry points) from setup.cfg.
+if __name__ == "__main__":
+    setup()
diff --git a/src/imgscrapper/__init__.py b/src/imgscrapper/__init__.py
@@ -0,0 +1,12 @@
+# __init__.py
+
+__version__ = "0.1.1"
+
+from .imgscrapper import ImgScrapper
+
+from .errors import (
+    ImgScrapperError,
+    InvalidURLError,
+    HTMLDocFetchError,
+    NoImgTagError,
+)
diff --git a/src/imgscrapper/__main__.py b/src/imgscrapper/__main__.py
@@ -0,0 +1,85 @@
+# __main__.py
+
+import sys
+import argparse
+
+from typing import Optional
+from typing import Sequence
+
+from .utils import get_logger
+import imgscrapper
+
+
+# logger
+logger = get_logger()
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:
+    """Run the `imgscrapper` command-line interface.
+
+    Parses the command-line options, copies them onto an `ImgScrapper`
+    instance, starts the download and logs how many images were downloaded
+    out of the number of links extracted.
+
+    Args:
+        argv: Argument strings to parse. `None` (the default) makes
+            `argparse` read them from `sys.argv[1:]`.
+
+    Returns:
+        0 when the download completed, 1 when an exception was raised while
+        configuring or running the scrapper (the exception is logged, not
+        propagated). Argument parsing happens outside the `try` block.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "url",
+        help="URL link to scrape images from "
+        '(provide in quotes "" to avoid shell '
+        "special keywords conflict)",
+    )
+    parser.add_argument(
+        "-c",
+        metavar="CLASS",
+        action="append",
+        default=[],
+        help="HTML class attribute - chain to add multiple classes",
+    )
+    parser.add_argument(
+        "-a",
+        "--attr",
+        action="append",
+        default=[],
+        help="Used as `imgscrapper -a 'id' -a 'test' {url}` selects "
+        "<img src='foo.jpg' id='test'> ,can be further chained for "
+        "more but first is the unique attribute",
+    )
+    parser.add_argument("-p", "--path", help="Image download directory")
+    parser.add_argument(
+        "-m", "--max", type=int, help="Maximum number of images to download"
+    )
+    args = parser.parse_args(argv)
+
+    try:
+        scrapper = imgscrapper.ImgScrapper()
+        scrapper.url = args.url
+
+        # `-c` and `-a` fill the same dict, so both filters end up in
+        # `scrapper.attrs`; an `-a class ...` overwrites the `-c` value.
+        attrs = {}
+        if args.c:
+            attrs["class"] = " ".join(args.c)
+            scrapper.attrs = attrs
+
+        if args.attr:
+            # The first `-a` value is the attribute name; every following
+            # value is joined with spaces to form that attribute's value.
+            key = args.attr[0]
+            value = " ".join(args.attr[1:])
+            attrs[key] = value
+            scrapper.attrs = attrs
+
+        if args.path:
+            scrapper.path = args.path
+
+        # NOTE(review): truthiness test, so `--max 0` is treated the same as
+        # omitting `--max` and `scrapper.max` is not assigned here.
+        if args.max:
+            scrapper.max = args.max
+
+        logger.info("### Initializing Scrapping ###")
+        count = scrapper.download()
+        logger.info(
+            f"### Downloaded {count} images out of extracted "
+            f"{len(scrapper.img_urls)} links ###"
+        )
+
+    # Top-level boundary of the CLI: log the failure and report it through
+    # the exit code instead of a traceback.
+    except Exception as e:
+        logger.error(e)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/imgscrapper/errors.py b/src/imgscrapper/errors.py
@@ -0,0 +1,24 @@
+# errors.py
+
+"""
+Exception classes for `imgscrapper` module.
+"""
+
+
+class ImgScrapperError(Exception):
+    """Base class for the exceptions defined in this module."""
+
+
+class InvalidURLError(ImgScrapperError):
+    """Exception for an invalid URL of any kind."""
+
+
+class HTMLDocFetchError(ImgScrapperError):
+    """Exception for an error fetching the HTML response from the URL."""
+
+
+class NoImgTagError(ImgScrapperError):
+    """
+    Exception for the case where no image tags matching the given search
+    condition are found in the HTML markup downloaded from the URL.
+    """