From 59dbad171e1df5c5d68a8699c8334112b288572f Mon Sep 17 00:00:00 2001 From: rex <1073853456@qq.com> Date: Mon, 18 Mar 2024 09:01:47 +0800 Subject: [PATCH 1/2] files generate by cookiecutter --- .gitignore | 106 ++++++++++++++++++ AUTHORS.md | 9 ++ LICENSE | 22 ++++ README-EN.md | 6 + hf_mirror_fetch/__init__.py | 5 + hf_mirror_fetch/cli.py | 16 +++ .../mirror_download.py | 0 hfmf.py | 60 ++++++++++ requirements_dev.txt | 6 + setup.cfg | 20 ++++ setup.py | 49 ++++++++ tests/__init__.py | 1 + tests/test_hf_mirror_fetch.py | 36 ++++++ 13 files changed, 336 insertions(+) create mode 100644 .gitignore create mode 100644 AUTHORS.md create mode 100644 LICENSE create mode 100644 README-EN.md create mode 100644 hf_mirror_fetch/__init__.py create mode 100644 hf_mirror_fetch/cli.py rename mirror_download.py => hf_mirror_fetch/mirror_download.py (100%) create mode 100644 hfmf.py create mode 100644 requirements_dev.txt create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 tests/__init__.py create mode 100644 tests/test_hf_mirror_fetch.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c915d1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,106 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# IDE settings +.vscode/ +.idea/ diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 0000000..92085c3 --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,9 @@ +# Credits + +## Development Lead + +- Qing + +## Contributors + +- Rex Wang <1073853456@qq.com> diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ba48b5c --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2024, Qing + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/README-EN.md b/README-EN.md new file mode 100644 index 0000000..08d9422 --- /dev/null +++ b/README-EN.md @@ -0,0 +1,6 @@ +# hf-mirror-download +A command-line tool designed to streamline the process of downloading machine learning models and related files from the Hugging Face model hub mirror site. + +https://hf-mirror.com/ + + diff --git a/hf_mirror_fetch/__init__.py b/hf_mirror_fetch/__init__.py new file mode 100644 index 0000000..d7012dd --- /dev/null +++ b/hf_mirror_fetch/__init__.py @@ -0,0 +1,5 @@ +"""Top-level package for hf-mirror-fetch.""" + +__author__ = """Qing""" +__email__ = 'aqsz2526@outlook.com' +__version__ = '0.1.0' diff --git a/hf_mirror_fetch/cli.py b/hf_mirror_fetch/cli.py new file mode 100644 index 0000000..98cff60 --- /dev/null +++ b/hf_mirror_fetch/cli.py @@ -0,0 +1,16 @@ +"""Console script for hf_mirror_fetch.""" +import sys +import click + + +@click.command() +def main(args=None): + """Console script for hf_mirror_fetch.""" + click.echo("Replace this message by putting your code into " + "hf_mirror_fetch.cli.main") + click.echo("See click documentation at https://click.palletsprojects.com/") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) # pragma: no cover diff --git a/mirror_download.py b/hf_mirror_fetch/mirror_download.py similarity index 100% rename from mirror_download.py rename to hf_mirror_fetch/mirror_download.py diff --git a/hfmf.py b/hfmf.py new file mode 100644 index 0000000..07a6694 --- /dev/null +++ b/hfmf.py @@ -0,0 +1,60 @@ +"""Main module.""" +import os +import json +import requests +import click +from tqdm import tqdm +from bs4 import BeautifulSoup +from urllib.parse import unquote, quote, quote_plus + +ROOT = "https://hf-mirror.com" + +def get_next_page_items(soup, url): + obj = soup.find_all('div', attrs={'data-target': "ViewerIndexTreeList"}) + data_props = json.loads(obj[0]['data-props']) + current_items = data_props['entries'] + next_page_url = data_props['nextURL'] + if next_page_url is not None: + data = requests.get(f"{ROOT}{next_page_url}").json() + all_items = current_items + data + else: + all_items = current_items + + download_url = url.replace('tree/main', 'resolve/main') + url2names = [(f"{download_url}/{item['path']}?download=true", item['path']) for item in all_items if item['type'] == 'file'] + return url2names + +def get_url2names(url): + response = requests.get(url) + soup = BeautifulSoup(response.text, 'html.parser') + a_tags = soup.find_all(title="Download file") + url2names = [(ROOT + a_tag['href'], unquote(a_tag['href'].replace('?download=true', '').split('resolve/main/')[-1])) for a_tag in a_tags] + all_url2names = get_next_page_items(soup, url) + for item in url2names: + assert item in all_url2names + return all_url2names + +def save_with_wget(url, file): + os.system(f"wget -c {url} -O {file}") + +@click.command() +@click.argument('url') +@click.option('--tgt_folder', default=None, help='Target folder to save the files. Defaults to model name.') +@click.option('--update', is_flag=True, help='Update existing files except weights.') +def download_from_mirror_page(url, tgt_folder, update): + if not url.startswith(ROOT): + assert url.startswith("https://huggingface.co"), "URL must be from hf-mirror.com or huggingface.co" + url = url.replace("huggingface.co", "hf-mirror.com") + url2names = get_url2names(url) + if tgt_folder is None: + tgt_folder = os.path.join(".", url.replace(ROOT + "/", '').replace('/tree/main', '')) + os.makedirs(tgt_folder, exist_ok=True) + click.echo(f'Saving to {tgt_folder}\n') + for url, name in url2names: + if update and os.path.exists(os.path.join(tgt_folder, name)): + click.echo(f"Skipping {name} as it exists") + continue + save_with_wget(url, os.path.join(tgt_folder, name)) + +if __name__ == '__main__': + download_from_mirror_page() diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000..8b3c4ec --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1,6 @@ +pip==19.2.3 +wheel==0.33.6 +coverage==4.5.4 +Click==7.1.2 +pytest==6.2.4 + diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..1fa1592 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,20 @@ +[bumpversion] +current_version = 0.1.0 +commit = True +tag = True + +[bumpversion:file:setup.py] +search = version='{current_version}' +replace = version='{new_version}' + +[bumpversion:file:hf_mirror_fetch/__init__.py] +search = __version__ = '{current_version}' +replace = __version__ = '{new_version}' + +[bdist_wheel] +universal = 1 + +[flake8] +exclude = docs +[tool:pytest] +addopts = --ignore=setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..b3fb872 --- /dev/null +++ b/setup.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +"""The setup script.""" + +from setuptools import setup, find_packages + +with open('README.rst') as readme_file: + readme = readme_file.read() + +with open('HISTORY.rst') as history_file: + history = history_file.read() + +requirements = ['Click>=7.0', ] + +test_requirements = ['pytest>=3', ] + +setup( + author="Qing", + author_email='aqsz2526@outlook.com', + python_requires='>=3.6', + classifiers=[ + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + ], + description="A command-line tool designed to streamline the process of downloading machine learning models and related files from the Hugging Face model hub mirror site.", + entry_points={ + 'console_scripts': [ + 'hf_mirror_fetch=hf_mirror_fetch.cli:main', + ], + }, + install_requires=requirements, + license="MIT license", + long_description=readme + '\n\n' + history, + include_package_data=True, + keywords='hf_mirror_fetch', + name='hf_mirror_fetch', + packages=find_packages(include=['hf_mirror_fetch', 'hf_mirror_fetch.*']), + test_suite='tests', + tests_require=test_requirements, + url='https://github.com/Qing25/hf_mirror_fetch', + version='0.1.0', + zip_safe=False, +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e71e025 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Unit test package for hf_mirror_fetch.""" diff --git a/tests/test_hf_mirror_fetch.py b/tests/test_hf_mirror_fetch.py new file mode 100644 index 0000000..5157b9b --- /dev/null +++ b/tests/test_hf_mirror_fetch.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +"""Tests for `hf_mirror_fetch` package.""" + +import pytest + +from click.testing import CliRunner + +from hf_mirror_fetch import cli + + +@pytest.fixture +def response(): + """Sample pytest fixture. + + See more at: http://doc.pytest.org/en/latest/fixture.html + """ + # import requests + # return requests.get('https://github.com/audreyr/cookiecutter-pypackage') + + +def test_content(response): + """Sample pytest test function with the pytest fixture as an argument.""" + # from bs4 import BeautifulSoup + # assert 'GitHub' in BeautifulSoup(response.content).title.string + + +def test_command_line_interface(): + """Test the CLI.""" + runner = CliRunner() + result = runner.invoke(cli.main) + assert result.exit_code == 0 + assert 'hf_mirror_fetch.cli.main' in result.output + help_result = runner.invoke(cli.main, ['--help']) + assert help_result.exit_code == 0 + assert '--help Show this message and exit.' in help_result.output From 859636861333b3dfa0839a5b38d2791651e0ea4b Mon Sep 17 00:00:00 2001 From: rex <1073853456@qq.com> Date: Mon, 18 Mar 2024 09:36:40 +0800 Subject: [PATCH 2/2] add tests and workflows --- .github/ISSUE_TEMPLATE.md | 15 +++++++++ .github/workflows/publish.yml | 31 ++++++++++++++++++ .github/workflows/test.yml | 45 ++++++++++++++++++++++++++ hfmf.py | 60 ----------------------------------- requirements_dev.txt | 10 ++++-- setup.py | 11 +++---- 6 files changed, 103 insertions(+), 69 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE.md create mode 100644 .github/workflows/publish.yml create mode 100644 .github/workflows/test.yml delete mode 100644 hfmf.py diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..06be26b --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,15 @@ +* hf-mirror-fetch version: +* Python version: +* Operating System: + +### Description + +Describe what you were trying to get done. +Tell us what happened, what went wrong, and what you expected to happen. + +### What I Did + +``` +Paste the command(s) you ran and the output. +If there was a crash, please include the traceback here. +``` diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..e0ad872 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,31 @@ +name: Publish Package + +on: + push: + tags: + - '*' + +jobs: + build-and-publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Verify tag matches setup.py version + run: | + if [[ "$(git describe --tags)" != "$(grep "^VERSION" setup.py | cut -d"'" -f2)" ]]; then echo "Tag does not match setup.py version"; exit 0; fi + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish to PyPI + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python setup.py sdist bdist_wheel + twine check dist/* + twine upload dist/* --skip-existing diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..66e21fe --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,45 @@ +name: Python Package + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - python-version: "3.9" + os: ubuntu-latest + - python-version: "3.9" + os: macos-latest + # - python-version: "3.9" + # os: window + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install .[test] + python -m pip install -r requirements_dev.txt + - name: Test with pytest and coverage + run: | + pip install coverage + coverage run -m pytest tests/ + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + flags: unittests + name: codecov-umbrella + fail_ci_if_error: false diff --git a/hfmf.py b/hfmf.py deleted file mode 100644 index 07a6694..0000000 --- a/hfmf.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Main module.""" -import os -import json -import requests -import click -from tqdm import tqdm -from bs4 import BeautifulSoup -from urllib.parse import unquote, quote, quote_plus - -ROOT = "https://hf-mirror.com" - -def get_next_page_items(soup, url): - obj = soup.find_all('div', attrs={'data-target': "ViewerIndexTreeList"}) - data_props = json.loads(obj[0]['data-props']) - current_items = data_props['entries'] - next_page_url = data_props['nextURL'] - if next_page_url is not None: - data = requests.get(f"{ROOT}{next_page_url}").json() - all_items = current_items + data - else: - all_items = current_items - - download_url = url.replace('tree/main', 'resolve/main') - url2names = [(f"{download_url}/{item['path']}?download=true", item['path']) for item in all_items if item['type'] == 'file'] - return url2names - -def get_url2names(url): - response = requests.get(url) - soup = BeautifulSoup(response.text, 'html.parser') - a_tags = soup.find_all(title="Download file") - url2names = [(ROOT + a_tag['href'], unquote(a_tag['href'].replace('?download=true', '').split('resolve/main/')[-1])) for a_tag in a_tags] - all_url2names = get_next_page_items(soup, url) - for item in url2names: - assert item in all_url2names - return all_url2names - -def save_with_wget(url, file): - os.system(f"wget -c {url} -O {file}") - -@click.command() -@click.argument('url') -@click.option('--tgt_folder', default=None, help='Target folder to save the files. Defaults to model name.') -@click.option('--update', is_flag=True, help='Update existing files except weights.') -def download_from_mirror_page(url, tgt_folder, update): - if not url.startswith(ROOT): - assert url.startswith("https://huggingface.co"), "URL must be from hf-mirror.com or huggingface.co" - url = url.replace("huggingface.co", "hf-mirror.com") - url2names = get_url2names(url) - if tgt_folder is None: - tgt_folder = os.path.join(".", url.replace(ROOT + "/", '').replace('/tree/main', '')) - os.makedirs(tgt_folder, exist_ok=True) - click.echo(f'Saving to {tgt_folder}\n') - for url, name in url2names: - if update and os.path.exists(os.path.join(tgt_folder, name)): - click.echo(f"Skipping {name} as it exists") - continue - save_with_wget(url, os.path.join(tgt_folder, name)) - -if __name__ == '__main__': - download_from_mirror_page() diff --git a/requirements_dev.txt b/requirements_dev.txt index 8b3c4ec..9afe89c 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,6 +1,10 @@ pip==19.2.3 +bump2version==0.5.11 wheel==0.33.6 +watchdog==0.9.0 +flake8==3.7.8 +tox==3.14.0 coverage==4.5.4 -Click==7.1.2 -pytest==6.2.4 - +Sphinx==1.8.5 +twine==1.14.0 +pytest==6.2.4 \ No newline at end of file diff --git a/setup.py b/setup.py index b3fb872..8615201 100644 --- a/setup.py +++ b/setup.py @@ -4,11 +4,10 @@ from setuptools import setup, find_packages -with open('README.rst') as readme_file: - readme = readme_file.read() +VERSION = "0.1.0" -with open('HISTORY.rst') as history_file: - history = history_file.read() +with open('README.md') as readme_file: + readme = readme_file.read() requirements = ['Click>=7.0', ] @@ -36,7 +35,7 @@ }, install_requires=requirements, license="MIT license", - long_description=readme + '\n\n' + history, + long_description=readme, include_package_data=True, keywords='hf_mirror_fetch', name='hf_mirror_fetch', @@ -44,6 +43,6 @@ test_suite='tests', tests_require=test_requirements, url='https://github.com/Qing25/hf_mirror_fetch', - version='0.1.0', + version=VERSION, zip_safe=False, )