From 98cd395ab385324259ede3a0d573055558f9fc72 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 25 May 2023 21:56:41 -0400 Subject: [PATCH 1/3] BF: use pygithub directly using hub.oauthtoken credential DataLad core was heavily refactored, which makes it easier to use pygithub directly here. To avoid messing with the credentials system, just obtain the token from the config --- datalad_crawler/pipelines/gh.py | 12 ++++++------ datalad_crawler/pipelines/tests/test_gh.py | 17 ++++++++--------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/datalad_crawler/pipelines/gh.py b/datalad_crawler/pipelines/gh.py index d36183b..d47dd1e 100644 --- a/datalad_crawler/pipelines/gh.py +++ b/datalad_crawler/pipelines/gh.py @@ -14,6 +14,8 @@ import re +from datalad import cfg + from datalad.api import Dataset, install from datalad.support import path as op from datalad.support.gitrepo import GitRepo @@ -71,8 +73,6 @@ def pipeline(org=None, drop_data = assure_bool(drop_data) import github as gh - # TODO: consider elevating that function to a "public" helper - from datalad.support.github_ import _gen_github_entity superds = Dataset('.') if metadata_nativetypes: metadata_nativetypes = assure_list_from_str(metadata_nativetypes, sep=',') @@ -80,10 +80,10 @@ def pipeline(org=None, aggregate_later = [] def crawl_github_org(data): assert list(data) == ['datalad_stats'], data - # TODO: actually populate the datalad_stats with # of datasets and - # possibly amount of data downloaded in get below - # Needs DataLad >= 0.13.6~7^2~3 where password was removed - entity, cred = next(_gen_github_entity(None, org)) + + # TODO: redo with proper integration + g = gh.Github(cfg.obtain('hub.oauthtoken')) + entity = g.get_organization(org) all_repos = list(entity.get_repos(repo_type)) for repo in all_repos: diff --git a/datalad_crawler/pipelines/tests/test_gh.py b/datalad_crawler/pipelines/tests/test_gh.py index 6c779f3..4630cea 100644 --- a/datalad_crawler/pipelines/tests/test_gh.py +++ 
b/datalad_crawler/pipelines/tests/test_gh.py @@ -1,19 +1,15 @@ from datalad.utils import chpwd +from datalad import cfg from datalad.api import ( crawl, crawl_init, create, ) try: - from datalad.support.github_ import _get_github_cred + import github except ImportError: - # might be dated which has not merged - # https://github.com/datalad/datalad/pull/4400 yet - from datalad.downloaders.credentials import UserPassword - def _get_github_cred(): - """Trimmed down helper""" - return UserPassword("github", "does not matter") + github = None from datalad.tests.utils_pytest import ( assert_false, @@ -27,8 +23,11 @@ def _get_github_cred(): @skip_if_no_network @with_tempfile def test_crawl(tempd=None): - if not _get_github_cred().is_known: - pytest.skip("no github credential") + if not github: + pytest.skip("no github package") + # ATM tests completely overload HOME so TODO: do make it use credentials system + if not cfg.get('hub.oauthtoken'): + pytest.skip("no github credentials") ds = create(tempd) with chpwd(tempd): crawl_init( From 9b762fc4e2611d8400f0f053be0b97671ef3d9c4 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 25 May 2023 22:09:29 -0400 Subject: [PATCH 2/3] Use "default" github token if stored --- datalad_crawler/pipelines/gh.py | 16 +++++++++++++++- datalad_crawler/pipelines/tests/test_gh.py | 5 +++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/datalad_crawler/pipelines/gh.py b/datalad_crawler/pipelines/gh.py index d47dd1e..8b49b7b 100644 --- a/datalad_crawler/pipelines/gh.py +++ b/datalad_crawler/pipelines/gh.py @@ -17,6 +17,7 @@ from datalad import cfg from datalad.api import Dataset, install +from datalad.downloaders.credentials import Token from datalad.support import path as op from datalad.support.gitrepo import GitRepo from datalad.utils import ( @@ -33,6 +34,19 @@ lgr = getLogger("datalad.crawler.pipelines.github") +def _get_github_token(obtain=False) -> str: + # Quick and dirty adapter which would use stored Token if 
was stored in credentials + # or just access in cfg if present + try: + token = Token('api.github.com')()['token'] + if not token: + raise ValueError("Empty value for token is stored") + return token + except Exception as exc: + lgr.warning("Failed to get api.github.com credential: %s", exc) + return (cfg.obtain if obtain else cfg.get)('hub.oauthtoken') + + def pipeline(org=None, repo_type='sources', include='.*', exclude=None, metadata_nativetypes=None, aggregate=False, @@ -82,7 +96,7 @@ def crawl_github_org(data): assert list(data) == ['datalad_stats'], data # TODO: redo with proper integration - g = gh.Github(cfg.obtain('hub.oauthtoken')) + g = gh.Github(_get_github_token(obtain=True)) entity = g.get_organization(org) all_repos = list(entity.get_repos(repo_type)) diff --git a/datalad_crawler/pipelines/tests/test_gh.py b/datalad_crawler/pipelines/tests/test_gh.py index 4630cea..1072c43 100644 --- a/datalad_crawler/pipelines/tests/test_gh.py +++ b/datalad_crawler/pipelines/tests/test_gh.py @@ -19,14 +19,15 @@ ) import pytest +from ..gh import _get_github_token @skip_if_no_network @with_tempfile def test_crawl(tempd=None): if not github: pytest.skip("no github package") - # ATM tests completely overload HOME so TODO: do make it use credentials system - if not cfg.get('hub.oauthtoken'): + # set DATALAD_TESTS_CREDENTIALS=system to use system credentials + if not _get_github_token(obtain=False): pytest.skip("no github credentials") ds = create(tempd) with chpwd(tempd): From 66e9b2fbafc483396a3fd285a6948b08abc114bf Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 25 May 2023 22:26:53 -0400 Subject: [PATCH 3/3] skip repo if no commits --- datalad_crawler/pipelines/gh.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datalad_crawler/pipelines/gh.py b/datalad_crawler/pipelines/gh.py index 8b49b7b..7e6ee7a 100644 --- a/datalad_crawler/pipelines/gh.py +++ b/datalad_crawler/pipelines/gh.py @@ -121,6 +121,14 @@ def crawl_github_org(data): # etc, we will 
just skip for now continue + # See if it has anything committed - we will not clone empty ones + try: + if not any(repo.get_commits()): + raise ValueError("no commits") + except Exception as exc: + lgr.info("Skipping %s since: %s", name, exc) + continue + # TODO: all the recursive etc options try: ds = superds.install(