Merge pull request #136 from yarikoptic/bfs
Minimal surgery to get github orgs crawling working again
yarikoptic committed May 26, 2023
2 parents 0de99c3 + 66e9b2f commit 857ada2
Showing 2 changed files with 37 additions and 15 deletions.
34 changes: 28 additions & 6 deletions datalad_crawler/pipelines/gh.py
@@ -14,7 +14,10 @@
 import re
 
+from datalad import cfg
+
 from datalad.api import Dataset, install
+from datalad.downloaders.credentials import Token
 from datalad.support import path as op
 from datalad.support.gitrepo import GitRepo
 from datalad.utils import (
@@ -31,6 +34,19 @@
 lgr = getLogger("datalad.crawler.pipelines.github")
 
 
+def _get_github_token(obtain=False) -> str:
+    # Quick and dirty adapter which uses the stored Token for api.github.com
+    # if one is available, or falls back to the 'hub.oauthtoken' value in cfg
+    try:
+        token = Token('api.github.com')()['token']
+        if not token:
+            raise ValueError("Empty value for token is stored")
+        return token
+    except Exception as exc:
+        lgr.warning("Failed to get api.github.com credential: %s", exc)
+    return (cfg.obtain if obtain else cfg.get)('hub.oauthtoken')
+
+
 def pipeline(org=None,
              repo_type='sources', include='.*', exclude=None,
              metadata_nativetypes=None, aggregate=False,
@@ -71,19 +87,17 @@ def pipeline(org=None,
     drop_data = assure_bool(drop_data)
 
     import github as gh
-    # TODO: consider elevating that function to a "public" helper
-    from datalad.support.github_ import _gen_github_entity
     superds = Dataset('.')
     if metadata_nativetypes:
         metadata_nativetypes = assure_list_from_str(metadata_nativetypes, sep=',')
 
     aggregate_later = []
     def crawl_github_org(data):
         assert list(data) == ['datalad_stats'], data
         # TODO: actually populate the datalad_stats with # of datasets and
         # possibly amount of data downloaded in get below
-        # Needs DataLad >= 0.13.6~7^2~3 where password was removed
-        entity, cred = next(_gen_github_entity(None, org))
-
+        # TODO: redo with proper integration
+        g = gh.Github(_get_github_token(obtain=True))
+        entity = g.get_organization(org)
         all_repos = list(entity.get_repos(repo_type))
 
         for repo in all_repos:
@@ -107,6 +121,14 @@ def crawl_github_org(data):
                 # etc, we will just skip for now
                 continue
 
+            # See if it has anything committed - we will not clone empty ones
+            try:
+                if not any(repo.get_commits()):
+                    raise ValueError("no commits")
+            except Exception as exc:
+                lgr.info("Skipping %s since: %s", name, exc)
+                continue
+
             # TODO: all the recursive etc options
             try:
                 ds = superds.install(
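For reference, the new code path in gh.py reduces to the following standalone sketch. This is a sketch only: it assumes the PyGithub package is installed, that a token is resolvable via the api.github.com credential or the hub.oauthtoken config, and it uses a placeholder organization name.

    import github as gh

    from datalad_crawler.pipelines.gh import _get_github_token

    # Authenticate with whatever token _get_github_token() resolves:
    # the stored api.github.com Token credential first, then hub.oauthtoken.
    g = gh.Github(_get_github_token(obtain=True))
    org = g.get_organization("some-org")  # placeholder organization

    for repo in org.get_repos("sources"):
        # Same guard as in the pipeline: skip repositories with no commits.
        # GitHub errors out on listing commits of an empty repository,
        # hence the broad except.
        try:
            if not any(repo.get_commits()):
                continue
        except Exception:
            continue
        print(repo.clone_url)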
18 changes: 9 additions & 9 deletions datalad_crawler/pipelines/tests/test_gh.py
@@ -1,19 +1,15 @@
 from datalad.utils import chpwd
 
+from datalad import cfg
 from datalad.api import (
     crawl,
     crawl_init,
     create,
 )
 try:
-    from datalad.support.github_ import _get_github_cred
+    import github
 except ImportError:
-    # might be dated which has not merged
-    # https://github.com/datalad/datalad/pull/4400 yet
-    from datalad.downloaders.credentials import UserPassword
-    def _get_github_cred():
-        """Trimmed down helper"""
-        return UserPassword("github", "does not matter")
+    github = None
 
 from datalad.tests.utils_pytest import (
     assert_false,
@@ -23,12 +19,16 @@ def _get_github_cred():
 )
 import pytest
 
+from ..gh import _get_github_token
+
 @skip_if_no_network
 @with_tempfile
 def test_crawl(tempd=None):
-    if not _get_github_cred().is_known:
-        pytest.skip("no github credential")
+    if not github:
+        pytest.skip("no github package")
+    # set DATALAD_TESTS_CREDENTIALS=system to use system credentials
+    if not _get_github_token(obtain=False):
+        pytest.skip("no github credentials")
     ds = create(tempd)
     with chpwd(tempd):
         crawl_init(
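The rest of the test (truncated above) initializes and runs a crawl in a fresh dataset. For orientation, the equivalent flow from user code might look roughly like this; a hedged sketch, where the 'gh' template name, the args mapping, and the target path are assumptions based on the pipeline() signature above, not copied from the test.

    from datalad.api import create, crawl, crawl_init
    from datalad.utils import chpwd

    ds = create("/tmp/gh-crawl-demo")  # hypothetical target path
    with chpwd(ds.path):
        # Configure crawling of a GitHub organization with the 'gh' template
        crawl_init(template="gh",
                   args={"org": "some-org", "repo_type": "sources"},
                   save=True)
        crawl()  # run the configured pipeline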
