In [17]:
import requests
from functools import lru_cache
from collections import defaultdict, Counter
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import json

In [2]:
# Fetch the JSON index of file paths published by the libcfgraph repository.
FILE_LISTING = requests.get(
    'https://raw.githubusercontent.com/regro/libcfgraph/master/.file_listing.json'
).json()
# TODO: upstream this to libcfgraph so we just request it, so we reduce bandwidth requirements
# Key: last path segment with its final extension removed.
# Value: second path segment, which the rest of the notebook treats as the package name.
# When two paths share a key, the later entry in FILE_LISTING wins.
ARTIFACT_TO_PKG = dict(
    (path.split('/')[-1].rsplit('.', 1)[0], path.split('/')[1])
    for path in FILE_LISTING
)

In [3]:
# Names of the import-map shards: for each listed path containing 'import_maps/',
# keep the last path segment up to (not including) its first dot.
sub_listing = [
    path.rsplit('/', 1)[-1].partition('.')[0]
    for path in FILE_LISTING
    if 'import_maps/' in path
]

In [4]:
import json

# NOTE(review): absolute path on the original author's machine; point this at your
# own copy of ranked_hubs_authorities.json before running the notebook elsewhere.
HUBS_AUTHS_PATH = '/home/christopher/dev/conda-forge/cf-graph-countyfair/ranked_hubs_authorities.json'

# extract_pkg_from_import iterates this collection in order and returns the first
# entry that supplies the import, so its ordering decides which package "wins".
# The context manager closes the file handle as soon as parsing is done.
with open(HUBS_AUTHS_PATH) as hubs_auths_file:
    hubs_auths = json.load(hubs_auths_file)

In [5]:
import string
# Lowercase ASCII alphabet ('a'..'z'). Not referenced by any other cell visible in
# this part of the notebook.
alpha = string.ascii_lowercase

In [6]:
@lru_cache()
def _import_map_cache(import_first_two_letters):
    """Download one import-map shard from libcfgraph and memoise the result.

    Parameters
    ----------
    import_first_two_letters : str
        Shard name, i.e. the leading characters of the import names it holds.
        The URL is lower-cased before the request, but the cache is keyed on the
        argument exactly as passed, so callers should pass a lower-case value to
        avoid fetching the same shard twice.

    Returns
    -------
    dict
        Maps each import name in the shard to the set built from its
        ``'elements'`` entry (the artifacts that supply that import).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (e.g. the shard does not exist).
    """
    url = f'https://raw.githubusercontent.com/regro/libcfgraph/master/import_maps/{import_first_two_letters}.json'.lower()
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(url, timeout=60)
    # Surface HTTP errors directly instead of as a confusing JSON decode failure.
    response.raise_for_status()
    return {k: set(v['elements']) for k, v in response.json().items()}

In [11]:
def extract_pkg_from_import(name):
    """Find which artifacts and packages supply an import and pick the preferred one.

    Parameters
    ----------
    name : str
        Import name to look up; it must be a key of the import-map shard
        selected by its first two characters.

    Returns
    -------
    tuple
        ``(best, import_to_artifact, import_to_pkg, import_to_pkg_to_artifact)``:

        * ``best`` - first entry of ``hubs_auths`` (in iteration order) that is one
          of the supplying packages, or ``None`` when none of them is listed.
        * ``import_to_artifact`` - ``{name: set of supplying artifacts}``.
        * ``import_to_pkg`` - ``{name: set of supplying packages}``.
        * ``import_to_pkg_to_artifact`` - ``{name: defaultdict(package -> set of
          its artifacts that supply the import)}``.

    Raises
    ------
    KeyError
        If ``name`` is missing from its shard, or one of its artifacts is missing
        from ``ARTIFACT_TO_PKG``.
    """
    # Lower-case the shard key: the download URL is lower-cased anyway, and the
    # driving loop warms the cache with lower-case keys, so a mixed-case prefix
    # such as 'Py' would otherwise miss the cache and re-download the same shard.
    ftl = name[:2].lower()
    supplying_artifacts = _import_map_cache(ftl)[name]

    # Group the artifacts by owning package with one ARTIFACT_TO_PKG lookup each.
    pkgs_to_artifacts = defaultdict(set)
    for artifact in supplying_artifacts:
        pkgs_to_artifacts[ARTIFACT_TO_PKG[artifact]].add(artifact)
    supplying_pkgs = set(pkgs_to_artifacts)

    # hubs_auths is walked in its own order, so the earliest listed supplier wins.
    best = next((pkg for pkg in hubs_auths if pkg in supplying_pkgs), None)

    return (
        best,
        {name: supplying_artifacts},
        {name: supplying_pkgs},
        {name: pkgs_to_artifacts},
    )

In [12]:
# Import names that are supplied by at least one package other than the preferred one.
clobbered_imports = set()
# Preferred package -> artifacts (from other packages) that ship one of the same imports.
multi_defined_pkgs = defaultdict(set)

In [13]:
# Walk every import-map shard and record imports that are shipped by more than
# one package, keyed by the preferred ("best") supplier.
for import_map_name in tqdm(sub_listing):
    for import_name in _import_map_cache(import_map_name.lower()):
        # Skip anything living under a top-level tests/test/examples/example module.
        if import_name.startswith(('tests.', 'test.', 'examples.', 'example.')):
            continue
        best, i_to_arts, i_to_pkgs, import_to_pkg_to_artifact = extract_pkg_from_import(import_name)
        if best:
            # Despite the name, `pkgs` holds the *artifacts* of every supplying
            # package other than the preferred one.
            pkgs = set().union(
                *(
                    artifacts
                    for pkg, artifacts in import_to_pkg_to_artifact[import_name].items()
                    if pkg != best
                )
            )
            if pkgs:
                multi_defined_pkgs[best].update(pkgs)
                clobbered_imports.add(import_name)

100%|██████████| 513/513 [03:30<00:00,  2.44it/s]


In [14]:
# Number of import names that are also shipped by a package other than the preferred one.
len(clobbered_imports)

16611

In [15]:
# Number of preferred packages that have at least one competing artifact recorded.
len(multi_defined_pkgs)

385

In [16]:
# (package, number of competing artifacts) pairs, fewest first; ties keep insertion order.
sorted(
    ((pkg, len(artifacts)) for pkg, artifacts in multi_defined_pkgs.items()),
    key=lambda pair: pair[1],
)


[('mistune', 1),
 ('cached-property', 1),
 ('dbfread', 1),
 ('sk-video', 1),
 ('labjackpython', 1),
 ('regex', 1),
 ('defusedxml', 1),
 ('pyexcel-io', 1),
 ('prettytable', 1),
 ('privy', 1),
 ('progressbar', 1),
 ('prometheus_client', 1),
 ('dropbox', 1),
 ('intake-accumulo', 1),
 ('yodatools', 1),
 ('extras', 1),
 ('block_tracing', 1),
 ('python-gnupg', 1),
 ('creoleparser', 1),
 ('jeepney', 1),
 ('ipython', 1),
 ('ipykernel', 1),
 ('ipywidgets', 1),
 ('python-magic', 1),
 ('pysocks', 1),
 ('keyring', 1),
 ('keyrings.alt', 1),
 ('futures', 1),
 ('python-coreapi', 1),
 ('coreschema', 1),
 ('flatten_json', 1),
 ('aiida-core', 1),
 ('ruamel', 1),
 ('nbconvert', 1),
 ('nbformat', 1),
 ('argo-workflows', 1),
 ('sherlockml-boltzmannclean', 1),
 ('pickleshare', 1),
 ('diffpy.structure', 1),
 ('widgetsnbextension', 1),
 ('lml', 1),
 ('shapely', 1),
 ('hdijupyterutils', 1),
 ('wcwidth', 1),
 ('secretstorage', 1),
 ('send2trash', 1),
 ('sentry-sdk', 1),
 ('et_xmlfile', 1),
 ('sat-search', 1),
 

In [None]:
# Plain-dict view of the whole mapping. This cell has no execution count; running it
# would display every entry, so prefer looking up individual packages.
dict(multi_defined_pkgs)

In [18]:
# Count clobbered imports per top-level module (the text before the first dot).
c = Counter(dotted_name.split('.', 1)[0] for dotted_name in clobbered_imports)

In [19]:
# The ten top-level modules with the most clobbered import names.
c.most_common(10)

[('tensorflow', 2109),
 ('openquake', 804),
 ('pandas', 608),
 ('scipy', 577),
 ('silx', 565),
 ('numpy', 433),
 ('glue', 371),
 ('ansible', 363),
 ('theano', 347),
 ('hyperspy', 327)]

In [20]:
# Use .get() rather than indexing: multi_defined_pkgs is a defaultdict, so
# multi_defined_pkgs['pandas'] would insert an empty entry if the key were absent
# and silently change len(multi_defined_pkgs).
multi_defined_pkgs.get('pandas', set())

{'autovizwidget-0.12.6-py_1000'}

In [27]:
# Use .get() rather than indexing: multi_defined_pkgs is a defaultdict, so
# multi_defined_pkgs['pygments'] would insert an empty entry if the key were absent
# and silently change len(multi_defined_pkgs).
multi_defined_pkgs.get('pygments', set())

{'labelme-3.16.1-py37h47b7e1a_0',
 'pyproprop-0.4.1-py_0',
 'pyrobuf-0.9.0-py36h4a8c4bd_0',
 'ffx-1.3.4-py35_0',
 'trio_asyncio-0.11.0-py37hc8dfbb8_0',
 'pycroscopy-0.60.0-py27_0',
 'pyrobuf-0.8.5-py37h0a44026_1000',
 'plasmaboundaries-0.1.4-pyh9f0ad1d_0',
 'optuna-1.4.0-py_0',
 'pybtex-0.22.2-py37_0',
 'pyrobuf-0.9.0-py27h4a8c4bd_0',
 'andes-1.0.4-py37hc8dfbb8_0',
 'deepforest-0.3.0-py38hc84c608_0',
 'landlab-2.0.0b5-py36hc8d92b1_0',
 'fasttsne-0.2.7-py36h7eb728f_1',
 'yt_astro_analysis-1.0.0-py39h49efd1d_1001',
 'point_cloud_utils-0.13.0-py36hbf1eeb5_0',
 'cmreshandler-1.0.0-py36_0',
 'terrainbento-1.0.0-py_0',
 'otwrapy-0.7-py35_1',
 'xsum-1.1.1-py39h86d69c3_2',
 'otwrapy-0.9-pyh3d53f50_2',
 'pyrobuf-0.9.0-py37h6de7cb9_0',
 'pykafka-2.8.0-py38h43c8af4_1001',
 'igl-0.3-py36hd0167a9_0',
 'bamnostic-1.0.4-py_0',
 'pyrobuf-0.9.3-py36h831f99a_1',
 'pfapack-0.2.1-py36hc4c5385_0',
 'jaws-0.6.2-py37_0',
 'polyfempy-0.2.1-py37h6538335_1',
 'trio-asyncio-0.10.0-py37_0',
 'igl-0.3.1-py36h6dc7a