In [32]:
import httpx

# https://hugovk.github.io/top-pypi-packages/
top_pkg_url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
top_pkg_info = httpx.get(top_pkg_url).json()
top_counts = {entry['project']:entry['download_count'] for entry in top_pkg_info['rows']}

In [45]:
import httpx
from packaging.requirements import Requirement
from collections import defaultdict
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
# https://warehouse.pypa.io/api-reference/json.html
# https://pypi.org/pypi/tox/json
deps = defaultdict(lambda: defaultdict(list))

def load_package(pkg):
    info = httpx.get(f"https://pypi.org/pypi/{pkg}/json").json()["info"]
    for req_str in (info["requires_dist"] or []):
        req = Requirement(req_str)
        if req.marker is None or req.marker._markers is None:
            extras = tuple()
        else:
            extras = tuple(
                r.value
                for l, _, r in (i for i in req.marker._markers if isinstance(i, tuple) and len(i) == 3)
                if l.value == "extra"
            )
        deps[req.name][extras].append(pkg)

with tqdm(total=len(top_counts)) as bar:
    with ThreadPoolExecutor(max_workers=12) as executor:
        futures = [executor.submit(load_package, p) for p in top_counts]
        for future in as_completed(futures):
            bar.update()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [04:50<00:00, 17.22it/s]


In [60]:
counts = {pkg: sorted(((top_counts[k], k) for k in set(deps[pkg][tuple()])), reverse=True) for pkg in top_counts}

In [83]:
import pandas as pd

table = []
for pkg, values in counts.items():
    for at, (c, v) in enumerate(values, start=1):
        # note the percentage is a rhough estimate, and sometimes will be >100
        # e.g. when pkg A requires six and package B that also requires six
        table.append((pkg, at, v, c, 100*c/top_counts[pkg]))

pd.set_option('display.max_rows', len(table))
pd.DataFrame(table)

Unnamed: 0,0,1,2,3,4
0,urllib3,1,requests,114176250,73.68619
1,urllib3,2,elasticsearch,6702368,4.325523
2,urllib3,3,kubernetes,4636242,2.992102
3,urllib3,4,azureml-core,3675943,2.372352
4,urllib3,5,responses,3358773,2.167659
5,urllib3,6,geoip2,1871237,1.207645
6,urllib3,7,azure-cli-core,1847353,1.19223
7,urllib3,8,instana,714547,0.461149
8,urllib3,9,pyppeteer,686619,0.443125
9,urllib3,10,minio,665468,0.429475
