In [1]:
from pymongo import MongoClient
import pandas as pd

# Load every document of the radar.distribution_file_info collection into a
# DataFrame (the Mongo "_id" field is excluded by the projection).
dist_file_info = MongoClient("127.0.0.1", 27017)["radar"]["distribution_file_info"]
df = pd.DataFrame(dist_file_info.find({}, projection={"_id": 0}))
df["upload_time"] = pd.to_datetime(df["upload_time"])
print(len(df))

# Keep only rows whose package type is sdist / wheel / egg AND whose filename
# carries one of the matching archive extensions; report the size afterwards.
has_known_type = df["packagetype"].isin(["sdist", "bdist_wheel", "bdist_egg"])
has_known_suffix = df["filename"].str.endswith((".tar.gz", ".zip", ".whl", ".egg"))
df = df[has_known_type & has_known_suffix]
print(len(df))

7685017
7657061


In [2]:
# Number of files per package type in df, most frequent first.
packagetype_counts = df["packagetype"].value_counts()
packagetype_counts

bdist_wheel    3805987
sdist          3731536
bdist_egg       119538
Name: packagetype, dtype: int64

In [3]:
# Package-level coverage: the set of package names in df that have at least
# one sdist file, the set that have at least one built distribution
# (bdist_wheel or bdist_egg), and the packages present in both sets.
is_sdist = df["packagetype"] == "sdist"
is_bdist = df["packagetype"].isin(["bdist_wheel", "bdist_egg"])
sdist_packages = set(df.loc[is_sdist, "name"].unique())
bdist_packages = set(df.loc[is_bdist, "name"].unique())
both_packages = sdist_packages & bdist_packages

# both_packages is a subset of each set, so subtracting its size yields the
# "only" counts. (The label previously read "bsit"; corrected to "bdist".)
print(
    f"Number of sdist packages: {len(sdist_packages)}, Number of packages with bdist: {len(bdist_packages)}"
)
print(
    f"intersection: {len(both_packages)}, sdist-only packages: {len(sdist_packages) - len(both_packages)}, bdist-only packages: {len(bdist_packages) - len(both_packages)}"
)

Number of sdist packages: 377742, Number of packages with bsit: 283885
intersection: 246081, sdist-only packages: 131661, bdist-only packages: 37804


In [4]:
# Release-level coverage: a release is a unique (name, version) pair.
release_key = ["name", "version"]
sdist_releases = df.loc[df["packagetype"] == "sdist", release_key].drop_duplicates()
bdist_releases = df.loc[
    df["packagetype"].isin(["bdist_wheel", "bdist_egg"]), release_key
].drop_duplicates()

# Inner join on both key columns keeps the releases that ship an sdist and a
# built distribution.
both_releases = sdist_releases.merge(bdist_releases, how="inner", on=release_key)

n_sdist_releases = len(sdist_releases)
n_bdist_releases = len(bdist_releases)
n_both_releases = len(both_releases)
print(
    f"Number of sdist releases: {n_sdist_releases}, Number of bdist releases: {n_bdist_releases}"
)
print(
    f"intersection: {n_both_releases}, sdist-only releases: {n_sdist_releases - n_both_releases}, bdist-only releases: {n_bdist_releases - n_both_releases}"
)

Number of sdist releases: 3719068, Number of bdist releases: 2892007
intersection: 2419223, sdist-only releases: 1299845, bdist-only releases: 472784


In [5]:
# Restrict to files uploaded strictly after (newest upload_time in df - 1 year).
latest_upload = df["upload_time"].max()
recent_cutoff = latest_upload - pd.DateOffset(years=1)
recent_df = df[df["upload_time"] > recent_cutoff]

# Size of the recent window, then its per-package-type breakdown.
print(len(recent_df))
print()
print(recent_df["packagetype"].value_counts())
print()

# Package-level coverage within recent_df: names with at least one sdist file,
# names with at least one built distribution (bdist_wheel or bdist_egg), and
# the names present in both sets.
recent_is_sdist = recent_df["packagetype"] == "sdist"
recent_is_bdist = recent_df["packagetype"].isin(["bdist_wheel", "bdist_egg"])
recent_sdist_packages = set(recent_df.loc[recent_is_sdist, "name"].unique())
recent_bdist_packages = set(recent_df.loc[recent_is_bdist, "name"].unique())
recent_both_packages = recent_sdist_packages & recent_bdist_packages

# recent_both_packages is a subset of each set, so subtracting its size yields
# the "only" counts. (The label previously read "bsit"; corrected to "bdist".)
print(
    f"Number of sdist packages: {len(recent_sdist_packages)}, Number of packages with bdist: {len(recent_bdist_packages)}"
)
print(
    f"intersection: {len(recent_both_packages)}, sdist-only packages: {len(recent_sdist_packages) - len(recent_both_packages)}, bdist-only packages: {len(recent_bdist_packages) - len(recent_both_packages)}"
)
print()

# Release-level coverage within recent_df; a release is a unique
# (name, version) pair.
recent_release_key = ["name", "version"]
recent_sdist_releases = recent_df.loc[
    recent_df["packagetype"] == "sdist", recent_release_key
].drop_duplicates()
recent_bdist_releases = recent_df.loc[
    recent_df["packagetype"].isin(["bdist_wheel", "bdist_egg"]), recent_release_key
].drop_duplicates()

# Inner join on both key columns keeps the releases that ship an sdist and a
# built distribution.
recent_both_releases = recent_sdist_releases.merge(
    recent_bdist_releases, how="inner", on=recent_release_key
)

n_recent_sdist = len(recent_sdist_releases)
n_recent_bdist = len(recent_bdist_releases)
n_recent_both = len(recent_both_releases)
print(
    f"Number of sdist releases: {n_recent_sdist}, Number of bdist releases: {n_recent_bdist}"
)
print(
    f"intersection: {n_recent_both}, sdist-only releases: {n_recent_sdist - n_recent_both}, bdist-only releases: {n_recent_bdist - n_recent_both}"
)

2110910

bdist_wheel    1238021
sdist           864463
bdist_egg         8426
Name: packagetype, dtype: int64

Number of sdist packages: 118513, Number of packages with bsit: 112306
intersection: 96158, sdist-only packages: 22355, bdist-only packages: 16148

Number of sdist releases: 864463, Number of bdist releases: 859974
intersection: 732751, sdist-only releases: 131712, bdist-only releases: 127223


In [6]:
from packaging.version import InvalidVersion, Version


def not_prerelease(x):
    """Tell whether ``x`` is a parseable version that is not a pre-release.

    Args:
        x: Version string to check.

    Returns:
        ``True`` when ``Version(x)`` parses and its ``is_prerelease`` flag is
        false. ``False`` when the flag is set, when ``x`` is not a valid
        version string (``InvalidVersion``), or when ``x`` is not a string at
        all (``TypeError``, e.g. a missing value).
    """
    try:
        return not Version(x).is_prerelease
    # Catch only the parse failures; a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and mask unrelated bugs.
    except (InvalidVersion, TypeError):
        return False


# Draw one release per package name from both_releases; the fixed
# random_state makes the draw repeatable.
sampled_release = both_releases.groupby("name").sample(n=1, random_state=1)

# Report the sample size next to how many sampled versions pass
# not_prerelease. The sample itself is left unfiltered.
is_stable_version = sampled_release["version"].apply(not_prerelease)
print(len(sampled_release), len(sampled_release[is_stable_version]))

243529 230818


In [7]:
# Join the sampled (name, version) pairs back onto df so that each sampled
# release carries all of its file rows, then write the result to CSV.
# NOTE(review): to_csv does not create missing directories, so "data/" is
# assumed to exist already; confirm.
sampled_release_info = pd.merge(
    sampled_release, df, how="inner", on=["name", "version"]
)
output_path = "data/sampled_releases.csv"
sampled_release_info.to_csv(output_path, index=False)
print(len(sampled_release_info))

546610
