In [2]:
import pandas as pd

# Load the manually labelled random sample. The "TensorFlow" and "PyTorch"
# columns are used as boolean masks below.
rand_sample = pd.read_excel("data/random_sample.xlsx")
print(
    f'{len(rand_sample[rand_sample["TensorFlow"]])} randomly sampled packages in TensorFlow SC'
)
# The PyTorch count must be reported under the PyTorch label.
print(
    f'{len(rand_sample[rand_sample["PyTorch"]])} randomly sampled packages in PyTorch SC'
)

334 randomly sampled packages in TensorFlow SC
344 randomly sampled packages in TensorFlow SC


In [3]:
# Report how many sampled packages were labelled "Unclear", then drop them.
# `rand_sample` is rebound on purpose: the following cells read the filtered frame.
print(
    f'{len(rand_sample[rand_sample["Category"] == "Unclear"])} packages are labeled as Unclear'
)
rand_sample = rand_sample[rand_sample["Category"] != "Unclear"]
print(
    f'{len(rand_sample[rand_sample["TensorFlow"]])} packages (exclude Unclear package) in TensorFlow SC'
)
# The PyTorch count must be reported under the PyTorch label.
print(
    f'{len(rand_sample[rand_sample["PyTorch"]])} packages (exclude Unclear package) in PyTorch SC'
)

71 packages are labeled as Unclear
298 packages (exclude Unclear package) in TensorFlow SC
309 packages (exclude Unclear package) in TensorFlow SC


In [4]:
# For each framework, report the share of its sampled packages that fall into
# selected categories. The denominator is the number of sampled packages
# flagged for that framework.
for f in ("TensorFlow", "PyTorch"):
    framework_rows = rand_sample[rand_sample[f]]
    framework_total = len(framework_rows)
    categories = framework_rows["Category"]

    # int(...) keeps plain-Python division semantics (ZeroDivisionError on an
    # empty framework subset) instead of numpy's inf/nan.
    ais_ratio = (
        int(categories.isin(["Applications", "Infrastructure", "Sciences"]).sum())
        / framework_total
    )
    a_ratio = int((categories == "Applications").sum()) / framework_total
    i_ratio = int((categories == "Infrastructure").sum()) / framework_total
    print(
        f"{f}: {ais_ratio:.2%} Applications, Infrastructure, or Sciences packages, {a_ratio:.2%} Applications packages, {i_ratio:.2%} Infrastructure packages"
    )

TensorFlow: 89.60% Applications, Infrastructure, or Sciences packages, 42.62% Applications packages, 28.19% Infrastructure packages
PyTorch: 87.38% Applications, Infrastructure, or Sciences packages, 52.10% Applications packages, 20.39% Infrastructure packages


In [5]:
# Load the checked sample and show (total rows, rows whose "Match" is 0).
# NOTE(review): "Match" == 0 presumably marks a mismatch — confirm against the spreadsheet.
missing_dependencies = pd.read_excel("data/missing_dependencies.xlsx")
unmatched_rows = missing_dependencies["Match"] == 0
len(missing_dependencies), int(unmatched_rows.sum())

(100, 6)

In [6]:
# List the "Package" value of every row whose "Match" is 0, one per line.
unmatched_packages = missing_dependencies.loc[
    missing_dependencies["Match"] == 0, "Package"
]
for name in unmatched_packages:
    print(name)

yqn-pytorch-framework-0.3.0
soltrannet-1.0.0
rxnfp-0.1.0
flexneuart-1.1
falconcv-0.1.2
metroem-0.0.7


In [7]:
# Look up the download counts of the six packages printed by the previous cell
# (names here carry no version suffix).
downloads = pd.read_csv("data/downloads.csv")
ps = [
    "yqn-pytorch-framework",
    "soltrannet",
    "rxnfp",
    "flexneuart",
    "falconcv",
    "metroem",
]
is_selected_project = downloads["project"].isin(ps)
downloads.loc[is_selected_project]

Unnamed: 0,project,num_downloads
58704,yqn-pytorch-framework,894
75279,metroem,647
77741,rxnfp,620
153772,flexneuart,244
181735,falconcv,185
250242,soltrannet,95


In [8]:
# The edge CSVs have no header row; the two columns are named "down" and "up".
# NOTE(review): "up" is presumably the depended-upon (upstream) package — confirm.
edge_columns = ["down", "up"]
tf_edges = pd.read_csv(
    "data/TensorFlow/TensorFlow_edges.csv", header=None, names=edge_columns
)
pt_edges = pd.read_csv(
    "data/PyTorch/PyTorch_edges.csv", header=None, names=edge_columns
)

# For each package in `ps`, count the edge rows whose "up" column equals it,
# in the TensorFlow and PyTorch edge lists respectively.
for name in ps:
    tf_edge_count = int((tf_edges["up"] == name).sum())
    pt_edge_count = int((pt_edges["up"] == name).sum())
    print(name, tf_edge_count, pt_edge_count)

yqn-pytorch-framework 0 0
soltrannet 0 0
rxnfp 0 0
flexneuart 0 0
falconcv 0 0
metroem 0 0


In [9]:
import pandas as pd
from pymongo import MongoClient

# Connect to the local "dlsc" database and load both node collections in full,
# dropping Mongo's "_id" field and parsing "upload_time" into datetimes.
db = MongoClient("127.0.0.1", 27017)["dlsc"]
without_id = {"_id": 0}
tf_nodes = pd.DataFrame(db["TensorFlow_nodes"].find({}, projection=without_id)).assign(
    upload_time=lambda nodes: pd.to_datetime(nodes["upload_time"])
)
pt_nodes = pd.DataFrame(db["PyTorch_nodes"].find({}, projection=without_id)).assign(
    upload_time=lambda nodes: pd.to_datetime(nodes["upload_time"])
)

In [10]:
# Load the two package lists reported as "ICSE 2022" below; the first CSV
# column becomes the index and "start_time" is parsed into datetimes.
icse22_tf_nodes = pd.read_csv("data/tensorflow_package_time.csv", index_col=0)
icse22_pt_nodes = pd.read_csv("data/torch_package_time.csv", index_col=0)
for icse22_nodes in (icse22_tf_nodes, icse22_pt_nodes):
    icse22_nodes["start_time"] = pd.to_datetime(icse22_nodes["start_time"])
print(
    f"ICSE 2022: {len(icse22_tf_nodes)} packages in TensorFlow SC, {len(icse22_pt_nodes)} packages in PyTorch SC"
)

ICSE 2022: 1022 packages in TensorFlow SC, 697 packages in PyTorch SC


In [11]:
import re

# Build a lookup from a repository key to a PyPI package name.
# Despite its name, `pkg2repos` is keyed by repository and valued by package:
#   github.com/<owner>/<repo>  ->  "<owner>_<repo>"
#   <host>/<owner>/<repo>      ->  "<host>_<owner>_<repo>"
# URLs that do not split into exactly three path chunks are skipped. When
# several packages share a repository key, the last one read wins, so the
# printed figure is the number of distinct keys.
pkg2repos = {}
https_projects = db["projects"].find(
    {"Platform": "Pypi", "Repository URL": {"$regex": re.compile(r"https:\/\/")}},
    projection={"_id": 0, "Name": 1, "Repository URL": 1},
)
for doc in https_projects:
    # Everything after the first "//" is treated as host/owner/repo.
    parts = doc["Repository URL"].split("//")[1].split("/")
    if len(parts) != 3:
        continue
    host, owner, project = parts
    key_parts = [owner, project] if host == "github.com" else [host, owner, project]
    pkg2repos["_".join(key_parts)] = doc["Name"]
print(f"{len(pkg2repos)} packages with repositories in Libraries.io")

145910 packages with repositories in Libraries.io


In [12]:
# Translate each ICSE 22 "label" into a lower-cased PyPI package name via
# `pkg2repos`. Labels with no entry get the placeholder "" in the "name" column.
icse22_tf_nodes["name"] = icse22_tf_nodes["label"].apply(
    lambda x: pkg2repos.get(x, "").lower()
)
icse22_pt_nodes["name"] = icse22_pt_nodes["label"].apply(
    lambda x: pkg2repos.get(x, "").lower()
)
# Union of both supply chains. The "" placeholder for unmatched labels is not
# a package, so it is removed before counting and before any set comparison.
icse22_all_nodes = set(icse22_pt_nodes["name"]).union(set(icse22_tf_nodes["name"]))
icse22_all_nodes.discard("")
print(f"{len(icse22_all_nodes)} packages in total in ICSE 22")

1417 packages in total in ICSE 22


In [13]:
# Lower-cased names of packages with at least one node uploaded up to the
# cutoff, per framework and combined, then the overlap with the ICSE 22 set.
# NOTE(review): comparing datetimes with "2019-11-10" presumably means
# midnight at the start of that day — confirm the intended inclusiveness.
comparison_cutoff = "2019-11-10"
tf_nodes_comp = set(
    tf_nodes.loc[tf_nodes["upload_time"] <= comparison_cutoff, "name"].str.lower()
)
pt_nodes_comp = set(
    pt_nodes.loc[pt_nodes["upload_time"] <= comparison_cutoff, "name"].str.lower()
)
all_nodes_comp = tf_nodes_comp | pt_nodes_comp
print(
    f"Before 2019-11-10, {len(tf_nodes_comp)} packages in our TensorFlow SC, {len(pt_nodes_comp)} packages in our PyTorch SC, {len(all_nodes_comp)} packages in total"
)
print(
    f"{len(all_nodes_comp.intersection(icse22_all_nodes))} shared packages between ours and ICSE 22"
)

Before 2019-11-10, 706 packages in our TensorFlow SC, 418 packages in our PyTorch SC, 1087 packages in total
315 shared packages between ours and ICSE 22


In [14]:
# Load name and upload time of every record in "distribution_metadata".
all_pypi_pkgs = pd.DataFrame(
    db["distribution_metadata"].find(
        {}, projection={"_id": 0, "name": 1, "upload_time": 1}
    )
)
print(len(all_pypi_pkgs), "distributions")
# NOTE(review): "upload_time" is compared without pd.to_datetime here, unlike
# the node frames — confirm the stored type makes this comparison valid.
all_pypi_pkgs_comp = set(
    all_pypi_pkgs[all_pypi_pkgs["upload_time"] <= "2019-11-10"]["name"].str.lower()
)
# This is a set of lower-cased names, i.e. distinct packages rather than
# individual distributions, so it is labelled as such.
print(
    len(all_pypi_pkgs_comp),
    "distinct packages with distributions before 2019-11-10",
)

5743721 distributions
180925 distributions before 2019-11-10


In [15]:
# Cut-off used to select the "above average monthly downloads" packages.
# NOTE(review): 2218 is presumably the mean monthly download count — confirm its source.
download_threshold = 2218

# `all_nodes_comp` holds lower-cased names, so the download table's "project"
# column is lower-cased before joining (the later cell that reloads this CSV
# does the same). A case-sensitive join would otherwise give any mixed-case
# project a spurious download count of 0.
downloads_2019_11_10 = pd.read_csv("data/downloads_2019_11_10.csv").assign(
    project=lambda frame: frame["project"].str.lower()
)
download_2019 = (
    pd.DataFrame({"project": list(all_nodes_comp)})
    .merge(downloads_2019_11_10, how="left", on="project")
    .fillna(0)  # packages absent from the download table count as 0 downloads
    .rename(columns={"project": "name"})
)
most_download_2019 = download_2019[
    download_2019["num_downloads"] >= download_threshold
]
print(len(most_download_2019), "packages with above average monthly downloads")

# Attach the existing labels from the spreadsheet. The merge key is the set of
# shared columns, which on the left side is just "name".
most_download = pd.read_excel("data/most_downloaded_packages.xlsx")
most_download_2019 = most_download_2019[["name"]].merge(most_download, how="left")

# Flag supply-chain membership as of the comparison date.
most_download_2019["TensorFlow"] = most_download_2019["name"].isin(tf_nodes_comp)
most_download_2019["PyTorch"] = most_download_2019["name"].isin(pt_nodes_comp)

export_columns = [
    "name",
    "PyTorch",
    "TensorFlow",
    "pypi_page",
    "repository",
    "description",
    "final code",
]
most_download_2019[export_columns].to_csv(
    "data/most_downloaded_packages_2019.csv", index=False
)

233 packages with above average monthly downloads


In [16]:
# Load the labelled spreadsheet and split "final code" ("<Category>-><Domain>")
# into two columns, then drop rows categorised as "Unclear".
# NOTE(review): this reads the .xlsx, not the .csv written by the previous
# cell — presumably the CSV was labelled by hand and saved as .xlsx; confirm.
most_download_2019 = pd.read_excel("data/most_downloaded_packages_2019.xlsx")
code_parts = most_download_2019["final code"].str.split("->", expand=True)
most_download_2019[["Category", "Domain"]] = code_parts
most_download_2019 = most_download_2019[most_download_2019["Category"] != "Unclear"]

# Categories counted in the second (unlabelled) figure printed per framework.
# The spelling "Probablistics Methods" matches the labels in the data.
counted_categories = (
    "Applications",
    "Sciences",
    "Probablistics Methods",
    "Reinforcement Learning",
)
in_counted_categories = most_download_2019["Category"].isin(counted_categories)

# Per framework: total labelled packages, then how many fall in the categories above.
for framework in ("TensorFlow", "PyTorch"):
    in_framework = most_download_2019[framework]
    print(len(most_download_2019[in_framework]), framework, "packages")
    print(len(most_download_2019[in_framework & in_counted_categories]))

158 TensorFlow packages
81
68 PyTorch packages
42


In [17]:
# Category distribution of the TensorFlow-flagged packages.
most_download_2019.loc[most_download_2019["TensorFlow"], "Category"].value_counts()

Infrastructure            63
Applications              55
Sciences                  19
Miscellaneous Tools        8
Social Aspects             5
Reinforcement Learning     5
Probablistics Methods      2
Education                  1
Name: Category, dtype: int64

In [18]:
# Five most frequent domains per framework, TensorFlow first, then PyTorch.
for framework in ("TensorFlow", "PyTorch"):
    framework_domains = most_download_2019.loc[most_download_2019[framework], "Domain"]
    print(framework_domains.value_counts().head(5))

NLP                    30
Framework              19
CV                     16
Monitoring              8
Miscellaneous Tools     8
Name: Domain, dtype: int64
NLP                    18
CV                      9
Framework               6
Monitoring              5
Miscellaneous Tools     5
Name: Domain, dtype: int64


In [19]:
# Reload edges and nodes from MongoDB. This rebinds tf_edges / pt_edges
# (earlier read from CSV with "down"/"up" columns) and tf_nodes / pt_nodes,
# now restricted to the fields listed below.
edge_fields = {
    "_id": 0,
    "name": 1,
    "version": 1,
    "dependency": 1,
    "dependency_version": 1,
}
node_fields = {"_id": 0, "name": 1, "version": 1, "upload_time": 1}

tf_edges = pd.DataFrame(db["TensorFlow_edges"].find({}, projection=edge_fields))
pt_edges = pd.DataFrame(db["PyTorch_edges"].find({}, projection=edge_fields))
tf_nodes = pd.DataFrame(db["TensorFlow_nodes"].find({}, projection=node_fields))
pt_nodes = pd.DataFrame(db["PyTorch_nodes"].find({}, projection=node_fields))

tf_nodes["upload_time"] = pd.to_datetime(tf_nodes["upload_time"])
pt_nodes["upload_time"] = pd.to_datetime(pt_nodes["upload_time"])

# Node rows uploaded up to the comparison date, plus the lower-cased package
# names they cover.
tf_nodes_2019 = tf_nodes[tf_nodes["upload_time"] <= "2019-11-10"]
pt_nodes_2019 = pt_nodes[pt_nodes["upload_time"] <= "2019-11-10"]
tf_pkgs_2019 = set(tf_nodes_2019["name"].str.lower())
pt_pkgs_2019 = set(pt_nodes_2019["name"].str.lower())

# Download table with lower-cased project names, so it joins against the
# lower-cased package sets above.
download_2019 = pd.read_csv("data/downloads_2019_11_10.csv")
download_2019["project"] = download_2019["project"].str.lower()

In [20]:
def _attach_downloads(package_names):
    """Left-join `package_names` onto `download_2019`.

    Packages missing from the download table get 0 in every joined column;
    the "project" key column is renamed to "name" in the result.
    """
    packages = pd.DataFrame({"project": list(package_names)})
    joined = packages.merge(download_2019, how="left").fillna(0)
    return joined.rename(columns={"project": "name"})


tf_nodes_download_2019 = _attach_downloads(tf_pkgs_2019)
pt_nodes_download_2019 = _attach_downloads(pt_pkgs_2019)

In [21]:
# Split our pre-cutoff packages into those also present in the ICSE 22 set
# and those found only in ours; show both sizes.
our_packages_2019 = tf_pkgs_2019 | pt_pkgs_2019
shared_packages = our_packages_2019 & icse22_all_nodes
our_unique_packages = our_packages_2019 - icse22_all_nodes
len(shared_packages), len(our_unique_packages)

(315, 772)

In [22]:
# Download counts for the packages only we cover, highest first; packages
# missing from the download table count as 0.
our_unique_frame = pd.DataFrame({"project": list(our_unique_packages)})
our_unique_download = (
    our_unique_frame.merge(download_2019, how="left")
    .fillna(0)
    .sort_values("num_downloads", ascending=False)
)
# How many of them reach the 2218-download threshold used earlier.
int((our_unique_download["num_downloads"] >= 2218).sum())

185

In [23]:
def _edges_between_2019_releases(edges, nodes_2019):
    """Keep only edges whose dependent release and dependency release both
    appear in `nodes_2019` (matched on name and version)."""
    releases = nodes_2019[["name", "version"]]
    dependency_releases = releases.rename(
        columns={"name": "dependency", "version": "dependency_version"}
    )
    return edges.merge(releases).merge(dependency_releases)


dependents_info = pd.concat(
    (
        _edges_between_2019_releases(tf_edges, tf_nodes_2019),
        _edges_between_2019_releases(pt_edges, pt_nodes_2019),
    )
)
print(dependents_info["dependency"].nunique(), "packages have dependents")

# Packages only we cover, minus the framework packages themselves.
not_covered_packages = our_unique_packages - {
    "tensorflow",
    "tensorflow-cpu",
    "tensorflow-gpu",
    "torch",
}
# `our_unique_packages` holds lower-cased names while "dependency" keeps the
# stored casing, so lower-case before the membership test. An exact-match
# isin would silently miss any mixed-case dependency name.
is_not_covered = dependents_info["dependency"].str.lower().isin(not_covered_packages)
print(
    dependents_info.loc[is_not_covered, "dependency"].nunique(),
    "not-covered packages have dependents",
)

115 packages have dependents
85 not-covered packages have dependents


In [24]:
# Not-covered packages (excluding the frameworks themselves) ranked by their
# number of distinct dependent packages; shows the top five.
not_covered_non_framework = our_unique_packages - {
    "tensorflow",
    "tensorflow-cpu",
    "tensorflow-gpu",
    "torch",
}
# `our_unique_packages` is lower-cased while "dependency" keeps the stored
# casing, so lower-case before the membership test to avoid missing
# mixed-case names.
is_not_covered_dependency = (
    dependents_info["dependency"].str.lower().isin(not_covered_non_framework)
)
(
    dependents_info[is_not_covered_dependency]
    .groupby("dependency")["name"]
    .nunique()
    .sort_values(ascending=False)
    .head()
)

dependency
torchvision                130
torchtext                   21
pytorch-transformers        13
tensorflow-probability      12
pytorch-pretrained-bert     11
Name: name, dtype: int64

In [52]:
# Packages present in the ICSE 22 set but absent from our pre-cutoff supply chains.
icse_unique_packages = list(icse22_all_nodes - (tf_pkgs_2019.union(pt_pkgs_2019)))

# NOTE(review): this loads the whole "distribution_metadata" collection into
# memory — presumably large; consider filtering in the query if it becomes a problem.
dist_meta = pd.DataFrame(
    db["distribution_metadata"].find(
        {},
        projection={
            "_id": 0,
            "name": 1,
            "version": 1,
            "requires_dist": 1,
            "upload_time": 1,
            "packagetype": 1,
        },
    )
)

# The ICSE 22 names are lower-cased, so compare against lower-cased
# distribution names. An exact-match isin on the raw "name" would drop every
# distribution whose stored name contains upper-case characters.
is_icse_unique = dist_meta["name"].str.lower().isin(icse_unique_packages)
icse_unique_pkg_meta = dist_meta[is_icse_unique].copy()
icse_unique_pkg_meta["upload_time"] = pd.to_datetime(
    icse_unique_pkg_meta["upload_time"]
)
icse_unique_pkg_meta = icse_unique_pkg_meta[
    icse_unique_pkg_meta["upload_time"] <= "2019-11-10"
]

# Latest pre-cutoff distribution per name among those declaring at least one
# requirement (rows with missing "requires_dist" fail the length check).
icse_unique_latest = (
    icse_unique_pkg_meta[icse_unique_pkg_meta["requires_dist"].str.len() > 0]
    .sort_values("upload_time", ascending=False)
    .drop_duplicates("name")
)

# Fixed-seed sample for manual inspection. The size is capped by the number of
# available rows because DataFrame.sample raises when asked for more rows
# than exist (without replacement).
sample_size = min(100, len(icse_unique_latest))
icse_unique_latest.sample(sample_size, random_state=42)[
    ["name", "version", "requires_dist"]
].to_csv("data/unneeded_dependency.csv", index=False)