In [1]:
import pandas as pd
from pymongo import MongoClient

from tqdm import tqdm

col = MongoClient("127.0.0.1", 27017)["dlsc"]["distribution_metadata"]

In [2]:
import re

pattern = re.compile(
    r"(github\.com|bitbucket\.org|gitlab\.com)/[a-zA-Z0-9_\.\-]+/[a-zA-Z0-9_\.\-]+"
)

def parse_metadata(metadata):
    if not metadata:
        return None
    home_page = metadata.get("home_page")
    download_url = metadata.get("download_url")
    project_urls = metadata.get("project_urls", [])
    if home_page:
        match = pattern.search(home_page)
        if match:
            return "https://" + match.group(0)
    if download_url:
        match = pattern.search(download_url)
        if match:
            return "https://" + match.group(0)
    if project_urls:
        for url in project_urls:
            match = pattern.search(url)
            if match:
                return "https://" + match.group(0)
    return None

In [27]:
pkg_repo_urls = {}
cnt = 0

for metadata in tqdm(col.find({}, projection={"_id": 0, "name": 1, "home_page": 1, "download_url": 1, "project_urls": 1})):
    # cnt += 1
    # if cnt == 66161:
    #     # print(metadata)
    #     # break
    name = metadata["name"].lower()
    if not pkg_repo_urls.get(name):
        pkg_repo_urls[name] = parse_metadata(metadata)

5743721it [00:41, 138906.53it/s]


In [28]:
len(pkg_repo_urls), len([v for v in pkg_repo_urls.values() if v is not None])

(354636, 243534)

In [29]:
1 - len([v for v in pkg_repo_urls.values() if v is not None]) / len(pkg_repo_urls)

0.3132846073156702

In [6]:
from packaging.version import Version
from pymongo import MongoClient
import pandas as pd
from random import sample
import wget
import requests

db = MongoClient("127.0.0.1", port=27017)["dlsc"]


def download_dists(sampled_dependents, sc):
    node_col = db[f"{sc}_nodes"]
    edge_col = db[f"{sc}_edges"]

    res = []

    for samp in sampled_dependents:
        print(f"Start {samp}")
        versions = [v["version"] for v in node_col.find({"name": samp})]
        latest_version = sorted(versions, key=lambda x: Version(x))[-1]

        deps = edge_col.find({"name": samp, "version": latest_version}).distinct("dependency")

        response = requests.get(f"https://pypi.org/pypi/{samp}/{latest_version}/json")
        if response.status_code == 200:
            try:
                urls = response.json()["urls"]
                for data in urls:
                    filename = data['filename']
                    url = data['url']
                    packagetype = data['packagetype']
                    if packagetype in ["bdist_wheel", "bdist_egg"]:
                        break
                wget.download(url, f"dists/{sc}/{filename}")
                for d in deps:
                    res.append([samp, latest_version, d, filename])
            except Exception as e:
                print(f"Error: {samp} {latest_version}")
                print(e)
                for d in deps:
                    res.append([samp, latest_version, d, None])
        else:
            print(f"404: {samp} {latest_version}")
            for d in deps:
                res.append([samp, latest_version, d, None])
        print(f"Finish {samp}")
    return res

def sample_packages(sc: str, k: int):
    df = pd.read_csv(f"data/{sc}/{sc}_edges.csv", names=["dependent", "dependency"])
    dependents = list(df["dependent"].unique())
    sampled_dependents = sample(dependents, k)

    res = download_dists(sampled_dependents, sc)
    pd.DataFrame(res, columns=["name", "version", "dependencies", "filepath"]).to_csv(f"data/{sc}/{sc}_sampled.csv", index=False)
    

In [None]:
sample_packages("TensorFlow", 334)

In [None]:
sample_packages("PyTorch", 344)

In [None]:
import zipfile
import requests
import wget
def get_import_name(name: str):
    url = f"https://pypi.org/pypi/{name}/json"
    response = requests.get(url)
    if response.status_code == 200:
        try:
            data = response.json()["urls"]
            for d in data:
                if d["packagetype"] == "bdist_wheel":
                    path = f"dists/{d['filename']}"
                    wget.download(d["url"], path)
                    zipf = zipfile.ZipFile(path, 'r')
                    files = zipf.namelist()
                    for f in files:
                        if f.endswith("top_level.txt"):
                            with zipf.open(f) as f:
                                return f.read().decode("utf-8").split("\n")[:-1]
        except Exception as e:
            print(e)
            return None

In [None]:
tf_df = pd.read_csv("data/TensorFlow/TensorFlow_sampled.csv")
tf_import_names = {'tensorflow': ['tensorflow'],
 'tensorflow-gpu': ['tensorflow'],
 'tensorflow-probability': ['tensorflow_probability'],
 'tensorflow-addons': ['tensorflow_addons'],
 'tensorflow-text': ['tensorflow_text'],
 'streamlit': ['steamlit'],
 'tensorflow-cpu': ['tensorflow'],
}

for name in tf_df["dependencies"].value_counts().index:
    if name not in tf_import_names:
        tf_import_names[name] = get_import_name(name)


In [None]:
tf_import_names['dnnv'] = 'dnnv'

In [None]:
pt_df = pd.read_csv("data/PyTorch/PyTorch_sampled.csv")
pt_import_names = {'torch': ['torch'],
 'torchvision': ['torchvision'],
 'pytorch-lightning': ['pytorch_lightning'],
 'torchtext': ['torchtext'],
 'fastai': ['fastai'],
 'torchaudio': ['torchaudio'],
}

for name in pt_df["dependencies"].value_counts().index:
    if name not in pt_import_names:
        pt_import_names[name] = get_import_name(name)

In [None]:
import zipfile
import tarfile
import ast

def read_dist_file(filepath: str):
    if filepath.endswith(".whl") or filepath.endswith(".egg"):
        zipf = zipfile.ZipFile(filepath)
        files = {name: zipf.open(name).read().decode("utf-8") for name in zipf.namelist() if name.endswith(".py")}
        return files
    elif filepath.endswith(".tar.gz"):
        tarf = tarfile.open(filepath)
        files = {name: tarf.extractfile(name).read().decode("utf-8") for name in tarf.getnames() if name.endswith(".py")}
        return files
    return {}
    
def parse_imports(import_names: list, file: str):
    f_ast = ast.parse(file)
    res = []
    for node in ast.walk(f_ast):
        if isinstance(node, ast.ImportFrom):
            module = node.module
            if not node.module:
                continue
            module = module.split(".")[0]
            if module in import_names:
                res.append(node.lineno)
        elif isinstance(node, ast.Import):
            module_names = [alias.name for alias in node.names]
            if set(module_names).intersection(set(import_names)):
                res.append(node.lineno)
    return tuple(res)

def parse_dist(x, sc: str):
    if sc == "TensorFlow":
        import_names = tf_import_names
    elif sc == "PyTorch":
        import_names = pt_import_names

    dependencies = x["dependencies"]
    res = []
    if x["filepath"]:
        filepath = f"dists/{sc}/{x['filepath']}"
        files = read_dist_file(filepath)
        for file_name, file_content in files.items():
            try:
                linenos = parse_imports(import_names[dependencies], file_content)
                if linenos:
                    res.append((file_name, linenos))
            except Exception as e:
                print(f"Error: {filepath} {file_name}")
                print(e)

    return res

def parse_dists(sc: str):
    df = pd.read_csv(f"data/{sc}/{sc}_sampled.csv")
    for _, row in df.iterrows():
        res = parse_dist(row, sc)
        df.iloc[_, 4] = res

    return df

In [None]:
tf_df = parse_dists("TensorFlow")
tf_df.to_csv(f"data/TensorFlow/TensorFlow_sampled.csv", index=False)
pt_df = parse_dists("PyTorch")
pt_df.to_csv(f"data/PyTorch/PyTorch_sampled.csv", index=False)

In [50]:
tf_df = pd.read_csv("data/TensorFlow/TensorFlow_sampled.csv")
pt_df = pd.read_csv("data/PyTorch/PyTorch_sampled.csv")

print(len(tf_df["name"].unique()), 
      len(tf_df[tf_df["filepath"].notna()]["name"].unique()), 
      len(tf_df[tf_df["filepath"].notna() & (tf_df["imports"].str.len() > 2)]["name"].unique())
)

print(len(pt_df["name"].unique()), 
      len(pt_df[pt_df["filepath"].notna()]["name"].unique()), 
      len(pt_df[pt_df["filepath"].notna() & (pt_df["imports"].str.len() > 2)]["name"].unique())
)

334 327 305
344 338 334
