In [1]:
from pathlib import Path

import polars as pl

# Absolute location of the data directory, derived from the relative
# "../data/" path.
data_dir = Path("../data/").expanduser().resolve().absolute()

# On-disk locations of the two dataset versions being compared.
old_data_path = data_dir / "per-day-pkg-releases-metrics-one-year-old/"
new_data_path = data_dir / "per-day-pkg-releases-metrics-one-year.parquet"

# Build lazy scans over both datasets; `old_data` / `new_data` are the
# handles every later step filters and aggregates.
old_data = pl.scan_parquet(old_data_path)
new_data = pl.scan_parquet(new_data_path)

# Apply one shared predicate to both datasets so they are compared on the
# same footing. A row is kept only when all of the following hold:
#   - normalized_package_manager is one of pypi, npm, or rubygems
#   - release_type is not "unknown"
#   - num_dependents_on_release_date is greater than 1
allowed_package_managers = ["pypi", "npm", "rubygems"]
keep_row = (
    (pl.col("normalized_package_manager").is_in(allowed_package_managers))
    & (pl.col("release_type") != "unknown")
    & (pl.col("num_dependents_on_release_date") > 1)
)
old_data = old_data.filter(keep_row)
new_data = new_data.filter(keep_row)

def _count_unique_packages(frame: pl.LazyFrame) -> int:
    """Return the number of distinct ``package_uuid`` values in ``frame``."""
    distinct_packages = frame.select("package_uuid").unique().collect()
    return len(distinct_packages)


# Number of unique packages in each dataset, and how far apart they are
num_unique_packages_old = _count_unique_packages(old_data)
print(f"Number of unique packages OLD: {num_unique_packages_old}")
num_unique_packages_new = _count_unique_packages(new_data)
print(f"Number of unique packages NEW: {num_unique_packages_new}")
print(f"Difference: {num_unique_packages_new - num_unique_packages_old}")
print()

def _count_unique_release_pairs(frame: pl.LazyFrame) -> int:
    """Return the number of distinct (package_uuid, package_version) rows."""
    pair_columns = ["package_uuid", "package_version"]
    distinct_pairs = (
        frame.select("package_uuid", "package_version")
        .unique(pair_columns)
        .collect()
    )
    return len(distinct_pairs)


# Number of unique (package, release) pairs in each dataset, and the gap
num_unique_package_release_pairs_old = _count_unique_release_pairs(old_data)
print(f"Number of unique package release pairs OLD: {num_unique_package_release_pairs_old}")
num_unique_package_release_pairs_new = _count_unique_release_pairs(new_data)
print(f"Number of unique package release pairs NEW: {num_unique_package_release_pairs_new}")
print(
    f"Difference: {num_unique_package_release_pairs_new - num_unique_package_release_pairs_old}"
)
print()

# Time range comparison: min / median / max of publish_datetime per dataset.
# The aggregation expressions are built once and reused for both frames.
publish_datetime = pl.col("publish_datetime")
publish_datetime_summary = [
    publish_datetime.min().alias("min"),
    publish_datetime.median().alias("median"),
    publish_datetime.max().alias("max"),
]
old_time_range = (
    old_data.select(publish_datetime_summary)
    .collect()
    .to_dict(as_series=False)
)
new_time_range = (
    new_data.select(publish_datetime_summary)
    .collect()
    .to_dict(as_series=False)
)
print("Time range comparison:")
print("OLD:", old_time_range)
print("NEW:", new_time_range)
print()

def _repo_created_summary(created: pl.Expr) -> list[pl.Expr]:
    """Return min / median / max aggregations of the given date expression."""
    return [
        created.min().alias("min"),
        created.median().alias("median"),
        created.max().alias("max"),
    ]


# Repo creation date comparison.
# The old dataset's column is parsed with `.str.to_date()` before aggregating,
# while the new dataset's column is aggregated as-is.
# NOTE(review): presumably the old file stores repo_created_date as strings
# and the new file stores real dates -- confirm against the schemas.
old_repo_creation_date = (
    old_data.select(
        _repo_created_summary(pl.col("repo_created_date").str.to_date())
    )
    .collect()
    .to_dict(as_series=False)
)
new_repo_creation_date = (
    new_data.select(_repo_created_summary(pl.col("repo_created_date")))
    .collect()
    .to_dict(as_series=False)
)
print("Repo creation date comparison:")
print("OLD:", old_repo_creation_date)
print("NEW:", new_repo_creation_date)
print()

# num_dependents_on_release_date: distribution summary (min, quartiles, 90th
# percentile, max) computed with the same expressions for both datasets.
num_dependents = pl.col("num_dependents_on_release_date")
num_dependents_summary = [
    num_dependents.min().alias("min"),
    num_dependents.quantile(0.25).alias("q25"),
    num_dependents.median().alias("median"),
    num_dependents.quantile(0.75).alias("q75"),
    num_dependents.quantile(0.9).alias("q90"),
    num_dependents.max().alias("max"),
]
old_num_dependents_on_release_date = (
    old_data.select(num_dependents_summary)
    .collect()
    .to_dict(as_series=False)
)
new_num_dependents_on_release_date = (
    new_data.select(num_dependents_summary)
    .collect()
    .to_dict(as_series=False)
)
print("Num dependents on release date comparison:")
print("OLD:", old_num_dependents_on_release_date)
print("NEW:", new_num_dependents_on_release_date)
print()

Number of unique packages OLD: 28044
Number of unique packages NEW: 49965
Difference: 21921

Number of unique package release pairs OLD: 202014
Number of unique package release pairs NEW: 521913
Difference: 319899

Time range comparison:
OLD: {'min': [datetime.datetime(2020, 2, 7, 0, 10, 43)], 'median': [datetime.datetime(2021, 12, 7, 8, 0)], 'max': [datetime.datetime(2023, 8, 1, 23, 58, 3)]}
NEW: {'min': [datetime.datetime(2020, 2, 7, 0, 0)], 'median': [datetime.datetime(2021, 12, 10, 5, 9, 53)], 'max': [datetime.datetime(2023, 8, 1, 23, 58, 16)]}

Repo creation date comparison:
OLD: {'min': [datetime.date(2008, 1, 19)], 'median': [datetime.datetime(2018, 3, 2, 0, 0)], 'max': [datetime.date(2023, 7, 24)]}
NEW: {'min': [datetime.date(2008, 1, 19)], 'median': [datetime.datetime(2018, 8, 30, 0, 0)], 'max': [datetime.date(2023, 7, 24)]}

Num dependents on release date comparison:
OLD: {'min': [2.0], 'q25': [3.0], 'median': [10.0], 'q75': [59.0], 'q90': [570.0], 'max': [1480016.0]}
NEW: {'