In [None]:
import polars as pl

from rs_graph.db import constants as db_constants

###############################################################################

def _read_table(table: str) -> pl.DataFrame: 
    return pl.read_database_uri(
        f"SELECT * FROM {table}",
        f"sqlite:///{db_constants.DEV_DATABASE_FILEPATH}",
    )


def load_pairs() -> pl.DataFrame:
    # Read all the tables we need
    docs = _read_table("document")
    repos = _read_table("repository")
    pairs = _read_table("document_repository_link")

    # Drop to unique doc and unique repo in pairs
    pairs = pairs.unique(
        subset="document_id",
        keep="none",
    ).unique(
        subset="repository_id",
        keep="none",
    )

    # Join the tables to get the positive examples
    df = (
        pairs.select(
            "document_id",
            "repository_id",
        )
        .join(
            docs.select(
                pl.col("id").alias("document_id"),
                "publication_date",
            ),
            on="document_id",
            how="left",
        )
        .join(
            repos.select(
                pl.col("id").alias("repository_id"),
                "creation_datetime",
                "last_pushed_datetime",
            ),
            on="repository_id",
            how="left",
        )
    )

    return df

df = load_pairs()
df

Reading table: document from /Users/evamaxfield/active/rs-graph/rs_graph/data/files/rs-graph-v2-dev.db
Reading table: repository from /Users/evamaxfield/active/rs-graph/rs_graph/data/files/rs-graph-v2-dev.db
Reading table: document_repository_link from /Users/evamaxfield/active/rs-graph/rs_graph/data/files/rs-graph-v2-dev.db


document_id,repository_id,publication_date,creation_datetime,last_pushed_datetime
i64,i64,date,datetime[ns],datetime[ns]
1,1,2015-12-01,2014-05-05 21:14:57,2018-04-29 16:11:15
2,2,2022-10-01,2021-11-09 04:35:52,2021-11-23 04:14:31
3,3,2017-12-08,2017-07-10 14:22:18,2018-07-11 12:15:35
4,4,2020-05-18,2019-04-01 16:16:34,2021-07-12 10:28:10
5,5,2017-09-29,2016-12-13 22:27:50,2024-03-25 19:35:00
…,…,…,…,…
132706,126266,2021-01-01,2021-07-28 12:28:50,2021-09-14 20:54:07
132707,126267,2021-01-01,2021-10-13 03:48:38,2022-01-18 02:53:21
132708,126268,2021-01-01,2019-09-23 22:16:31,2025-06-02 02:50:30
132709,126269,2021-01-01,2021-03-06 07:54:01,2022-03-31 18:48:06


In [21]:
# Create columns for difference between publication date and creation date
# and publication date and last pushed date
df = df.with_columns(
    (
        pl.col("publication_date") - pl.col("creation_datetime")
    ).dt.total_days().abs().alias("publication_date_creation_date_diff"),
    (
        pl.col("publication_date") - pl.col("last_pushed_datetime")
    ).dt.total_days().abs().alias("publication_date_last_pushed_date_diff"),
)

# Also create a "years" column for both of these
df = df.with_columns(
    (
        pl.col("publication_date_creation_date_diff") / 365.25
    ).alias("publication_date_creation_date_diff_years"),
    (
        pl.col("publication_date_last_pushed_date_diff") / 365.25
    ).alias("publication_date_last_pushed_date_diff_years"),
)

df["publication_date_creation_date_diff"].describe(
    percentiles=[0.5, 0.75, 0.9, 0.95, 0.99],
).filter(
    pl.col("statistic").is_in(["mean", "std", "50%", "75%", "90%", "95%", "99%", "max"])
)

statistic,value
str,f64
"""mean""",250.566891
"""std""",346.853141
"""50%""",163.0
"""75%""",300.0
"""90%""",525.0
"""95%""",803.0
"""99%""",1811.0
"""max""",8280.0


In [22]:
df["publication_date_last_pushed_date_diff"].describe(
    percentiles=[0.5, 0.75, 0.9, 0.95, 0.99],
).filter(
    pl.col("statistic").is_in(["mean", "std", "50%", "75%", "90%", "95%", "99%", "max"])
)

statistic,value
str,f64
"""mean""",495.407902
"""std""",515.339448
"""50%""",335.0
"""75%""",676.0
"""90%""",1156.0
"""95%""",1546.0
"""99%""",2397.0
"""max""",8893.0


In [23]:
# Same for years
df["publication_date_creation_date_diff_years"].describe(
    percentiles=[0.5, 0.75, 0.9, 0.95, 0.99],
).filter(
    pl.col("statistic").is_in(["mean", "std", "50%", "75%", "90%", "95%", "99%", "max"])
)

statistic,value
str,f64
"""mean""",0.686015
"""std""",0.949632
"""50%""",0.44627
"""75%""",0.821355
"""90%""",1.437372
"""95%""",2.198494
"""99%""",4.958248
"""max""",22.669405


In [24]:
df["publication_date_last_pushed_date_diff_years"].describe(
    percentiles=[0.5, 0.75, 0.9, 0.95, 0.99],
).filter(
    pl.col("statistic").is_in(["mean", "std", "50%", "75%", "90%", "95%", "99%", "max"])
)

statistic,value
str,f64
"""mean""",1.356353
"""std""",1.410923
"""50%""",0.91718
"""75%""",1.850787
"""90%""",3.164956
"""95%""",4.232717
"""99%""",6.562628
"""max""",24.347707
