# Motivation

In [None]:
from datetime import datetime

import pandas as pd

# Toy price series: two observations in January 2020 and one in February 2020.
df = pd.DataFrame(
    dict(
        date=[datetime(2020, 1, 1), datetime(2020, 1, 8), datetime(2020, 2, 3)],
        price=[1, 4, 3],
    )
)
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
def monthly_aggregate_pandas(user_df):
    """Return the mean ``price`` per calendar month of a pandas DataFrame.

    Rows are bucketed on the ``date`` column with month-start ("MS")
    resampling; the result keeps only the ``price`` column.
    """
    monthly_bins = user_df.resample("MS", on="date")
    price_per_month = monthly_bins[["price"]]
    return price_per_month.mean()

# Apply the pandas-only aggregate to the sample frame; as a bare expression
# its result is what the notebook cell displays.
monthly_aggregate_pandas(df)

# Dataframe-agnostic data science


## Bad solution: just convert to pandas

In [None]:
import duckdb
import polars as pl
import pyarrow as pa
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession


In [None]:
def _to_pandas(native_df):
    """Convert a supported dataframe object into a pandas DataFrame.

    Raises:
        TypeError: If ``native_df`` is none of the recognised types.
    """
    if isinstance(native_df, pd.DataFrame):
        # Already pandas: hand it back untouched.
        return native_df
    if isinstance(native_df, pl.DataFrame):
        return native_df.to_pandas()
    if isinstance(native_df, duckdb.DuckDBPyRelation):
        return native_df.df()
    if isinstance(native_df, pa.Table):
        return native_df.to_pandas()
    if isinstance(native_df, pyspark.sql.dataframe.DataFrame):
        return native_df.toPandas()
    raise TypeError("Unsupported DataFrame type: cannot convert to pandas")


def monthly_aggregate_bad(user_df):
    """Return the monthly mean ``price`` by first converting the input to pandas.

    Accepts pandas, polars, DuckDB, PyArrow and PySpark inputs; whatever the
    input type, the result is always a pandas DataFrame.

    Raises:
        TypeError: If ``user_df`` is not one of the supported types.
    """
    as_pandas = _to_pandas(user_df)
    return as_pandas.resample("MS", on="date")[["price"]].mean()


In [None]:
# Plain-Python column data, so each dataframe library can build its own frame
# from the same values.
data = dict(
    date=[datetime(2020, 1, 1), datetime(2020, 1, 8), datetime(2020, 2, 3)],
    price=[1, 4, 3],
)

In [None]:
# Run the convert-to-pandas aggregate once per backend. Each frame is bound to
# a module-level name so later cells can reuse it.

# pandas: built directly from the shared column data
pandas_df = pd.DataFrame(data)
monthly_aggregate_bad(pandas_df)

# polars: built directly from the shared column data
polars_df = pl.DataFrame(data)
monthly_aggregate_bad(polars_df)

# duckdb: relation created from the pandas frame
duckdb_df = duckdb.from_df(pandas_df)
monthly_aggregate_bad(duckdb_df)

# pyspark: get (or start) a Spark session, then load the pandas frame into it
spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame(pandas_df)
monthly_aggregate_bad(spark_df)

# pyarrow: table built directly from the shared column data
arrow_table = pa.table(data)
monthly_aggregate_bad(arrow_table)

## Unmaintainable solution: different branches for each library

In [None]:
def monthly_aggregate_unmaintainable(user_df):
    """Return the monthly mean ``price`` using one hand-written branch per library.

    Args:
        user_df: A pandas, polars or PySpark DataFrame with ``date`` and
            ``price`` columns.

    Returns:
        The aggregate in the same library as the input. Note that the
        branches do not agree on the output schema: the PySpark branch names
        its columns ``date_month`` / ``price_mean``, while the polars branch
        keeps ``date`` / ``price`` and the pandas branch is indexed by
        ``date``.

    Raises:
        TypeError: If ``user_df`` is not one of the supported dataframe types.
    """
    if isinstance(user_df, pd.DataFrame):
        result = user_df.resample("MS", on="date")[["price"]].mean()
    elif isinstance(user_df, pl.DataFrame):
        result = (
            user_df.group_by(pl.col("date").dt.truncate("1mo"))
            .agg(pl.col("price").mean())
            .sort("date")
        )
    elif isinstance(user_df, pyspark.sql.dataframe.DataFrame):
        result = (
            user_df.withColumn("date_month", F.date_trunc("month", F.col("date")))
            .groupBy("date_month")
            .agg(F.mean("price").alias("price_mean"))
            .orderBy("date_month")
        )
    # TODO: more branches for DuckDB, PyArrow, Dask, etc... :sob:
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # ``result`` when no branch matched.
        raise TypeError(
            f"Unsupported DataFrame type: {type(user_df).__name__} has no branch"
        )
    return result


In [None]:
# Exercise each hand-written branch with the frames created earlier.

# pandas
monthly_aggregate_unmaintainable(pandas_df)

# polars
monthly_aggregate_unmaintainable(polars_df)

# pyspark
monthly_aggregate_unmaintainable(spark_df)

## Best solution: Narwhals as a unified dataframe interface

In [None]:
import narwhals as nw
from narwhals.typing import IntoFrameT


def monthly_aggregate(user_df: IntoFrameT) -> IntoFrameT:
    """Return the monthly mean ``price`` for any Narwhals-supported dataframe.

    The input is wrapped with ``nw.from_native``, grouped by ``date``
    truncated to one-month intervals, averaged over ``price``, sorted by
    ``date``, and handed back through ``to_native()``.
    """
    frame = nw.from_native(user_df)
    month_bucket = nw.col("date").dt.truncate("1mo")
    mean_price = nw.col("price").mean()
    monthly = frame.group_by(month_bucket).agg(mean_price)
    return monthly.sort("date").to_native()


In [None]:
# Call the single Narwhals-based implementation on every frame created earlier.

# pandas
monthly_aggregate(pandas_df)

# polars
monthly_aggregate(polars_df)

# duckdb
monthly_aggregate(duckdb_df)

# pyarrow
monthly_aggregate(arrow_table)

# pyspark
monthly_aggregate(spark_df)

## Bonus - can we generate SQL?

In [None]:
from sqlframe.duckdb import DuckDBSession

# Create a SQLFrame session (DuckDB flavour) and load the pandas frame into it.
sqlframe = DuckDBSession()
sqlframe_df = sqlframe.createDataFrame(pandas_df)
# Run the same Narwhals-based aggregate, then print the SQL for the result,
# requesting the "databricks" dialect.
sqlframe_result = monthly_aggregate(sqlframe_df)
print(sqlframe_result.sql(dialect="databricks"))