In [None]:
import polars as pl

In [None]:
# Build a lazy query over the IPUMS Parquet file.
ipums = pl.scan_parquet("data_ipums/ipums_samples.parquet")
# Print the lazy frame's text representation, then its schema.
print(ipums, ipums.collect_schema(), sep="\n")

In [None]:
# Materialize only the first 100 rows for a quick look at the data.
ipums_small = (
    ipums
    .head(n=100)
    .collect()
)
ipums_small

In [None]:
# Build the query step by step; every step stays lazy until collect().
# (The same steps could also be written as one method chain.)

# Step 1: keep only the rows where STATEFIP equals 1.
ipums_alabama = ipums.filter(pl.col("STATEFIP").eq(1))

# Step 2: add the mean of OCCSCORE over the filtered rows as a new column.
ipums_alabama_with_mean_occscore = ipums_alabama.with_columns(
    pl.col("OCCSCORE").mean().alias("mean_occscore")
)

# Step 3: execute the query and display the result.
ipums_alabama_with_mean_occscore.collect()


In [None]:
# Mean OCCSCORE for every (STATEFIP, YEAR) combination.
(
    ipums.group_by(["STATEFIP", "YEAR"], maintain_order=True)
    .agg(pl.col("OCCSCORE").mean().alias("mean_occscore"))
    .collect()
)

In [None]:
# Execute the lazy query with no filter or column selection: this materializes
# the whole scanned file as an in-memory DataFrame.
ipums.collect()

In [None]:
# Select a single string literal as a column named "text"; the resulting frame
# is used to try out string expressions.
with_text = ipums.select(pl.lit("Etienne").alias("text")).collect()
with_text

In [None]:
# Upper-case the text first so that the search for "E" ignores case.
with_text.with_columns(
    pl.col("text").str.to_uppercase().str.contains("E").alias("contains_e")
)

In [None]:
# Columns that depend on each other must be created in separate
# with_columns() calls: `b` is derived from `a`, so it gets its own call.
(
    ipums.group_by(["STATEFIP", "YEAR"], maintain_order=True)
    .agg(pl.col("OCCSCORE").mean().alias("mean_occscore"))
    .with_columns(a=1)
    .with_columns(b=pl.col("a") + 1)
    .collect()
)

In [None]:
# Expressions also work in other contexts such as filter(): keep the rows
# whose OCCSCORE is at least the mean OCCSCORE of ipums_alabama.
ipums_alabama.filter(
    pl.col("OCCSCORE").ge(pl.col("OCCSCORE").mean())
).collect()

In [None]:
# Same comparison, but the mean is computed within each YEAR via over().
ipums_alabama.filter(
    pl.col("OCCSCORE").ge(pl.col("OCCSCORE").mean().over("YEAR"))
).collect()

In [None]:
# Keep the rows at or above their YEAR's mean OCCSCORE, execute the query,
# and write the result to a Parquet file.
(
    ipums_alabama.filter(
        pl.col("OCCSCORE").ge(pl.col("OCCSCORE").mean().over("YEAR"))
    )
    .collect()
    .write_parquet("data_ipums/alabama.parquet")
)

In [None]:
# Create a custom function to standardize numeric variables
def standardize(x: pl.Expr, by: str = "YEAR") -> pl.Expr:
    """Return an expression that standardizes ``x`` within groups.

    Within each group defined by ``by``, the group mean is subtracted from
    ``x`` and the result is divided by the group standard deviation.

    Args:
        x: Polars expression to standardize.
        by: Name of the column whose values define the groups.
            Defaults to "YEAR".

    Returns:
        A Polars expression, to be used in a context such as
        ``with_columns()``.
    """
    # Unlike in R, the result must be returned explicitly with `return`.
    return ((x - x.mean()) / x.std()).over(by)


# Add the standardized OCCSCORE as a new column and run the query.
ipums_alabama.with_columns(
    standardize(pl.col("OCCSCORE")).alias("occscore_stand")
).collect()

In [None]:
# Last resort, when Polars has no built-in for the job and the logic cannot be
# written as a function that returns a Polars expression: run plain Python
# code through map_batches() / map_elements().
# NOTE(review): confirm that map_batches() is applied once per YEAR group
# under .over() in the installed Polars version.

ipums_alabama.with_columns(
    pl.col("OCCSCORE")
    .map_batches(lambda s: (s - s.mean()) / s.std())
    .over("YEAR")
    .alias("occscore_stand")
).collect()