In [1]:
import polars as pl
from polars import col as c
from datetime import datetime

## Reproducing `DlyPrevDt`
## Sorted dataframe
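Note that the dataframe is already sorted by `['DlyCalDt', 'PERMNO']`, so within each `PERMNO` the rows are in chronological order. The `forward_fill` / `shift` steps below operate in row order within each `PERMNO` and therefore rely on this.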

In [2]:
# Read the CRSP parquet file into memory.
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
crsp_parquet_path = '/home/hzhang/data/wrds/crsp_prevdt.parq'
df_crsp = pl.read_parquet(crsp_parquet_path)

In [3]:
# Sanity check: fail fast if any Float32/Float64 column contains a NaN value.
float_cols = df_crsp.select(pl.col(pl.Float64, pl.Float32))
# One boolean per float column: True when that column has at least one NaN.
nan_flags = float_cols.select(pl.all().is_nan().any()).row(0)
assert not any(nan_flags), "There are NaN values in at least one float column."

### Full logic

`prev_cal_dt` is the date (`DlyCalDt`) of the *last* observation with a *non-null* `DlyPrc`, looking back up to 10 periods:
1. *last*: the date is filled forward and then `shift`ed by one row, so that the current row itself is excluded
2. *non-null*: only dates on which `DlyPrc` is non-null are carried forward (the `when` clause sets the date to null on all other rows)
3. up to 10 periods: the forward fill is limited to 10-1 = 9 periods, because the `shift` accounts for the remaining period


In [None]:
# Maximum look-back, in rows per PERMNO, when searching for the previous priced date.
max_missing_period = 10

# The calendar date on rows that carry a price; null on rows where DlyPrc is null.
priced_dt = pl.when(c('DlyPrc').is_not_null()).then(c('DlyCalDt'))

# Carry the last priced date forward over at most (max_missing_period - 1) null rows,
# then shift by one row so each row sees the value from the row before it.
# Everything is evaluated separately within each PERMNO.
prev_priced_dt = (
    priced_dt
    .forward_fill(limit=max_missing_period - 1)
    .shift()
    .over('PERMNO')
)

# Keep the first row per (DlyCalDt, PERMNO), preserving the existing row order;
# per the original note, the duplicates are multiple dividend rows.
df_dedup = df_crsp.unique(
    subset=['DlyCalDt', 'PERMNO'],
    keep='first',
    maintain_order=True,
)

df = df_dedup.with_columns(prev_cal_dt=prev_priced_dt)

## Compare to CRSP value
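1. Boundary effects: if the ground-truth `DlyPrevDt` is earlier than the dataframe's first `DlyCalDt`, it cannot be reproduced from this data, so it's OK not to match.
2. Only rows where `DlyPrc` is not null are compared.
3. Use `ne_missing` so that nulls are compared as values: null vs. null counts as a match, while null vs. a date counts as a mismatch.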

I'm left with a very small number of mismatching rows, out of 67M rows.

Note that the simple fix of setting `prev_cal_dt` to null when it is earlier than `SecInfoStartDt` results in many more mismatches elsewhere.

In [None]:
# Earliest calendar date present in the frame; CRSP values before it are not compared.
min_date = df['DlyCalDt'].min()

# Ground truth is either null or falls on/after the first date in the frame.
truth_in_range = c('DlyPrevDt').ge(min_date) | c('DlyPrevDt').is_null()
# Only rows that actually carry a price are compared.
has_price = c("DlyPrc").is_not_null()
# Null-aware inequality: null vs. null is treated as equal, null vs. value as different.
disagrees = c('DlyPrevDt').ne_missing(c('prev_cal_dt'))

mismatches = df.filter(truth_in_range & has_price & disagrees)

# tbl_cols=-1 shows every column of the mismatching rows.
with pl.Config(tbl_cols=-1):
    display(mismatches)

In [None]:
# (rows, columns) of the de-duplicated frame, to put the mismatch count in context.
(df.height, df.width)