In [1]:
import polars as pl
from polars import col as c
from datetime import datetime

## Reproducing `DlyPrevDt`
## Sorted dataframe
Note that the dataframe is already sorted by `['DlyCalDt', 'PERMNO']`, so the rows of each `PERMNO` appear in chronological order.

In [2]:
# Read the parquet file into `df_crsp`; the location is kept in a variable so it is easy to spot and change.
crsp_parquet_path = '/home/hzhang/data/wrds/crsp_prevdt.parq'
df_crsp = pl.read_parquet(crsp_parquet_path)

In [3]:
# Sanity check: none of the Float64/Float32 columns may contain a NaN.
float_columns = df_crsp.select(pl.col(pl.Float64, pl.Float32))
# One boolean per float column: True when that column holds at least one NaN.
nan_flags = float_columns.select(pl.all().is_nan().any()).row(0)
assert not any(nan_flags), "There are NaN values in at least one float column."

### Full logic

`prev_cal_dt` is the date (`DlyCalDt`) of the *last* row with a *non-null* `DlyPrc`, looking back at most 10 periods (rows) within the same `PERMNO`:
1. *last*: the current row's date is filled forward, and then `shift`ed by one row so that each row sees only earlier dates
2. *non-null*: only rows where `DlyPrc` is non-null contribute a date to fill forward (the `when` clause)
3. up to 10 periods: the forward fill is limited to 10-1 = 9 periods, because the `shift` accounts for the remaining one


In [4]:
max_missing_period = 10

# The row's own calendar date when it carries a price, null otherwise.
priced_date = pl.when(c('DlyPrc').is_not_null()).then(c('DlyCalDt'))

# Within each PERMNO: carry the last priced date forward over at most
# (max_missing_period - 1) following rows, then shift by one row so a row
# never sees its own date. Relies on rows being in date order per PERMNO.
prev_priced_date = (
    priced_date
    .forward_fill(limit=max_missing_period - 1)
    .shift()
    .over('PERMNO')
)

# Keep the first row per (DlyCalDt, PERMNO), preserving the input order
# (original note: duplicates come from multiple dividend rows), then attach
# the reconstructed previous date as `prev_cal_dt`.
df_dedup = df_crsp.unique(subset=['DlyCalDt', 'PERMNO'], keep='first', maintain_order=True)
df = df_dedup.with_columns(prev_cal_dt=prev_priced_date)

## Compare to CRSP value
1. Boundary effects: if the ground-truth `DlyPrevDt` falls before the dataframe's first date, then it's OK for it not to match.
2. Only care when `DlyPrc` is not null
3. Use `ne_missing` so that `null` patterns match as well

Only a very small number of rows still mismatch, out of 67M rows.

Note that the simple fix of setting `prev_cal_dt` to null whenever it is earlier than `SecInfoStartDt` results in many more mismatches elsewhere.

In [5]:
# Earliest calendar date present in the frame; CRSP values before it cannot be reproduced.
min_date = df['DlyCalDt'].min()

# CRSP's previous date is either inside the sample window or absent.
crsp_prev_in_sample = c('DlyPrevDt').ge(min_date) | c('DlyPrevDt').is_null()
# Only rows that actually carry a price are compared.
has_price = c("DlyPrc").is_not_null()
# Null-aware inequality: null vs. null counts as equal, null vs. value as different.
differs_from_crsp = c('DlyPrevDt').ne_missing(c('prev_cal_dt'))

mismatches = df.filter(crsp_prev_in_sample & has_price & differs_from_crsp)

# tbl_cols=-1 so that the rendered table does not elide any column.
with pl.Config(tbl_cols=-1):
    display(mismatches)

PERMNO,SecInfoStartDt,SecInfoEndDt,HdrCUSIP,SecurityActiveFlg,Ticker,PERMCO,DlyCalDt,DlyPrc,DlyPrevDt,prev_cal_dt
f64,datetime[ns],datetime[ns],str,str,str,f64,datetime[ns],f64,datetime[ns],datetime[ns]
11178.0,1994-08-03 00:00:00,1994-09-08 00:00:00,"""90462840""","""Y""","""UNRC""",9026.0,1994-08-03 00:00:00,1.875,,1994-07-20 00:00:00
81014.0,1995-06-26 00:00:00,1997-04-22 00:00:00,"""89323B20""","""Y""","""TSRG""",13370.0,1995-06-26 00:00:00,1.0,,1995-06-16 00:00:00
76942.0,1995-09-18 00:00:00,1996-04-01 00:00:00,"""29087M30""","""Y""","""EMRL""",11091.0,1995-09-18 00:00:00,0.21875,,1995-09-13 00:00:00
10155.0,1996-08-22 00:00:00,1997-02-06 00:00:00,"""28485330""","""Y""","""ELGT""",8079.0,1996-08-22 00:00:00,1.125,,1996-08-09 00:00:00
88459.0,2008-04-28 00:00:00,2008-06-26 00:00:00,"""69511V20""","""Y""","""PACT""",37950.0,2008-04-28 00:00:00,0.9988,,2008-04-24 00:00:00
…,…,…,…,…,…,…,…,…,…,…
14675.0,2020-06-10 00:00:00,2020-06-18 00:00:00,"""87876P20""","""Y""","""TGEN""",54873.0,2020-06-10 00:00:00,0.75,,2020-06-08 00:00:00
21018.0,2023-10-18 00:00:00,2023-11-05 00:00:00,"""02369M10""","""Y""","""AMAO""",58077.0,2023-10-18 00:00:00,10.895,,2023-10-13 00:00:00
22798.0,2023-12-18 00:00:00,2024-04-21 00:00:00,"""83204U60""","""Y""","""SMFL""",59195.0,2023-12-18 00:00:00,0.6589,,2023-12-14 00:00:00
21859.0,2024-07-08 00:00:00,2024-12-26 00:00:00,"""G2263T12""","""Y""","""MITA""",58478.0,2024-07-08 00:00:00,11.035,,2024-07-03 00:00:00


In [6]:
# (rows, columns) of `df`, shown to put the mismatch count in proportion to the full frame.
df.shape

(67644099, 11)