In [0]:
# x
# NOTE(review): the wildcard import hides which names this notebook relies on.
# `x` below is presumably exported by test_cook — confirm, and prefer explicit imports.
from test_cook import *
x

In [0]:
try:
    %reload_ext autotime
except:
    %pip install ipython-autotime ipywidgets openpyxl
    import IPython
    IPython.display.clear_output()
%reload_ext autotime

import pathlib, shutil, warnings, numpy as np, pandas as pd
seed = 42  # seed value; not referenced in the cells visible here
root = pathlib.Path('/Volumes/aiml')  # base path that both input and output locations hang off
source = root / 'scook/scook_files/admitted_flags_raw'  # folder whose files are scanned for admitted-flags .xlsx workbooks
target = root / 'flags/flags_volume'  # output root; parquet files are written to <target>/<term_code>/
# shutil.rmtree(target, ignore_errors=True) # if you want a fresh restart

############ helper functions I use in many projects ############
pd.set_option('display.max_columns', None)  # no limit on the number of columns shown

# Silence a fixed set of known warnings. Each text is wrapped in `.*` on both
# sides before being handed to the warnings filter as its message pattern.
for _noisy in (
    "Could not infer format, so each element will be parsed individually, falling back to `dateutil`",
    "Engine has switched to 'python' because numexpr does not support extension array dtypes",
    "The default of observed=False is deprecated and will be changed to True in a future version of pandas",
    "errors='ignore' is deprecated and will raise in a future version",
    "The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
    "The behavior of array concatenation with empty entries is deprecated",
    "DataFrame is highly fragmented",
):
    warnings.filterwarnings(action='ignore', message=f".*{_noisy}.*")

def disp(df, rows=4, head=True):
    """Display the first or last `rows` rows of `df`.

    Parameters
    ----------
    df : DataFrame to show.
    rows : number of rows to show; also used as the temporary
        `display.min_rows` / `display.max_rows` setting.
    head : if True show the first `rows` rows, otherwise the last `rows`.

    Returns None. Calls `display`, which is not imported here and is expected
    to be provided by the notebook environment.
    """
    with pd.option_context('display.min_rows', rows, 'display.max_rows', rows):
        # `tail`, not `tails`: the latter does not exist and made head=False raise AttributeError.
        display(df.head(rows) if head else df.tail(rows))

def to_numeric(df, downcast='integer', errors='ignore', **kwargs):
    """Convert each column to a numeric dtype where possible.

    Steps, applied column by column:
      1. object/string columns are cast to 'string', lower-cased and stripped;
      2. datetime columns are left alone; every other column goes through
         `pd.to_numeric`;
      3. the result is passed through `convert_dtypes()` (nullable dtypes).

    Parameters
    ----------
    df : DataFrame to convert.
    downcast : forwarded to `pd.to_numeric`.
    errors : 'ignore' (default) keeps a column unchanged when it cannot be
        parsed; any other value is forwarded to `pd.to_numeric` as-is.
    **kwargs : forwarded to `pd.to_numeric`.

    Returns a new DataFrame.
    """
    def _prep_strings(s):
        return s.astype('string').str.lower().str.strip() if s.dtype in ['object','string'] else s

    def _convert(s):
        if pd.api.types.is_datetime64_any_dtype(s):
            return s
        if errors == 'ignore':
            # pandas deprecated errors='ignore'; catching the parse failure
            # explicitly gives the same "leave the column untouched" result.
            try:
                return pd.to_numeric(s, downcast=downcast, errors='raise', **kwargs)
            except (ValueError, TypeError):
                return s
        return pd.to_numeric(s, downcast=downcast, errors=errors, **kwargs)

    return (
        df
        .apply(_prep_strings)  # prep strings
        .apply(_convert)       # convert to numeric if possible
        .convert_dtypes()      # convert to new nullable dtypes
    )

def prep(df, **kwargs):
    """Apply `to_numeric` and label clean-up to both the columns and the index of `df`.

    String column labels are lower-cased, stripped, and have spaces and hyphens
    replaced by underscores; non-string labels are kept unchanged. The index is
    moved into columns, cleaned the same way, and put back as a MultiIndex.
    `**kwargs` are forwarded to the `to_numeric` method.
    """
    def _clean_label(label):
        if isinstance(label, str):
            return label.lower().strip().replace(' ','_').replace('-','_')
        return label

    def _normalize(frame):
        return frame.to_numeric(**kwargs).rename(columns=_clean_label)

    # Drop all columns and reset_index so the index levels become ordinary columns, then normalize them.
    index_frame = _normalize(df[[]].reset_index())
    body = _normalize(df).reset_index(drop=True)
    # Re-attach the normalized index levels as the frame's index.
    return body.set_index(pd.MultiIndex.from_frame(index_frame))

def wrap(fcn):
    """Make a DataFrame helper work for Series and DataFrames.

    The returned wrapper converts its first argument to a DataFrame, calls
    `fcn` on it with the remaining arguments, and then:
      * returns None if `fcn` returned None;
      * if the input was a Series, squeezes the result along the column axis;
      * otherwise returns the result unchanged.
    """
    def wrapper(S, *args, **kwargs):
        df = fcn(pd.DataFrame(S), *args, **kwargs)
        if df is None:
            return None
        if isinstance(S, pd.Series):
            # Squeeze columns only: a bare squeeze() would also collapse a
            # one-row result to a scalar, so a length-1 Series would not come back as a Series.
            return df.squeeze(axis='columns')
        return df
    return wrapper

# Monkey-patch my helpers into the pandas Series & DataFrame classes so we can use df.method syntax.
for f in [disp, to_numeric, prep]:
    setattr(pd.DataFrame, f.__name__, f)
    setattr(pd.Series, f.__name__, wrap(f))  # Series version goes through wrap()

In [0]:
# Get id-pidm crosswalk so we can replace id by pidm in flags below.
# NOTE(review): the result is used exactly as Spark returns it (the trailing
# .prep() call is commented out), while the workbook side of the later merge
# goes through .prep() — confirm both `id` columns end up with the same dtype.
query = """
select distinct
    spriden_id as id,
    spriden_pidm as pidm
from
    dev.bronze.saturnspriden
where
    spriden_change_ind is null
    and spriden_activity_date between '2000-09-01' and '2025-09-01'
    and spriden_id REGEXP '^[0-9]+'
"""
spriden = spark.sql(query).toPandas()  #.prep()
spriden

In [0]:
# Convert every admitted-flags workbook in `source` into one parquet file per
# term code under `target`, replacing `id` with `pidm` via the spriden crosswalk.
# Existing parquet files are left alone, so re-running only fills in what is missing.
for src in sorted(source.iterdir(), reverse=True):
    # pathlib's stem/suffix instead of split('.'): names with zero or several
    # dots (sub-folders, "report.v2.xlsx") used to abort the loop with an unpacking ValueError.
    stem, ext = src.stem.lower(), src.suffix.lower()
    if ext != '.xlsx' or 'melt' in stem or 'admitted' not in stem:
        print(stem, 'SKIP')
        continue
    # Handles 2 naming conventions that were used at different times
    try:
        dt = pd.to_datetime(stem[:10].replace('_','-'))  # date taken from the first 10 characters
        multi = True
    except Exception:  # not a bare except: kernel interrupts still stop the loop
        try:
            dt = pd.to_datetime(stem[-6:])  # date taken from the last 6 characters
            multi = False
        except Exception:
            print(stem, 'FAIL')
            continue
    print(stem, dt.date())
    # Context manager closes the workbook once all of its sheets are processed.
    with pd.ExcelFile(src, engine='openpyxl') as book:
        # Again, handles the 2 different versions with different sheet names
        if multi:
            # keep numeric sheet names whose last two digits are 01, 06 or 08; the sheet name is the term code
            sheets = {sheet:sheet for sheet in book.sheet_names if sheet.isnumeric() and int(sheet) % 100 in [1,6,8]}
        else:
            # single-sheet layout: the term code is the first 6 characters of the file name
            sheets = {stem[:6]: book.sheet_names[0]}
        for term_code, sheet in sheets.items():
            trg = target / f'{term_code}/flg_{term_code}_{dt.date()}.parquet'  # target parquet file
            if trg.exists():
                continue
            print(trg)
            trg.parent.mkdir(parents=True, exist_ok=True)
            df = (
                spriden
                .assign(current_date=dt)
                .merge(book.parse(sheet).prep(), on='id', how='right')  # right join keeps every workbook row
                .drop(columns=['id','last_name','first_name','mi','pref_fname','street1','street2','primary_phone','call_em_all','email'], errors='ignore')
            )
            df.to_parquet(trg)
            # Show rows whose id had no match in the crosswalk (pidm is null).
            mask = df['pidm'].isnull()
            if mask.any():
                df[mask].disp()
