In [0]:
try:
    %reload_ext autotime
except:
    %pip install -U ipython-autotime ipywidgets codetiming openpyxl
    import IPython
    IPython.display.clear_output()
%reload_ext autotime
import pathlib, shutil, warnings, dataclasses, numpy as np, pandas as pd
from codetiming import Timer
seed = 42  # fixed random seed (not referenced in the cells visible here)
# Table-name prefix: the queries below interpolate it directly in front of the table name
# (e.g. f"{catalog}spriden"), and no separator is added.
# NOTE(review): confirm the resulting names (e.g. dev.bronze.saturnspriden) match the real tables.
catalog = 'dev.bronze.saturn'
# Project folder; the parquet outputs and caches below are written beneath it.
root = pathlib.Path('/Workspace/Users/scook@tarleton.edu/admitted_matriculation_predictor_2025/')

##########################################
############ helper functions ############
##########################################
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None
def disp(df, rows=4, head=True):
    """Display the first or last `rows` rows of `df`.

    Parameters
    ----------
    df : DataFrame to show.
    rows : number of rows to show; also used for pandas' min/max row display limits
        while rendering.
    head : True shows the first rows, False shows the last rows.

    Returns
    -------
    None. Output is produced through the notebook's `display`.
    """
    with pd.option_context('display.min_rows', rows, 'display.max_rows', rows):
        # `tail`, not `tails`: the original attribute does not exist, so head=False always raised.
        display(df.head(rows) if head else df.tail(rows))

def to_numeric(df, downcast='integer', errors='ignore', **kwargs):
    """Convert each column to a numeric dtype where possible.

    Text columns (object or pandas string dtype) are first lower-cased and stripped.
    Datetime columns are passed through untouched. Every other column goes through
    `pd.to_numeric`, and the result is moved to pandas' nullable dtypes.

    Parameters
    ----------
    df : DataFrame to convert.
    downcast : forwarded to `pd.to_numeric`.
    errors : 'ignore' (default) keeps a column unchanged when it cannot be parsed;
        any other value is forwarded to `pd.to_numeric` as-is.
    **kwargs : forwarded to `pd.to_numeric`.

    Returns
    -------
    A new DataFrame; the input is not modified.
    """
    def clean_text(s):
        # explicit dtype checks instead of comparing a dtype object against strings
        if pd.api.types.is_object_dtype(s.dtype) or isinstance(s.dtype, pd.StringDtype):
            return s.astype('string').str.lower().str.strip()
        return s

    def convert(s):
        if pd.api.types.is_datetime64_any_dtype(s):
            return s
        if errors == 'ignore':
            # pd.to_numeric(errors='ignore') is deprecated; emulate it: return the input on failure
            try:
                return pd.to_numeric(s, downcast=downcast, errors='raise', **kwargs)
            except (ValueError, TypeError):
                return s
        return pd.to_numeric(s, downcast=downcast, errors=errors, **kwargs)

    return (
        df
        .apply(clean_text)  # prep strings
        .apply(convert)  # convert to numeric if possible
        .convert_dtypes()  # convert to new nullable dtypes
    )

def prep(df, **kwargs):
    """Normalize values, column labels and index of `df`.

    Both the data columns and the index (temporarily moved into columns) go through
    the `to_numeric` helper, and string column labels are lower-cased, stripped and
    have spaces/hyphens turned into underscores. The cleaned index columns are then
    re-attached as a MultiIndex. `kwargs` are forwarded to `to_numeric`.
    """
    def tidy_label(label):
        # only string labels are rewritten; anything else passes through untouched
        if isinstance(label, str):
            return label.lower().strip().replace(' ','_').replace('-','_')
        return label

    def tidy(frame):
        return frame.to_numeric(**kwargs).rename(columns=tidy_label)

    # drop all columns and reset_index so only the index remains, as columns, then tidy it
    index_frame = tidy(df[[]].reset_index())
    body = tidy(df).reset_index(drop=True)
    # put the cleaned index back in place of the positional one
    return body.set_index(pd.MultiIndex.from_frame(index_frame))

def wrap(fcn):
    """Make a DataFrame helper usable on both Series and DataFrames.

    The returned wrapper converts its first argument to a DataFrame, calls `fcn`,
    and, when the input was a Series and the result is a DataFrame, squeezes the
    result back along the column axis. Any other result (including None) is
    returned unchanged.
    """
    def wrapper(S, *args, **kwargs):
        out = fcn(pd.DataFrame(S), *args, **kwargs)
        if isinstance(S, pd.Series) and isinstance(out, pd.DataFrame):
            # Squeeze columns only: a bare squeeze() would turn a one-row result into a
            # scalar (or collapse a one-row, multi-column frame across its columns).
            return out.squeeze(axis='columns')
        return out
    return wrapper

# Monkey-patch the helpers onto pandas' Series & DataFrame classes so the df.method(...) syntax works.
for f in [disp, to_numeric, prep]:
    setattr(pd.DataFrame, f.__name__, f)  # DataFrames get the helper as-is
    setattr(pd.Series, f.__name__, wrap(f))  # Series go through wrap(), which converts to a DataFrame first

def run(qry):
    """Execute `qry` through the notebook's `spark` session and return the result
    as a pandas DataFrame passed through `.prep()`."""
    spark_result = spark.sql(qry)
    frame = spark_result.toPandas()
    return frame.prep()

# annoying warnings to suppress
[warnings.filterwarnings(action='ignore', message=f".*{w}.*") for w in [
    "Could not infer format, so each element will be parsed individually, falling back to `dateutil`",
    "Engine has switched to 'python' because numexpr does not support extension array dtypes",
    "The default of observed=False is deprecated and will be changed to True in a future version of pandas",
    "errors='ignore' is deprecated and will raise in a future version",
    "The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
    "The behavior of array concatenation with empty entries is deprecated",
    "DataFrame is highly fragmented",
]]

#######################################################
############ process flags reports archive ############
#######################################################
@Timer()
def process_flags(overwrite=False, max_stale=5):
    """Convert the archive of admitted-student flag workbooks into per-term parquet files.

    Workbooks in the source folder are visited newest-name first. Only `.xlsx` files
    whose name contains 'admitted' and not 'melt' are processed. Two file-naming
    conventions are supported:

    * name starts with a date (first 10 characters, `_` treated as `-`): the workbook
      has one sheet per term code; numeric sheets ending in 01, 06 or 08 are used;
    * otherwise the last 6 characters are parsed as the date, the first 6 characters
      are the term code, and the first sheet is used.

    For every (term, date) pair that has no parquet yet, the sheet is right-joined to
    the id -> pidm crosswalk, identifying columns are dropped, and the result is
    written to `root/flags/<term_code>/flg_<term_code>_<date>.parquet`. Rows that end
    up with no pidm are displayed.

    Parameters
    ----------
    overwrite : delete the whole target folder first and rebuild everything.
    max_stale : stop once more than this many files have been visited since the last
        parquet was written (default 5, the previously hard-coded value).
    """
    source = pathlib.Path('/Volumes/aiml/scook/scook_files/admitted_flags_raw')
    target = root / 'flags'
    if overwrite:
        shutil.rmtree(target, ignore_errors=True) # if you want a fresh restart
    spriden = None  # id-pidm crosswalk; only queried once a file actually needs processing
    counter = 0  # files visited since the last parquet was written
    for src in sorted(source.iterdir(), reverse=True):
        counter += 1
        if counter > max_stale:
            break
        # stem/suffix instead of split('.') so names with zero or several dots cannot crash the unpack
        a, b = src.stem.lower(), src.suffix.lower().lstrip('.')
        if b != 'xlsx' or 'melt' in a or 'admitted' not in a:
            print(a, 'SKIP')
            continue
        # Handles 2 naming conventions that were used at different times
        try:
            dt = pd.to_datetime(a[:10].replace('_','-'))
            multi = True
        except Exception:  # unparseable prefix -> try the other convention (Ctrl-C still propagates)
            try:
                dt = pd.to_datetime(a[-6:])
                multi = False
            except Exception:
                print(a, 'FAIL')
                continue
        print(a, dt.date())
        # context manager closes the workbook handle once its sheets have been handled
        with pd.ExcelFile(src, engine='openpyxl') as book:
            # Again, handles the 2 different versions with different sheet names
            if multi:
                sheets = {sheet:sheet for sheet in book.sheet_names if sheet.isnumeric() and int(sheet) % 100 in [1,6,8]}
            else:
                sheets = {a[:6]: book.sheet_names[0]}
            for term_code, sheet in sheets.items():
                trg = target / f'{term_code}/flg_{term_code}_{dt.date()}.parquet'  # target parquet file
                if trg.exists():
                    continue
                counter = 0  # new work found: restart the stale-file count
                if spriden is None:
                    # Get id-pidm crosswalk so we can replace id by pidm in the flags
                    qry = f"""
                    select distinct
                        spriden_id as id,
                        spriden_pidm as pidm
                    from
                        {catalog}spriden
                    where
                        spriden_change_ind is null
                        and spriden_activity_date between '2000-09-01' and '2025-09-01'
                        and spriden_id REGEXP '^[0-9]+'
                    """
                    spriden = run(qry)

                print(trg)
                trg.parent.mkdir(parents=True, exist_ok=True)
                df = (
                    spriden
                    .assign(current_date=dt)
                    .merge(book.parse(sheet).prep(), on='id', how='right')  # keep every flags row, matched or not
                    .drop(columns=['id','last_name','first_name','mi','pref_fname','street1','street2','primary_phone','call_em_all','email'], errors='ignore')
                )
                df.to_parquet(trg)
                # show the flag rows whose id had no match in the crosswalk
                mask = df['pidm'].isnull()
                if mask.any():
                    df[mask].disp()

# process_flags()

In [0]:
###########################################################
############ cached base class & term calendar ############
###########################################################
@dataclasses.dataclass
class MyBaseClass():
    """Lets us access object attributes using self.attr or self['attr'],
    and provides `get`, a compute-once cache backed by parquet files under `root`."""
    def __contains__(self, key):
        return hasattr(self, key)
    def __getitem__(self, key):
        return getattr(self, key)
    def __setitem__(self, key, val):
        setattr(self, key, val)
    def __delitem__(self, key):
        # Only instance attributes can be deleted. A key that exists solely as a class-level
        # default passes hasattr() but would make delattr() raise, so check the instance dict.
        if key in vars(self):
            delattr(self, key)

    def get(self, fcn, nm, overwrite=False, path=''):
        """Return attribute `nm`, loading or building it if necessary.

        Lookup order: existing attribute -> parquet file -> `fcn().prep()` (which is then
        stored as the attribute and written to the parquet file).

        Parameters
        ----------
        fcn : zero-argument callable that builds the value; its result must offer `.prep()`.
        nm : attribute name, also the parquet file stem.
        overwrite : drop the attribute and the parquet file first, forcing a rebuild.
        path : sub-folder of `root` holding the parquet file ('' = directly under `root`).
        """
        # `path` was previously ignored and the literal folder name 'path' was used instead;
        # caches written by the old code under root/'path' are therefore not picked up.
        parq = root / path / f'{nm}.parquet'
        if overwrite:
            del self[nm]
            parq.unlink(missing_ok=True)
        if nm in self:
            print('exists')
        elif parq.exists():
            print('parq')
            self[nm] = pd.read_parquet(parq)
        else:
            print(f'creating {parq.name}')
            with Timer():
                parq.parent.mkdir(parents=True, exist_ok=True)
                self[nm] = fcn().prep()
                self[nm].to_parquet(parq)
        return self[nm]

@dataclasses.dataclass
class Terms(MyBaseClass):
    """Term calendar plus the position of one date within a term's admissions cycle.

    `cycle_day` is the number of days between `cycle_date` and the term's `stable_date`.
    Precedence: if `cycle_day` is given it wins and `cycle_date` is derived from it;
    otherwise `cycle_date` (default: today in `tz`) is normalized to midnight and
    `cycle_day` is derived from it.
    """
    term_code: int = 202408
    cycle_day: int = None  # None -> derive from cycle_date
    cycle_date: pd.Timestamp = None  # None -> today (wall clock in `tz`)
    tz: str = 'America/Chicago'  # timezone used to decide what "today" is

    def __post_init__(self):
        self.get_terms()
        self.stable_date = self.terms.loc[self.term_code,'stable_date']
        if self.cycle_day is None:
            if self.cycle_date is None:
                # Take "now" in self.tz rather than on the kernel's clock, then drop the tz so
                # the value can be subtracted from stable_date, which is treated as tz-naive
                # (the previous code subtracted a naive now() from it).
                self.cycle_date = pd.Timestamp.now(self.tz).tz_localize(None)
            self.cycle_date = pd.to_datetime(self.cycle_date).normalize()
            self.cycle_day = (self.stable_date - self.cycle_date).days
        # always re-derive the date so cycle_date and cycle_day agree
        self.cycle_date = self.stable_date - pd.Timedelta(days=self.cycle_day)

    def get_terms(self, overwrite=False):
        """Return the term table (cached as attribute `terms` / parquet via `get`).

        Indexed by term_code, with one extra computed column, `stable_date`.
        `overwrite=True` forces a fresh query.
        """
        def fcn():
            qry = f"""
            select
                A.stvterm_code as term_code,
                replace(A.stvterm_desc, ' ', '') as term_desc,
                A.stvterm_start_date as start_date,
                A.stvterm_end_date as end_date,
                A.stvterm_fa_proc_yr as fa_proc_yr,
                A.stvterm_housing_start_date as housing_start_date,
                A.stvterm_housing_end_date as housing_end_date,
                B.sobptrm_census_date as census_date
            from {catalog}stvterm A, {catalog}sobptrm B
            where A.stvterm_code = B.sobptrm_term_code and B.sobptrm_ptrm_code='1'"""
            df = run(qry).set_index('term_code')
            # weekday() is 0 for Monday, so 4-weekday lands on that week's Friday;
            # the extra 7 moves it to the Friday of the following week
            df['stable_date'] = df['census_date'].apply(lambda x: x+pd.Timedelta(days=7+4-x.weekday()))
            return df
        return self.get(fcn, 'terms', overwrite)
    
    # def get_

# Build the Terms object the cells below poke at: term 202508 at cycle day 1.
# (Pass cycle_date='2025-09-11' instead of cycle_day to derive the day from a date;
#  call self.get_terms(True) to force-refresh the cached term table.)
self = Terms(term_code=202508, cycle_day=1)
# Cell output: the last item recomputes cycle_date from stable_date and cycle_day, so it should equal the first.
self.cycle_date, self.stable_date, self.cycle_day, self.stable_date - pd.Timedelta(days=self.cycle_day)

In [0]:
# Scratch: today's date at midnight, taken from the kernel's clock (no timezone applied).
# Swap pd.Timestamp.now() for a fixed value such as '2025-09-11' to try a specific date.
self.cycle_date = pd.to_datetime(pd.Timestamp.now()).normalize()
print(self.cycle_date)


In [0]:
# Scratch: print the current naive timestamp, then the same value after attaching self.tz.
# NOTE(review): tz_localize only labels the naive wall-clock time with self.tz; it does not
# convert it. This also leaves self.cycle_date tz-aware for any cell run afterwards.
self.cycle_date = pd.Timestamp.now()
print(self.cycle_date)
self.cycle_date = pd.to_datetime(self.cycle_date).tz_localize(self.tz)#, utc=self.tz)
print(self.cycle_date)


In [0]:
# Scratch: parse a date string into a Timestamp and show it
# (replaces a leftover `pd.to_datetime?` help lookup).
x = '2025-09-11'
x = pd.to_datetime(x)
x

In [0]:
# Scratch: stable_date moved back by cycle_day days, using DateOffset.
self.stable_date - pd.DateOffset(days=self.cycle_day)

In [0]:
# Scratch: same expression as the previous cell; the commented lines are the
# pd.Timedelta alternative that was being compared against DateOffset.
self.stable_date - pd.DateOffset(days=self.cycle_day)
# - pd.Timedelta(days=self.cycle_day)
# pd.Timedelta(days=self.cycle_day)

In [0]:
# Scratch: current time in self.tz, moved back one day.
x = pd.Timestamp.now(self.tz) - pd.DateOffset(days=1)
x

In [0]:
# Scratch round-trip check: one day before stable_date should be 1 day away from it.
x = self.stable_date - pd.DateOffset(days=1)
(self.stable_date - x).days

In [0]:
# Scratch: show the current values of the three cycle attributes.
self.stable_date, self.cycle_day, self.cycle_date

In [0]:
# Scratch: whole days from now (America/Chicago) until stable_date, with stable_date labelled as Chicago time.
(self.stable_date.tz_localize('America/Chicago') - pd.Timestamp.now('America/Chicago')).days

In [0]:
# Scratch: look up the term-table entry for term code 202408.
self.terms.loc[202408]#,'end_date']