In [0]:
username = 'scook'
from IPython.display import clear_output
try:
    %reload_ext autotime
except:
    %pip install -U ipython-autotime ipywidgets codetiming Jinja2 numpy pandas pyarrow
    dbutils.library.restartPython()
    clear_output()
    dbutils.notebook.exit('Rerun to use newly installed/updated packages')

import os, sys, copy, pathlib, shutil, pickle, warnings, requests, dataclasses, time, codetiming, numpy as np, pandas as pd
# Notebook-wide settings and constants.
clear_output()  # wipe any output produced so far in this cell
pd.options.display.max_columns = None  # None = no limit on the number of columns pandas displays
now = pd.Timestamp.now()  # captured once, when this cell runs
root = pathlib.Path('/Volumes/aiml/scook/scook_files/transcript_ocr/')  # directory whose files are loaded by a later cell

############ helper functions ############
def dt(*args):
    """Return the earliest non-missing date among ``args``, truncated to midnight.

    Every argument is parsed with ``pd.to_datetime``; missing values are dropped
    before the minimum is taken.
    """
    stamps = pd.to_datetime(args)
    earliest = stamps.dropna().min()
    return earliest.normalize()

def setmeth(cls, fcn):
    """Attach ``fcn`` to ``cls`` as an attribute named after the function.

    This is a monkey-patch, so it only works for classes that accept new
    attributes (immutable classes raise).
    """
    method_name = fcn.__name__
    setattr(cls, method_name, fcn)

def listify(*args, sort=False, reverse=False):
    """Coerce the argument(s) into a list.

    Args:
        *args: a single value/iterable, or several values.
            - a single missing scalar (None, NaN, pd.NA, pd.NaT) -> []
            - a single string -> [string] (not split into characters)
            - a single iterable -> list(iterable)
            - anything else (non-iterable or several values) -> list of the arguments
        sort: if True, try to sort the result; unsortable content is returned unsorted.
        reverse: sort descending (only used when ``sort`` is True).

    Returns:
        list
    """
    if len(args) == 1:
        only = args[0]
        if isinstance(only, str):
            return [only]
        # NaN must be detected by value: `x is np.nan` only matches that one object
        # and misses float('nan'), np.float64('nan'), etc.
        is_nan = isinstance(only, (float, np.floating)) and only != only
        if only is None or only is pd.NA or only is pd.NaT or is_nan:
            return []
    try:
        items = list(*args)
    except Exception:
        # not iterable, or more than one positional argument: wrap the arguments themselves
        items = list(args)
    if sort:
        try:
            items = sorted(items, reverse=reverse)
        except Exception:
            # deliberate best-effort: mixed/unorderable content keeps its original order
            pass
    return items

def setify(*args):
    """Coerce the argument(s) into a set, using the same rules as ``listify``."""
    items = listify(*args)
    return set(items)

def unpack(*args, **kwargs):
    """Flatten arbitrarily nested lists/tuples/sets into one flat list.

    Only list, tuple and set arguments are recursed into; every other value is
    passed through ``listify``. ``kwargs`` (e.g. sort/reverse) are applied once,
    to the final flat list.
    """
    flat = []
    for item in args:
        if isinstance(item, (list, tuple, set)):
            flat.extend(unpack(*item))
        else:
            flat.extend(listify(item))
    return listify(flat, **kwargs)

def unique(*args, **kwargs):
    """Flatten the arguments and drop duplicates, keeping first-seen order.

    ``kwargs`` are forwarded to ``listify`` (e.g. sort/reverse).
    """
    first_seen = {}
    for item in unpack(*args):
        # dict keys preserve insertion order, so this de-duplicates stably
        first_seen.setdefault(item, None)
    return listify(first_seen, **kwargs)

def difference(A, B, **kwargs):
    """Return the unique items of ``A`` that are not in ``B``, in ``A``'s order.

    Args:
        A: value or iterable of candidate items.
        B: value or iterable of items to exclude.
        **kwargs: forwarded to ``unique`` (e.g. sort/reverse).
    """
    items = listify(A)
    # Materialize B once. Re-listifying it for every item of A was wasted work and,
    # when B was a one-shot iterator, left it empty after the first comparison.
    excluded = listify(B)
    return unique([x for x in items if x not in excluded], **kwargs)

def rjust(x, width, fillchar=' '):
    """Right-justify ``str(x)`` in a field of ``width``, padding with ``str(fillchar)``."""
    text, pad = str(x), str(fillchar)
    return text.rjust(width, pad)

def ljust(x, width, fillchar=' '):
    """Left-justify ``str(x)`` in a field of ``width``, padding with ``str(fillchar)``."""
    text, pad = str(x), str(fillchar)
    return text.ljust(width, pad)

def join(lst, sep='\n,', pre='', post=''):
    """Join the items of ``lst`` into one string.

    Items are listified and converted with ``str``; ``sep`` goes between items,
    ``pre`` is prepended and ``post`` appended to the result.
    """
    pieces = [str(item) for item in listify(lst)]
    body = str(sep).join(pieces)
    return f"{pre}{body}{post}"

def alias(dct):
    """Turn ``{original_name: new_name}`` into a list of ``'original as new'`` strings."""
    clauses = []
    for original, renamed in dct.items():
        clauses.append(f'{original} as {renamed}')
    return clauses

def indent(x, lev=1):
    """Indent every line of ``x`` after the first by ``lev`` copies of ``tab``.

    ``x`` is returned unchanged unless ``lev`` is greater than 0.
    NOTE(review): ``tab`` is a module-level name that is not defined in this
    section of the file - confirm it is defined before this is called with lev > 0.
    """
    if not lev > 0:
        return x
    return x.replace('\n', '\n' + tab*lev)

def subqry(qry, lev=1):
    """Format ``qry`` for embedding as a subquery.

    The query is stripped and put on its own line; if it contains the
    (lower-case) word 'select' it is also wrapped in parentheses. The result is
    then indented by ``lev`` levels.
    """
    body = '\n' + qry.strip()
    if 'select' in body:
        body = '(' + body + '\n)'
    return indent(body, lev)

def run(qry, show=False, sample='10 rows', seed=42):
    """Execute ``qry`` through Spark and return the result as a prepped DataFrame.

    Args:
        qry: a SQL statement, or a single word (no spaces) that is treated as a
            table name and expanded to ``select * from {catalog}{name}``.
        show: print the final SQL before running it.
        sample: ``tablesample`` clause content applied to the table-name form only;
            None disables sampling.
        seed: value for the ``repeatable`` clause that accompanies ``sample``.

    NOTE(review): ``catalog`` and ``spark`` are globals that are not defined in
    this section of the file.
    """
    tokens = qry.split(' ')
    if len(tokens) == 1:
        table = tokens[0]
        qry = f'select * from {catalog}{table}'
        if sample is not None:
            qry = f'{qry} tablesample ({sample}) repeatable ({seed})'
    if show:
        print(qry)
    result = spark.sql(qry).toPandas()
    return result.prep().sort_index()


############ pandas functions ############
def disp(X, max_rows=3, sort=False):
    """Print the shape of ``X``, then display a per-column summary and its leading rows.

    Args:
        X: DataFrame to show.
        max_rows: number of leading rows to display. ``None`` or a negative value
            displays every row (callers in this file pass -1 with that intent;
            forwarding it to ``head`` would drop the last row instead).
        sort: if True, order the columns by label before displaying.

    Returns:
        None; output is produced with ``print`` and the notebook's ``display``.
    """
    print(X.shape)
    X = (X.sort_index(axis=1) if sort else X).reset_index()  # show index levels as ordinary columns
    summary = (
        pd.DataFrame({'dtype': X.dtypes.astype('string'), 'missing_pct': X.isnull().mean()*100})
        .T.rename_axis('column').reset_index()
        .prep(case='')  # case='' is not a Series.str method, so to_numeric only strips whitespace
    )
    display(summary)
    show_all = max_rows is None or max_rows < 0
    display(X if show_all else X.head(max_rows))

def to_numeric(df, case='lower', downcast='integer', errors='ignore', category=False, **kwargs):
    """Clean string columns and convert every column to a numeric dtype where possible.

    Args:
        df: DataFrame to convert.
        case: name of a ``Series.str`` method applied to string columns after
            whitespace is stripped (e.g. 'lower', 'upper'); any value that is not
            a ``Series.str`` attribute falls back to 'strip'.
        downcast: forwarded to ``pd.to_numeric``.
        errors: 'ignore' leaves a column as it is when it cannot be parsed; any
            other value is forwarded to ``pd.to_numeric``.
        category: if True, string columns are converted to 'category' at the end.
        **kwargs: forwarded to ``pd.to_numeric``.

    Returns:
        A new DataFrame with nullable dtypes; integer columns are all 'Int64'.
    """
    str_method = case if case in dir(pd.Series(dtype='object').str) else 'strip'

    def clean_text(s):
        # object columns and any flavor of pandas string dtype count as text
        if s.dtype == object or isinstance(s.dtype, pd.StringDtype):
            return getattr(s.astype('string').str.strip().str, str_method)()
        return s

    def numeric_if_possible(s):
        if pd.api.types.is_datetime64_any_dtype(s):
            return s
        if errors != 'ignore':
            return pd.to_numeric(s, downcast=downcast, errors=errors, **kwargs)
        # errors='ignore' is deprecated in pandas.to_numeric; reproduce it by
        # catching the parse failure and keeping the column untouched.
        try:
            return pd.to_numeric(s, downcast=downcast, errors='raise', **kwargs)
        except (ValueError, TypeError):
            return s

    def finalize(s):
        if pd.api.types.is_integer_dtype(s):
            return s.astype('Int64')  # one integer width everywhere, nullable
        if category and isinstance(s.dtype, pd.StringDtype):
            return s.astype('category')
        return s

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return (
            df
            .apply(clean_text)
            .apply(numeric_if_possible)
            .convert_dtypes()  # switch to the nullable extension dtypes
            .apply(finalize)
        )

def prep(df, **kwargs):
    """Standardize a DataFrame: clean values, dtypes, column names and the index.

    Values/dtypes are handled by the ``to_numeric`` method (``kwargs`` are
    forwarded to it). String column labels are lower-cased, stripped, and have
    spaces and hyphens replaced by underscores. The index receives the same
    treatment and is always returned as a MultiIndex.
    """
    def normalize_label(label):
        if isinstance(label, str):
            return label.lower().strip().replace(' ', '_').replace('-', '_')
        return label

    def clean(frame):
        return frame.to_numeric(**kwargs).rename(columns=normalize_label)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # keep only the index (no columns), move it into columns, and clean it like the data
        index_frame = clean(df[[]].reset_index())
        body = clean(df).reset_index(drop=True)
        # put the cleaned index levels back as the index
        return body.set_index(pd.MultiIndex.from_frame(index_frame))

def groupb(df, by=None, **kwargs):
    """``df.groupby`` with this notebook's preferred defaults.

    Defaults: as_index=True, sort=False, group_keys=False, observed=False,
    dropna=False, level=None. Any of them can be overridden through ``kwargs``.

    ``axis`` is no longer passed by default: 0 is already pandas' default, and
    passing the keyword explicitly triggers a deprecation warning on recent
    pandas. Callers may still supply it via ``kwargs``.
    """
    defaults = {
        'level': None,
        'as_index': True,
        'sort': False,
        'group_keys': False,
        'observed': False,
        'dropna': False,
    }
    return df.groupby(by, **(defaults | kwargs))

def get_incoming(df):
    """Return the rows where levl_code is 'ug' and styp_code is 'n', 'r' or 't'.

    NOTE(review): presumably these codes identify incoming undergraduates
    (going by the function name) - confirm against the data dictionary.
    """
    condition = "levl_code=='ug' & styp_code in ['n','r','t']"
    return df.query(condition)

def get_duplicates(df, subset='pidm', quit=True, rows=10):
    """Return the rows whose ``subset`` key occurs more than once.

    Args:
        df: DataFrame to check.
        subset: column(s) identifying a record.
        quit: if True, raise when any duplicates are found.
        rows: forwarded to ``disp`` when the duplicates are shown.

    Raises:
        Exception: duplicates exist and ``quit`` is True.
    """
    group_sizes = df.groupb(subset, sort=True).transform('size')
    mask = group_sizes > 1
    duplicated_rows = df[mask]
    if mask.any():
        duplicated_rows.disp(rows)
        if quit:
            raise Exception(f'{mask.sum()} duplicates detected')
    return duplicated_rows

def get_missing(df, rows=-1):
    """Return the percentage of missing values per column.

    If anything is missing, the affected columns are also shown via ``disp``
    (largest percentage first, rounded to one decimal); ``rows`` is forwarded
    to ``disp``.
    """
    pct_missing = df.isnull().mean() * 100
    if pct_missing.any():
        affected = pct_missing[pct_missing > 0]
        affected.sort_values(ascending=False).round(1).disp(rows)
    return pct_missing

def wrap(fcn):
    """Adapt a DataFrame helper so it also works when called on a Series.

    The returned function converts its first argument to a DataFrame, calls
    ``fcn``, and, when the input was a Series, squeezes the result back down.
    A ``None`` result is passed through unchanged.

    ``functools.wraps`` copies the helper's name, docstring and other metadata
    onto the wrapper (previously only ``__name__`` was copied).

    NOTE: ``squeeze()`` collapses every length-1 axis, so a one-element result
    comes back as a scalar rather than a Series.
    """
    import functools  # local import: functools is not in the file's import line

    @functools.wraps(fcn)
    def wrapper(X, *args, **kwargs):
        result = fcn(pd.DataFrame(X), *args, **kwargs)
        if result is None:
            return None
        return result.squeeze() if isinstance(X, pd.Series) else result
    return wrapper

# Monkey-patch the helpers into the pandas Series & DataFrame classes so they
# can be called with df.method(...) syntax.
for fcn in (disp, to_numeric, prep, get_incoming, get_duplicates, get_missing, groupb):
    setmeth(pd.DataFrame, fcn)     # DataFrame gets the helper as-is
    setmeth(pd.Series, wrap(fcn))  # Series gets the DataFrame-adapting wrapper

############ file i/o functions ############
def get_size(path):
    """Print the disk usage of ``path`` by running ``du -h``.

    The path is shell-quoted, so names containing spaces or shell
    metacharacters are passed to ``du`` literally (shell wildcards in ``path``
    are therefore not expanded).
    """
    import shlex  # local import: shlex is not in the file's import line
    os.system(f'du -h {shlex.quote(str(path))}')

def rm(path, root=False):
    """Delete a file, or a directory's contents.

    Args:
        path: file or directory to remove; anything else is left alone.
        root: for a directory, True removes the directory itself (recursively),
            False only empties it.

    Returns:
        ``path`` as a ``pathlib.Path``.
    """
    target = pathlib.Path(path)
    if target.is_file():
        target.unlink()
        return target
    if not target.is_dir():
        return target
    if root:
        shutil.rmtree(target)
    else:
        for child in target.iterdir():
            rm(child, True)  # children are always removed entirely
    return target

def mkdir(path):
    """Make sure the directory for ``path`` exists and return ``path`` as a Path.

    A path without a suffix is treated as a directory and created; a path with
    a suffix is treated as a file, so only its parent directory is created.
    """
    target = pathlib.Path(path)
    directory = target.parent if target.suffix else target
    directory.mkdir(parents=True, exist_ok=True)
    return target

def reset(path):
    """Clear whatever is at ``path`` and make sure its directory exists.

    An existing file is deleted, an existing directory is emptied, and the
    directory (or the file's parent directory) is created if needed.

    Returns:
        ``path`` as a ``pathlib.Path``, matching what ``rm`` and ``mkdir``
        return, so callers can use Path attributes even when they passed a str.
    """
    path = pathlib.Path(path)
    rm(path)
    mkdir(path)
    return path

def prepr(X):
    """Recursively apply ``.prep()`` to every DataFrame/Series inside ``X``.

    Dicts are rebuilt as plain dicts with prepped values; lists, tuples and
    sets are rebuilt as the same type; anything else is returned unchanged.
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        return X.prep()
    if isinstance(X, dict):
        return {key: prepr(value) for key, value in X.items()}
    if isinstance(X, (list, tuple, set)):
        container = type(X)
        return container(map(prepr, X))
    return X

def dump(path, obj):
    """Write ``obj`` to ``path``, replacing whatever was there before.

    The format follows the file suffix: '.parquet' and '.csv' are written
    through a DataFrame, anything else is pickled. ``obj`` is passed through
    ``prepr`` first.

    Args:
        path: destination, as a str or ``pathlib.Path``.
        obj: object to store.

    Returns:
        The prepped object that was written.
    """
    # Coerce explicitly so `.suffix` works for str paths regardless of what reset() returns;
    # without this a str path crashed here after the old file had already been removed.
    path = pathlib.Path(reset(path))
    obj = prepr(obj)
    if path.suffix == '.parquet':
        # explicit pd.DataFrame wrapper works around a strange error under pandas 2.2.3:
        # "Object of type PlanMetrics is not JSON serializable" with to_parquet
        pd.DataFrame(obj).to_parquet(path)
    elif path.suffix == '.csv':
        pd.DataFrame(obj).to_csv(path)
    else:
        with open(path, 'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
    return obj

def load(path):
    """Read the object stored at ``path``.

    '.parquet' and '.csv' files are read with pandas; any other suffix is
    treated as a pickle file.
    """
    source = pathlib.Path(path)
    extension = source.suffix
    if extension == '.parquet':
        return pd.read_parquet(source)
    if extension == '.csv':
        return pd.read_csv(source)
    with source.open('rb') as handle:
        return pickle.load(handle)

In [0]:
# Load every entry under `root`, standardize it with prep(), and preview up to 100 rows.
# NOTE: `df` is reassigned on each iteration, so later cells see only the last file loaded.
for src in root.iterdir():
    print(src)
    df = load(src).prep()
    df.disp(100)

In [0]:
# First four characters of attended_term, cleaned/converted by prep()
# (presumably the academic year, going by the column name - confirm).
df['ay'] = df['attended_term'].str[:4].prep()
# Dense rank of `ay` within each document_id: 1 for a document's lowest `ay`, 2 for the next distinct value, ...
df['level'] = df.groupby('document_id')['ay'].transform('rank', method='dense')

In [0]:
# Preview the frame again (up to 100 rows).
df.disp(100)

In [0]:
crse = 'alg1'  # not used in this cell - presumably consumed by later cells
# Frequency of each sem_1 value, ordered by value; -1 is passed to disp as max_rows.
df.value_counts('sem_1').sort_index().disp(-1)