In [None]:
# ===================================================================== #
# util/csvt -- save/load CSV files with pickled datatypes
#
# History
#  - 10/7/2022 created, jpb
# ===================================================================== #

In [1]:
import sys
import pandas as pd
from pathlib import Path

In [5]:
# Save CSV file, with pickled type info

def save_csv_dtype(df, fp_csv, fp_dtype=None, dtype_only=False):
    """
    Write a dataframe to CSV and pickle its column dtypes next to it

    Args:
     - df          -- pandas dataframe to save
     - fp_csv      -- destination path for the CSV data
     - fp_dtype    -- destination path for the pickled dtypes
                      (default = fp_csv with all of its suffixes removed,
                      plus '-dtypes.pkl')
     - *dtype_only -- skip the CSV and write only the pickled dtypes
                      (default = False)

    The CSV is written without the dataframe index.
    """
    write_data = not dtype_only
    if write_data:
        df.to_csv(fp_csv, index=False)

    dtype_target = fp_dtype
    if dtype_target is None:
        # Drop every suffix of the file name (e.g. '.csv', '.csv.gz') so the
        # dtype file is named after the bare stem.
        all_suffixes = ''.join(Path(fp_csv).suffixes)
        csv_text     = str(fp_csv)
        dtype_target = csv_text.removesuffix(all_suffixes) + '-dtypes.pkl'

    df.dtypes.to_pickle(dtype_target)

In [6]:
# NOTE(review): ad-hoc smoke test. `df` is not defined in any cell shown here,
# so this cell raises NameError on a fresh kernel; the target path is also
# specific to one machine.
save_csv_dtype(df, r"C:\Users\alacy5\Documents\DT6\test", fp_dtype=None, dtype_only=False)

NameError: name 'df' is not defined

In [None]:
# Implementation function for load_csv_dtype: fixes cat and date types

def _clean_rdf(rdf, dtype, clean_cat, clean_date, verbose):
    for col in clean_cat:
        if verbose: print(f' - clean cat : {col}')
        v_str = [ _.replace("(","").replace("]", "").split(", ")
                  for _ in rdf[col] ]
        v_int = [ pd.Interval(float(i), float(j)) for i, j in v_str]

        rdf[col] = pd.Series(v_int).astype(dtype[col])

    for col in clean_date:
        if verbose: print(f' - clean date: {col}')
        rdf[col] = rdf[col].astype(dtype[col])

    return rdf

In [None]:
# Implementation class for load_csv_dtype: iterates over file

class LCDIter:
    """
    Iterator wrapper that runs _clean_rdf on each chunk of a chunked read

    Args:
     - df_iter    -- underlying iterator yielding dataframe chunks
     - dtype      -- mapping of column name -> dtype to restore
     - clean_cat  -- columns passed to _clean_rdf as interval categoricals
     - clean_date -- columns passed to _clean_rdf as dates
     - verbose    -- forwarded to _clean_rdf
    """

    def __init__(self, df_iter, dtype, clean_cat, clean_date, verbose):
        self.df_iter = df_iter
        self.dtype = dtype
        self.clean_cat = clean_cat
        self.clean_date = clean_date
        self.verbose = verbose

    def __iter__(self):
        # The object is its own iterator.
        return self

    def __next__(self):
        # StopIteration from the underlying iterator propagates unchanged
        # and ends the iteration.
        chunk = next(self.df_iter)
        return _clean_rdf(
            chunk,
            self.dtype,
            self.clean_cat,
            self.clean_date,
            self.verbose,
        )

In [None]:
# Load CSV file, using pickled type info

def load_csv_dtype(
        fp_csv,
        fp_dtype=None,
        chunksize=None,
        low_memory=True,
        warn=True,
        verbose = False):
    """
    Load CSV file, using pickled type info (from save_csv_dtype).

    Args
     - fp_csv      -- filepath to CSV file
     - *fp_dtype   -- filepath to dtype file.  Defaults to the CSV path with
                      all of its suffixes removed and "-dtypes.pkl" appended.
                      If that inferred file does not exist, falls back to a
                      plain pd.read_csv.
     - *chunksize  -- Iterate over file in chunksize chunks. Default=None
                      indicates load entire file.
     - *low_memory -- passed to read_csv.  Honestly may not be necessary
                      since the whole idea is to use the correct types
                      from the dtypes.pkl file instead of inferring them.
     - *warn       -- Print a message when the inferred dtype file is
                      missing.  Default = True
     - *verbose    -- Be verbose.  Default = False

    Returns:
     Dataframe, or Dataframe iterator if chunksize set.

    Raises:
     Whatever pd.read_pickle / pd.read_csv raise; read_csv errors are
     reported on stderr and re-raised.
    """

    if fp_dtype is None:
        suffix   = ''.join(Path(fp_csv).suffixes)
        fp_dtype = str(fp_csv).removesuffix(suffix) + '-dtypes.pkl'

        # short circuit to read_csv
        if not Path(fp_dtype).exists():
            if warn:
                print(f'load_csv_dtype: unable to infer dtypes filename ({fp_dtype})')
            return pd.read_csv(fp_csv,
                               low_memory = low_memory,
                               chunksize  = chunksize)

    # NOTE(security): unpickling can execute arbitrary code.  Only load dtype
    # files that come from a trusted source.
    dtype = pd.read_pickle(fp_dtype)

    #
    # Find categorical and date types.  These require additional processing
    # after the initial read_csv, so they are read with a placeholder dtype
    # and restored afterwards from the original dtype.
    #
    clean_cat  = []
    clean_date = []
    dtype_read = dtype.to_dict()
    for col, col_dtype in dtype.items():
        if isinstance(col_dtype, pd.CategoricalDtype):
            cats = col_dtype.categories
            # Guard the empty case: a categorical with no categories has no
            # first element to inspect.
            if len(cats) > 0 and isinstance(cats[0], pd.Interval):
                dtype_read[col] = 'category'
                clean_cat.append(col)
        elif pd.api.types.is_datetime64_dtype(col_dtype):
            # Any tz-naive datetime64 resolution, not only [ns]; read_csv
            # does not accept datetime64 as a dtype.
            dtype_read[col] = 'object'
            clean_date.append(col)

    chunked    = chunksize is not None
    read_label = 'pd.read_csv(w/chunksize)' if chunked else 'pd.read_csv'
    try:
        result = pd.read_csv(fp_csv, dtype=dtype_read,
                             low_memory = low_memory,
                             chunksize  = chunksize)
    except Exception:
        print(f'ERROR: during load_csv_dtype({fp_csv}, ...)',
              file=sys.stderr)
        print(f'     : exception in {read_label}',
              file=sys.stderr)
        raise

    if chunked:
        return LCDIter(result, dtype, clean_cat, clean_date, verbose)
    return _clean_rdf(result, dtype, clean_cat, clean_date, verbose)