## pandantic experiments

"simple" goal: pydantic schema for generating a dataframe



In [1]:
from inspect import getfullargspec
import pandas as pd
import typing
import json
from pydantic import BaseModel, create_model, validator, Field, ValidationError
import numpy as np
from enum import Enum

first, check out the instantiation args for a dataframe:

pd.DataFrame

In [2]:
pd.DataFrame?

Init signature:
pd.DataFrame(
    data=None,
    index: 'Axes | None' = None,
    columns: 'Axes | None' = None,
    dtype: 'Dtype | None' = None,
    copy: 'bool | None' = None,
)
Docstring:
Two-dimensional, size-mutable, potentially heterogeneous tabular data.

Data structure also contains labeled axes (rows and columns).
Arithmetic operations align on both row and column labels. Can be
thought of as a dict-like container for Series objects. The primary
pandas data s

So some complexity here... 
1. the `data` arg can be many things and is not explicitly typed. it's validated within `DataFrame.__init__()`, but not typed because it can be so many things.
2. except for `copy`, the annotated arguments use internal pandas types. We can check what the `Dtype` and `Axes` types are with:


In [3]:
pd._typing.Axes

typing.Collection[typing.Any]

In [4]:
pd._typing.Dtype

typing.Union[ForwardRef('ExtensionDtype'), str, numpy.dtype, typing.Type[typing.Union[str, float, int, complex, bool, object]]]

### initial manual `DataFrameModel`

a simple attempt at building a pydantic model. Adding a `dtype` attribute is proving difficult... for now, we'll use a string declaration approach with `Enum`. So let's construct a `DtypeEnum` from a list of strings corresponding to data types that we'll allow. When we get to trying to instantiate a true pandas `DataFrame`, we'll use `eval()` to get an actual type. 

In [5]:
# string names of the dtypes the schema accepts (not a complete list...)
allowed_types = ['int', 'float', 'str', 'complex', 'np.int64', 'np.float64']
# functional Enum API: every member uses the same string as its name and value
DtypeEnum = Enum("DtypeEnum", {type_name: type_name for type_name in allowed_types})

In [6]:
# pydantic schema whose fields mirror the arguments of the pd.DataFrame constructor
class DataFrameModel(BaseModel):
    data: dict # for simplicity for now, only allow data dict
    index: typing.Optional[pd._typing.Axes] = None
    columns: typing.Optional[pd._typing.Axes] = None    
    # dtype is given by name; only the strings listed in DtypeEnum are accepted
    dtype: typing.Optional[DtypeEnum] = None
    # stored as `copy_` and exposed under the alias 'copy'
    # (presumably because `copy` clashes with an existing BaseModel attribute -- TODO confirm)
    copy_: typing.Optional[bool] = Field(None, alias='copy')
    
    class Config:
        arbitrary_types_allowed = True  ## needed for Axes type        

In [7]:
# construct() builds an instance without validation, so the required `data`
# field can be omitted when all we want is to inspect the schema
df = DataFrameModel.construct()

In [8]:
df.schema()

{'title': 'DataFrameModel',
 'type': 'object',
 'properties': {'data': {'title': 'Data', 'type': 'object'},
  'index': {'title': 'Index'},
  'columns': {'title': 'Columns'},
  'dtype': {'$ref': '#/definitions/DtypeEnum'},
  'copy': {'title': 'Copy', 'type': 'boolean'}},
 'required': ['data'],
 'definitions': {'DtypeEnum': {'title': 'DtypeEnum',
   'description': 'An enumeration.',
   'enum': ['int', 'float', 'str', 'complex', 'np.int64', 'np.float64']}}}

In [9]:
# dump the model's JSON schema to disk so it can be filled in outside the notebook
with open('test_schema.json', 'w') as fi:
    fi.write(df.schema_json())

In [10]:
DataFrameModel(data={"a":[1,2,3]}, dtype="complex")

DataFrameModel(data={'a': [1, 2, 3]}, index=None, columns=None, dtype=<DtypeEnum.complex: 'complex'>, copy_=None)

In [11]:
DataFrameModel(data={"a":[1,2,3]}, dtype="complex").json()

'{"data": {"a": [1, 2, 3]}, "index": null, "columns": null, "dtype": "complex", "copy_": null}'

## instantiating a dataframe

Assuming we've used our schema above to write a json to `filled_schema.json`, let's actually instantiate a dataframe:

In [12]:
# read the filled-in JSON file and parse it into a DataFrameModel instance
valid_model = DataFrameModel.parse_file('filled_schema.json')

In [13]:
valid_model

DataFrameModel(data={'col_1': [1, 2, 3, 4], 'col_2': [-1, 20, 30, -20]}, index=None, columns=None, dtype=<DtypeEnum.np.int64: 'np.int64'>, copy_=True)

in the yt analysis schema approach, we attached a `._run` attribute to the pydantic classes. but it may be clearer to have a separate ingestion process:

In [14]:
def instantiate_df(pandantic_model: "DataFrameModel") -> pd.DataFrame:
    """Build a real ``pd.DataFrame`` from a populated ``DataFrameModel``.

    Parameters
    ----------
    pandantic_model
        Object exposing ``data``, ``index``, ``columns``, ``dtype`` and
        ``copy_`` attributes. ``dtype`` is either ``None`` or an enum member
        whose ``.value`` is the dtype name (e.g. ``'np.int64'``).

    Returns
    -------
    pd.DataFrame
        The frame produced by forwarding the model's fields to ``pd.DataFrame``.
    """
    enum_dtype = pandantic_model.dtype  # e.g., <DtypeEnum.int: 'int'>, or None
    if enum_dtype is None:
        # dtype is optional on the model: leave it to pandas to infer
        actual_dtype = None
    else:
        dtype_str = enum_dtype.value  # e.g., 'int'
        # NOTE(security): eval() is only acceptable because the string comes
        # from the DtypeEnum allow-list; a model built with construct() skips
        # that validation, so never feed this function unvalidated input.
        actual_dtype = eval(dtype_str)  # e.g., int
    return pd.DataFrame(pandantic_model.data,
                        index=pandantic_model.index,
                        columns=pandantic_model.columns,
                        dtype=actual_dtype,
                        copy=pandantic_model.copy_
                       )

In [15]:
df = instantiate_df(valid_model)

In [16]:
df.head()

Unnamed: 0,col_1,col_2
0,1,-1
1,2,20
2,3,30
3,4,-20


### combining pydantic and inspect for dynamically generating a model?

In [17]:
# introspect the DataFrame constructor: argument names, defaults and annotations
pd_func = pd.DataFrame
df_args = getfullargspec(pd_func)

In [18]:
df_args.annotations

{'index': 'Axes | None',
 'columns': 'Axes | None',
 'dtype': 'Dtype | None',
 'copy': 'bool | None'}

In [19]:
df_args.args

['self', 'data', 'index', 'columns', 'dtype', 'copy']

In [20]:
df_args.defaults

(None, None, None, None, None)

In [21]:
from pydantic.utils import validate_field_name

# common base class handed to create_model() for the dynamically generated models
class BaseDynamic(BaseModel):
    class Config:
        # same setting the hand-written DataFrameModel needs for the pandas Axes type
        arbitrary_types_allowed = True          
        
# per-callable field types that fill in unannotated arguments (`data`) or
# replace the introspected annotation (`dtype`) when generating a model
missing_override_types = {
  pd.DataFrame : {'data': dict, 'dtype': DtypeEnum}   
}


def generate_model_dict(pd_func) -> "typing.Type[BaseModel]":
    """Generate a pydantic model class from the signature of ``pd_func``.

    Despite the name, the return value is a model class, not a dict.
    The positional/keyword arguments of ``pd_func`` (minus a leading ``self``)
    become the model fields. Keyword-only arguments are not handled.

    Field types come from the string annotations of ``pd_func``: each
    ``|``-separated piece is looked up in ``pd._typing`` first and then in a
    small table of builtin names. Entries in ``missing_override_types`` for
    ``pd_func`` take precedence, and arguments with no type at all fall back
    to ``typing.Any``. Arguments without a default become required fields.
    Argument names that pydantic rejects as field names are stored under
    ``<name>_``.

    The intermediate results are printed for inspection.

    Parameters
    ----------
    pd_func
        The callable (e.g. ``pd.DataFrame``) to build a model for.

    Returns
    -------
    A ``BaseDynamic`` subclass named ``<pd_func.__name__>Model``.

    Raises
    ------
    NameError
        If a piece of a string annotation cannot be resolved to a type.
    """
    f_args = getfullargspec(pd_func)

    # work out how many args, kwargs there are
    argnames = f_args.args
    print("\nargument names:")
    print(argnames)
    if len(argnames)>1 and argnames[0]=='self':
        argnames = argnames[1:]

    # getfullargspec reports `defaults` as None when nothing has a default
    defaults = f_args.defaults or ()
    n_arg_only = len(argnames) - len(defaults)  # args without a default

    # defaults line up with the trailing arguments
    default_dict = dict(zip(argnames[n_arg_only:], defaults))
    print("\ndefault values:")
    print(default_dict)

    # builtin names that may appear in annotations, resolved without eval()
    base_types = {'int': int, 'bool': bool, 'float': float, 'None': None}

    # get a dict of the types of each arg; use the annotations of the function
    # being processed rather than any notebook-level global
    type_dict = {}
    for ky, annotation in f_args.annotations.items():
        if ky == 'return':
            # the return annotation is not a model field
            continue
        if not isinstance(annotation, str):
            # already a real type object: nothing to parse
            type_dict[ky] = annotation
            continue

        members = []
        for type_str in annotation.split("|"):
            type_str = type_str.strip()
            if hasattr(pd._typing, type_str):
                members.append(getattr(pd._typing, type_str))
            elif type_str in base_types:
                members.append(base_types[type_str])
            else:
                raise NameError(f"could not find {type_str}")

        # collecting the members first keeps a leading 'None' from being lost;
        # Union of a single member collapses to that member
        type_dict[ky] = typing.Union[tuple(members)]

    # set any missing types or overrides
    overrides = missing_override_types.get(pd_func, {})
    for arg in argnames:
        if arg in overrides:
            type_dict[arg] = overrides[arg]

    print("\ntypes:")
    print(type_dict)

    # work out if we need an alias for any fields
    arg_aliases = {}
    for arg in argnames:
        try:
            validate_field_name([BaseModel], arg)
        except NameError:
            arg_aliases[arg] = arg+"_"
    print("\nfield aliases:")
    print(arg_aliases)

    # the final dictionary
    model_dict = {}
    for arg in argnames:
        # Ellipsis marks the field as required when pd_func has no default for it
        default_value = default_dict.get(arg, ...)
        types = type_dict.get(arg, typing.Any)
        if arg in arg_aliases:
            attname = arg_aliases[arg]
            field = Field(default=default_value,
                          alias=arg_aliases[arg])
        else:
            attname = arg
            field = Field(default=default_value)
        model_dict[attname] = (types, field)

    # return a pydantic model
    return create_model(pd_func.__name__+"Model", **model_dict, __base__=BaseDynamic)

In [22]:
pd_model = generate_model_dict(pd_func)


argument names:
['self', 'data', 'index', 'columns', 'dtype', 'copy']

default values:
{'data': None, 'index': None, 'columns': None, 'dtype': None, 'copy': None}

types:
{'index': typing.Optional[typing.Collection[typing.Any]], 'columns': typing.Optional[typing.Collection[typing.Any]], 'dtype': <enum 'DtypeEnum'>, 'copy': typing.Optional[bool], 'data': <class 'dict'>}

field aliases:
{'copy': 'copy_'}


In [23]:
m = pd_model.construct()
m.schema()

{'title': 'DataFrameModel',
 'type': 'object',
 'properties': {'data': {'title': 'Data', 'type': 'object'},
  'index': {'title': 'Index'},
  'columns': {'title': 'Columns'},
  'dtype': {'$ref': '#/definitions/DtypeEnum'},
  'copy_': {'title': 'Copy ', 'type': 'boolean'}},
 'definitions': {'DtypeEnum': {'title': 'DtypeEnum',
   'description': 'An enumeration.',
   'enum': ['int', 'float', 'str', 'complex', 'np.int64', 'np.float64']}}}

In [24]:
pd_model(data={'a':[1,2,3,4]}, dtype='np.float64')

DataFrameModel(data={'a': [1, 2, 3, 4]}, index=None, columns=None, dtype=<DtypeEnum.np.float64: 'np.float64'>, copy_=None)

In [25]:
pd_model(data={'a':[1,2,3,4]}, dtype='np.float64').json()

'{"data": {"a": [1, 2, 3, 4]}, "index": null, "columns": null, "dtype": "np.float64", "copy_": null}'

### semi-automatic ingestion of the pandantic model?

In [26]:
def dtype_ingestor(enum_dtype):
    """Convert a dtype enum member into the actual dtype object.

    Parameters
    ----------
    enum_dtype
        ``None`` or an enum member whose ``.value`` is a dtype name such as
        ``'int'`` or ``'np.float64'``.

    Returns
    -------
    The object the name evaluates to (e.g. ``int``), or ``None`` when no dtype
    was given, so the caller can leave dtype inference to pandas.
    """
    if enum_dtype is None:
        # dtype defaults to None on the model; there is nothing to convert
        return None
    dtype_str = enum_dtype.value  # e.g., 'int'
    # NOTE(security): eval() is only acceptable because the string is limited
    # to the enum's allow-list; do not call this with unvalidated input.
    return eval(dtype_str)  # e.g., int
    
# per-callable converters: each maps an argument name to a function applied to
# the model's field value before it is passed to the pandas callable
ingestor_funcs = {
  pd.DataFrame : {'dtype': dtype_ingestor}   
}


def auto_instantiator(pd_model, pd_funcname):
    """Call a pandas callable with arguments taken from a populated model.

    The signature of ``pd.<pd_funcname>`` is introspected (minus a leading
    ``self``). For every argument the matching attribute is read from
    ``pd_model``, using ``<name>_`` when pydantic rejects ``<name>`` as a field
    name. Non-``None`` values are run through any converter registered for
    that argument in ``ingestor_funcs``. Arguments without a default are
    passed positionally, the rest as keywords.

    Parameters
    ----------
    pd_model
        Model instance whose attributes hold the argument values.
    pd_funcname
        Name of the attribute of ``pd`` to call, e.g. ``'DataFrame'``.

    Returns
    -------
    Whatever ``pd.<pd_funcname>`` returns for those arguments.
    """
    pd_func = getattr(pd, pd_funcname)
    f_args = getfullargspec(pd_func)

    # work out how many args, kwargs there are
    argnames = f_args.args
    if len(argnames)>1 and argnames[0]=='self':
        argnames = argnames[1:]

    # getfullargspec reports `defaults` as None when nothing has a default
    n_defaults = len(f_args.defaults or ())
    n_arg_only = len(argnames) - n_defaults  # args without a default

    # callables without registered converters simply get no conversion
    ingestors = ingestor_funcs.get(pd_func, {})

    # build up the arguments for the pd function
    args = []
    kwargs = {}
    for iarg, arg in enumerate(argnames):

        # names pydantic refuses as field names live on the model as "<name>_"
        try:
            validate_field_name([BaseModel], arg)
        except NameError:
            model_argname = arg+"_"
        else:
            model_argname = arg

        argval = getattr(pd_model, model_argname)
        # None means "not set": pass it through untouched instead of handing
        # it to a converter that expects a real value
        if argval is not None and arg in ingestors:
            argval = ingestors[arg](argval)

        if iarg < n_arg_only:
            args.append(argval)
        else:
            kwargs[arg] = argval

    return pd_func(*args, **kwargs)
            
            

In [31]:
model_json = pd_model(data={'a':[1,2,3,4]}, dtype='np.float64').json()
model_json

'{"data": {"a": [1, 2, 3, 4]}, "index": null, "columns": null, "dtype": "np.float64", "copy_": null}'

In [34]:
df_model = pd_model.parse_raw(model_json)
df_model

DataFrameModel(data={'a': [1, 2, 3, 4]}, index=None, columns=None, dtype=<DtypeEnum.np.float64: 'np.float64'>, copy_=None)

In [28]:
df = auto_instantiator(df_model, 'DataFrame')

In [29]:
type(df)

pandas.core.frame.DataFrame

In [30]:
df

Unnamed: 0,a
0,1.0
1,2.0
2,3.0
3,4.0
