In [2]:
import functools
from typing import Annotated, TypeAlias

import torch
from pydantic import AfterValidator, BaseModel, ConfigDict, Field, TypeAdapter, validate_call

# Validating Complex Types Using Pydantic

# Introduction

## Complex Types

- `dict`
- `torch.Tensor`
- `pd.DataFrame`
- `np.ndarray`
- ...

## Complex Type Hints and Readability

### Pandas

```python
def get_sale_items(items: pd.DataFrame) -> pd.DataFrame: ...

def create_dataframe_of_scores(avg_scores: list, prec_k: list, recall_k: list, k) -> pd.DataFrame: ...

def graph_supporter_breakdown(df: pd.DataFrame, company: str): ...
```

### Torch

```python
def normalize_ensemble(ensemble: torch.Tensor) -> torch.Tensor: ...

def pooler(tensor: torch.Tensor) -> torch.Tensor: ...

def reducer(tensor: torch.Tensor) -> torch.Tensor: ...
```

# Readability Improvements

## Type Alias

```python
# python <=3.11
Items: TypeAlias = pd.DataFrame

# python 3.12+
type Items_2 = pd.DataFrame
```

```python
Items: TypeAlias = pd.DataFrame

def get_sale_items(items: Items) -> Items: ...

ModelEnsemble: TypeAlias = torch.Tensor
Model: TypeAlias = torch.Tensor

def normalize_ensemble(ensemble: ModelEnsemble) -> Model: ...
```

## Add Comments To Alias

In [2]:
# All three aliases below resolve to plain torch.Tensor. The string literal
# following each alias documents the intended shape for readers; nothing in
# this cell enforces those shapes at runtime.
ModelEnsemble: TypeAlias = torch.Tensor
"""Has shape of (x, n, m), where: x = number of models, n = number of model elements, m = embedding count"""
Model: TypeAlias = torch.Tensor
"""Has shape of (n, m), where: n = number of model elements, m = embedding count"""
ElementScore: TypeAlias = torch.Tensor
"""Has shape of (n), where: n = number of model elements"""


# Signature-only stubs (bodies are `...`) showing how the aliases read when
# used as parameter and return type hints.
def normalize_ensemble(ensemble: ModelEnsemble) -> Model: ...
def pooler(tensor: Model) -> Model: ...
def reducer(tensor: Model) -> ElementScore: ...

## `Annotated` Type

```python
Quantities: TypeAlias = Annotated[pd.Series, {"shape": (2,), "dtype": "int64", "name": "quantity"}]
```

# Putting Annotations to Work

## Pydantic Compatible Metadata

### Field

In [16]:
# An int carrying pydantic `Field(gt=0)` metadata: when validated by pydantic
# the value must be strictly greater than zero.
PositiveInt: TypeAlias = Annotated[int, Field(gt=0)]

### AfterValidator

In [17]:
def even_str(data: str) -> str:
    """Check that a string has an even number of characters.

    Args:
        data: The string to check.

    Returns:
        ``data`` unchanged when its length is even.

    Raises:
        ValueError: If the length of ``data`` is odd.
    """
    length_is_odd = len(data) % 2 != 0
    if length_is_odd:
        raise ValueError(f"String length must be even, length: {len(data)}.")
    return data


# A str that pydantic passes through `even_str` after its own str validation.
EvenStr: TypeAlias = Annotated[str, AfterValidator(even_str)]
# A 2-tuple combining both annotated aliases: (PositiveInt, EvenStr).
ValidatedTuple: TypeAlias = tuple[PositiveInt, EvenStr]

## Pydantic Validation Objects

In [18]:
# Violates both constraints: -1 is not > 0 and "odd" has an odd length (3).
bad_data = (-1, "odd")
# Satisfies both constraints: 1 > 0 and "even" has an even length (4).
good_data = (1, "even")

### `BaseModel`

In [6]:
class ModelData(BaseModel):
    """Pydantic model whose fields are typed with the annotated aliases."""

    count: PositiveInt  # carries Field(gt=0)
    name: EvenStr  # carries AfterValidator(even_str)

# Both values of good_data satisfy their field constraints.
model_data = ModelData(count=good_data[0], name=good_data[1])

In [None]:
# Intentional failure demo: both values of bad_data violate their field
# constraints, so pydantic is expected to raise a ValidationError here.
bad_model_data = ModelData(count=bad_data[0], name=bad_data[1])

### `validate_call`

In [11]:
@validate_call(validate_return=True)
def process_data(data: ValidatedTuple, duplicates: bool = True) -> bool:
    """Toy function used to demonstrate ``validate_call`` with return validation.

    Args:
        data: Tuple annotated as ``ValidatedTuple``.
        duplicates: When true, the function returns ``False``.

    Returns:
        ``False`` when ``duplicates`` is true. When it is false the body falls
        through and implicitly returns ``None``, which does not satisfy the
        ``bool`` return annotation checked by ``validate_return=True``.
    """
    if duplicates:
        return False

# duplicates defaults to True, so this call returns False and passes validation.
is_data_processed = process_data(data=good_data)

In [None]:
# Intentional failure demo: with duplicates=False the function returns None,
# so return validation against `bool` is expected to raise a ValidationError.
bad_return = process_data(data=good_data, duplicates=False)

### `TypeAdapter`

In [14]:
# TypeAdapter validates a bare annotated type without a BaseModel or a
# decorated function.
tuple_adapter = TypeAdapter(ValidatedTuple)

# good_data satisfies both element constraints.
validated_good_data = tuple_adapter.validate_python(good_data)

In [None]:
# Intentional failure demo: both elements of bad_data violate their
# constraints, so a ValidationError is expected here.
adapter_bad_data = tuple_adapter.validate_python(bad_data)

# Using Pydantic on Data Science Types

In [None]:
def tensor_shape(data: torch.Tensor, shape: tuple[int, ...]) -> torch.Tensor:
    """Check that a tensor has exactly the required shape.

    Args:
        data: The tensor to check.
        shape: The exact shape ``data`` must have, one int per dimension.

    Returns:
        ``data`` unchanged when its shape equals ``shape``.

    Raises:
        ValueError: If the shape of ``data`` differs from ``shape``.
    """
    # Normalise both sides to plain tuples so that a list (or torch.Size)
    # passed as ``shape`` compares by value rather than by container type.
    if tuple(data.shape) != tuple(shape):
        raise ValueError(f"Tensor must be of size {shape}, current size: {data.shape}.")
    return data


# Bind the generic shape checker to the fixed shape (3, 2, 4) so it can be
# called with the tensor as its only argument by AfterValidator.
ensemble_shape = functools.partial(tensor_shape, shape=(3, 2, 4,))
EnsembleModel: TypeAlias = Annotated[torch.Tensor, AfterValidator(ensemble_shape)]


# arbitrary_types_allowed=True is passed because the annotated type is
# torch.Tensor, which is not a type pydantic knows how to validate on its own.
@validate_call(config=ConfigDict(arbitrary_types_allowed=True), validate_return=True)
def test() -> EnsembleModel:
    """Return a (7, 6) tensor, which does not match the (3, 2, 4) shape bound into ``EnsembleModel``."""
    return torch.zeros(7, 6)

## Hassles

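- `ConfigDict(arbitrary_types_allowed=True)` had to be passed to `validate_call` so that `torch.Tensor` is accepted.
- `functools.partial` was needed to turn the generic validator function into one specific to a single shape.
- The resulting `EnsembleModel` `TypeAlias` isn't very readable.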

## `ds_validator`

In [4]:
from ds_validator import ds_type_adapter
from ds_validator.torch import tensor_dtype, tensor_shape, tensor_device

# A torch.Tensor annotated with three ds_validator checks: shape, dtype, device.
# NOTE(review): the shape spec presumably means 3 = exact size, None = any size,
# range(1, 5) = size within that range, and the repeated "x" = those two
# dimensions must be equal — confirm against the ds_validator documentation.
Weights: TypeAlias = Annotated[
    torch.Tensor,
    tensor_shape(shape=(3, None, range(1, 5), "x", "x")),
    tensor_dtype(data_type=torch.int64),
    tensor_device(device=torch.device("cpu"), match_index=False),
]
weight_adapter = ds_type_adapter(Weights)

# int64 tensor of shape (3, 2, 4, 5, 5), used here as the passing example.
good_weights = weight_adapter.validate_python(torch.zeros(3, 2, 4, 5, 5, dtype=torch.int64))

In [None]:
# Intentional failure demo: shape (2, 1, 6, 4, 3) with torch's default dtype
# (no dtype=torch.int64 is passed), used here as the failing example.
bad_weights = weight_adapter.validate_python(torch.zeros(2, 1, 6, 4, 3))

## Create a Custom Validator

In [8]:
from ds_validator import create_after_validator, create_checker

def tensor_dimensions_error_finder(data: torch.Tensor, *, dimensions: int) -> list[str]:
    """Report whether a tensor has the required number of dimensions.

    Args:
        data: The tensor to inspect.
        dimensions: The required number of dimensions (keyword-only).

    Returns:
        An empty list when ``data.dim()`` equals ``dimensions``; otherwise a
        one-element list holding the error message.
    """
    # Guard clause: a matching dimension count means there is nothing to report.
    if data.dim() == dimensions:
        return []
    mismatch = f"Tensor doesn't match required dimensions {dimensions}, data dimensions: {data.dim()}."
    return [mismatch]

# Wrap the error finder in a named ds_validator checker, then turn that checker
# into a validator factory usable inside Annotated[...].
# NOTE(review): the first argument looks like the error name reported on
# failure — confirm against the ds_validator documentation.
tensor_dimensions_checker = create_checker("torch_tensor_dimensions_error", tensor_dimensions_error_finder)
tensor_dimensions = create_after_validator(tensor_dimensions_checker)

# CustomWeights is a type alias, not a TypeAdapter instance, so it is annotated
# as TypeAlias like every other alias in this notebook.
CustomWeights: TypeAlias = Annotated[torch.Tensor, tensor_dimensions(dimensions=3)]
custom_weights_adapter = ds_type_adapter(CustomWeights)

# A tensor of shape (1, 1, 1) has 3 dimensions, matching dimensions=3.
good_custom_weights = custom_weights_adapter.validate_python(torch.zeros(1, 1, 1))

In [None]:
# Intentional failure demo: a (1, 2) tensor has 2 dimensions, not the required 3.
bad_custom_weights = custom_weights_adapter.validate_python(torch.zeros(1,2))