# 📇 Schemas

## Avro

In [None]:
# Avro schema from a pandas DataFrame

import json

import pandas as pd
import pandavro as pda
from sklearn import datasets

# fetch the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# wrap the measurement matrix in a DataFrame, one column per feature name,
# then preview the first rows
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Use pandavro's schema inference for a quick outline of the Avro schema.
# The original call reached into the module-private `__schema_infer`; prefer the
# public `schema_infer` when the installed pandavro exposes it and fall back to
# the private name otherwise, so the cell runs against either release.
infer_schema = getattr(pda, "schema_infer", None) or getattr(pda, "__schema_infer")
schema = infer_schema(df=df, times_as_micros=True)

# print (rather than rely on the cell's repr) so the indented JSON keeps its
# line breaks and stays legible
print(json.dumps(schema, indent=4))

{
    "type": "record",
    "name": "Root",
    "fields": [
        {
            "name": "sepal length (cm)",
            "type": [
                "null",
                "double"
            ]
        },
        {
            "name": "sepal width (cm)",
            "type": [
                "null",
                "double"
            ]
        },
        {
            "name": "petal length (cm)",
            "type": [
                "null",
                "double"
            ]
        },
        {
            "name": "petal width (cm)",
            "type": [
                "null",
                "double"
            ]
        }
    ]
}


## Arrow and Pandas

In [None]:
# Arrow schema from a pandas DataFrame
import json

import pandas as pd
import pyarrow as pa
from sklearn import datasets

# fetch the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# wrap the measurement matrix in a DataFrame, one column per feature name,
# then preview the first rows
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# convert the pandas DataFrame into an Arrow table
table = pa.Table.from_pandas(df)
# bare last expression: display the Arrow schema derived from the DataFrame
table.schema

sepal length (cm): double
sepal width (cm): double
petal length (cm): double
petal width (cm): double
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 797

In [None]:
# display the schema-level metadata in full; in the output below it is a dict
# with a b'pandas' key whose value is a JSON document describing the DataFrame
table.schema.metadata

{b'pandas': b'{"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 150, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "sepal length (cm)", "field_name": "sepal length (cm)", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "sepal width (cm)", "field_name": "sepal width (cm)", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "petal length (cm)", "field_name": "petal length (cm)", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "petal width (cm)", "field_name": "petal width (cm)", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}], "creator": {"library": "pyarrow", "version": "0.17.1"}, "pandas_version": "1.1.1"}'}

## Arrow and Parquet

In [None]:
# Parquet Schema from Arrow and to Arrow
import json

import pandas as pd
import pyarrow as pa
from sklearn import datasets

# fetch the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# wrap the measurement matrix in a DataFrame, one column per feature name,
# then preview the first rows
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# write the DataFrame to a Parquet file; the path is relative, so the file is
# created in the kernel's current working directory
df.to_parquet("iris.parquet")

In [None]:
# `import pyarrow as pa` does not load the `pyarrow.parquet` submodule by itself,
# so `pa.parquet` only resolves if some earlier code happened to import it.
# Import it explicitly so this cell does not depend on that hidden state.
import pyarrow.parquet as pq

# open the file written above and display its file-level metadata
parquet_file = pq.ParquetFile("iris.parquet")
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x0000021E43CB0F98>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 4
  num_rows: 150
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 3170

In [None]:
# display the schema as stored in the Parquet file
parquet_file.schema

<pyarrow._parquet.ParquetSchema object at 0x0000021E43CE0548>
required group field_id=0 schema {
  optional double field_id=1 sepal length (cm);
  optional double field_id=2 sepal width (cm);
  optional double field_id=3 petal length (cm);
  optional double field_id=4 petal width (cm);
}

In [None]:
# convert the Parquet schema to an Arrow schema and display it
parquet_file.schema.to_arrow_schema()

sepal length (cm): double
  -- field metadata --
  PARQUET:field_id: '1'
sepal width (cm): double
  -- field metadata --
  PARQUET:field_id: '2'
petal length (cm): double
  -- field metadata --
  PARQUET:field_id: '3'
petal width (cm): double
  -- field metadata --
  PARQUET:field_id: '4'
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 797
ARROW:schema: '/////9AEAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABAwAQAAAAAAAKAAwAAA' + 1598

## JSON Schema

In [19]:
from jsonschema import validate

# JSON Schema describing an object with one numeric and one string property
schema = {
    "type": "object",
    "properties": {
        "a_number": {"type": "number"},
        "a_name": {"type": "string"},
    },
}

# conforming instance: "a_number" holds a number
schema_correct_sample = {
    "a_number": 123,
    "a_name": "Green Eggs and Ham",
}

# non-conforming instance: "a_number" holds a string instead of a number
schema_incorrect_sample = {
    "a_number": "123",
    "a_name": "Green Eggs and Ham",
}

In [20]:
# the conforming sample is expected to pass: the cell produces no output
validate(instance=schema_correct_sample, schema=schema)

In [21]:
# This sample is expected to fail validation ("a_number" is a string, not a
# number). The failure is the demonstration, so catch it and show the message
# instead of letting the exception abort a Restart & Run All of the notebook.
from jsonschema.exceptions import ValidationError

try:
    validate(instance=schema_incorrect_sample, schema=schema)
except ValidationError as err:
    print(f"ValidationError: {err.message}")
else:
    # guard the demonstration: if the sample ever validates, say so loudly
    raise AssertionError("expected schema_incorrect_sample to fail validation")

ValidationError: ignored

In [22]:
# JSON Schema describing an array of numbers with at most five items
schema = {
    "type": "array",
    "items": {"type": "number"},
    "maxItems": 5,
}

# conforming instance: exactly five numbers
schema_correct_sample = [1, 2, 3, 4, 5]
# non-conforming instance: six items exceeds "maxItems"
schema_incorrect_sample = [1, 2, 3, 4, 5, 6]

In [23]:
# the conforming sample is expected to pass: the cell produces no output
validate(instance=schema_correct_sample, schema=schema)

In [24]:
# This sample is expected to fail validation (six items, but "maxItems" is 5).
# The failure is the demonstration, so catch it and show the message instead of
# letting the exception abort a Restart & Run All of the notebook.
from jsonschema.exceptions import ValidationError

try:
    validate(instance=schema_incorrect_sample, schema=schema)
except ValidationError as err:
    print(f"ValidationError: {err.message}")
else:
    # guard the demonstration: if the sample ever validates, say so loudly
    raise AssertionError("expected schema_incorrect_sample to fail validation")

ValidationError: ignored

In [27]:
# JSON Schema describing an array of at least five items, at least one of
# which must be a number
schema = {
    "type": "array",
    "contains": {"type": "number"},
    "minItems": 5,
}

# conforming instance: five items, several of them numbers
schema_correct_sample = [1, 2, 3, 4, "cat"]
# non-conforming instance: five items, but every one is a string
schema_incorrect_sample = ["1", "2", "3", "4", "5"]

In [29]:
# the conforming sample is expected to pass: the cell produces no output
validate(instance=schema_correct_sample, schema=schema)

In [28]:
# This sample is expected to fail validation (five items, but none of them is a
# number, so "contains" is not satisfied). The failure is the demonstration, so
# catch it and show the message instead of letting the exception abort a
# Restart & Run All of the notebook.
from jsonschema.exceptions import ValidationError

try:
    validate(instance=schema_incorrect_sample, schema=schema)
except ValidationError as err:
    print(f"ValidationError: {err.message}")
else:
    # guard the demonstration: if the sample ever validates, say so loudly
    raise AssertionError("expected schema_incorrect_sample to fail validation")

ValidationError: ignored