Skip to content

colbyn/json-osi

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

6 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

JSON Obfuscated Schema Inference & Code Generator

CodeGen Supported Deserialization Support
JSON Schema N/A
Rust

This tool learns structure from messy, obfuscated JSON and generates:

  • JSON Schema (debug/inspection aid)
  • Strict Rust data models with serde deserializers

It is designed for data that has no explicit schema, especially when arrays are used as heterogeneous tuples with null padding.


Features

  • Order-independent schema inference using least-upper-bound (LUB) joins

  • Tuple vs list detection from evidence (not guesswork)

  • Strict Rust codegen:

    • Fixed-arity tuple structs
    • Required vs optional fields
    • Transparent newtypes with min/max checks
    • Pattern-checked strings & URI validators
  • JSON Schema-ish emitter for inspection/testing


Installation

Requires Rust (edition 2021+).

git clone https://github.com/colbyn/json-osi.git
cd json-osi
cargo build --release

The binary will be available in target/release/json-osi.


Usage

Generate a JSON Schema view

$ cargo run -- schema --input examples/samples.json --jq-expr='.[]'

Alternatively, if you have json-osi installed:

json-osi schema --input examples/samples.json --jq-expr='.[]'

Generate strict Rust models

$ cargo run -- rust --input examples/samples.json --jq-expr='.[]' --out models.rs

Alternatively, if you have json-osi installed:

json-osi rust --input examples/samples.json --jq-expr='.[]' --out models.rs
  • --input: one or more JSON files or glob patterns
  • --ndjson: treat input as newline-delimited JSON
  • --json-pointer: select a subnode (e.g. /data/items/0)
  • --jq-expr: pre-process with a jq filter

Example

Input samples:

[
    ["0ahUKEa1ZQ", "Acme Widgets", [null, [37.4219, -122.0840], null], "https://example.com", 4.3, true],
    ["0ahUKEa2ZQ", "Acme Widgets East", [null, [37.4200, -122.0830], null], null, 4.5, null]
]

Note: when the input is a single .json file whose top level is an array, use a jq filter (--jq-expr='.[]') to extract each element of that array as a separate data-model sample.

Generated Rust (models.rs):

// AUTOGENERATED: strict types + deserializers
use serde::{Deserialize, Deserializer};
use serde::de::{Error as DeError};
use once_cell::sync::Lazy;
use regex::Regex;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Null;

impl<'de> Deserialize<'de> for Null {
    fn deserialize<D>(de: D) -> Result<Self, D::Error> where D: Deserializer<'de> {
        struct V;
        impl<'de> serde::de::Visitor<'de> for V {
            type Value = Null;
            fn expecting(&self, f:&mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "null") }
            fn visit_unit<E>(self) -> Result<Null, E> where E: DeError { Ok(Null) }
            fn visit_none<E>(self) -> Result<Null, E> where E: DeError { Ok(Null) }
            fn visit_some<D>(self, _d: D) -> Result<Null, D::Error> where D: Deserializer<'de> {
                Err(DeError::invalid_type(serde::de::Unexpected::Other("non-null"), &"null"))
            }
        }
        de.deserialize_option(V)
    }
}
/// String newtype whose contents must match the pattern held in `RE_ROOT0`.
#[repr(transparent)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Root0(pub String);

impl std::ops::Deref for Root0 {
    type Target = String;

    fn deref(&self) -> &String {
        let Root0(inner) = self;
        inner
    }
}

/// Lazily initialised regex used to validate `Root0` values.
static RE_ROOT0: Lazy<Regex> = Lazy::new(|| Regex::new("^0ahUKEa.*").unwrap());

impl<'de> Deserialize<'de> for Root0 {
    /// Reads a string and wraps it only when it matches `RE_ROOT0`.
    fn deserialize<D>(de: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let raw = String::deserialize(de)?;
        if RE_ROOT0.is_match(&raw) {
            Ok(Self(raw))
        } else {
            Err(DeError::custom("Root0: string failed pattern"))
        }
    }
}
/// String newtype whose contents must match the pattern held in `RE_ROOT1`.
#[repr(transparent)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Root1(pub String);

impl std::ops::Deref for Root1 {
    type Target = String;

    fn deref(&self) -> &String {
        let Root1(inner) = self;
        inner
    }
}

/// Lazily initialised regex used to validate `Root1` values.
static RE_ROOT1: Lazy<Regex> = Lazy::new(|| Regex::new("^Acme Widgets.*").unwrap());

impl<'de> Deserialize<'de> for Root1 {
    /// Reads a string and wraps it only when it matches `RE_ROOT1`.
    fn deserialize<D>(de: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let raw = String::deserialize(de)?;
        if RE_ROOT1.is_match(&raw) {
            Ok(Self(raw))
        } else {
            Err(DeError::custom("Root1: string failed pattern"))
        }
    }
}
/// Finite `f64` newtype that must lie within the inclusive bounds checked by its deserializer.
#[repr(transparent)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Root210(pub f64);

impl std::ops::Deref for Root210 {
    type Target = f64;

    fn deref(&self) -> &f64 {
        let Root210(inner) = self;
        inner
    }
}

impl<'de> Deserialize<'de> for Root210 {
    /// Reads an `f64` and rejects it when it is non-finite or outside the allowed range.
    fn deserialize<D>(de: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let value = f64::deserialize(de)?;
        // Checked in order: finiteness, lower bound, upper bound. The first failure wins.
        let problem = if !value.is_finite() {
            Some("Root210: non-finite number")
        } else if value < 37.420000000000002 {
            Some("Root210: number below minimum")
        } else if value > 37.421900000000001 {
            Some("Root210: number above maximum")
        } else {
            None
        };
        match problem {
            Some(msg) => Err(DeError::custom(msg)),
            None => Ok(Root210(value)),
        }
    }
}
/// Finite `f64` newtype that must lie within the inclusive bounds checked by its deserializer.
#[repr(transparent)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Root211(pub f64);

impl std::ops::Deref for Root211 {
    type Target = f64;

    fn deref(&self) -> &f64 {
        let Root211(inner) = self;
        inner
    }
}

impl<'de> Deserialize<'de> for Root211 {
    /// Reads an `f64` and rejects it when it is non-finite or outside the allowed range.
    fn deserialize<D>(de: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let value = f64::deserialize(de)?;
        // Checked in order: finiteness, lower bound, upper bound. The first failure wins.
        let problem = if !value.is_finite() {
            Some("Root211: non-finite number")
        } else if value < -122.084000000000003 {
            Some("Root211: number below minimum")
        } else if value > -122.082999999999998 {
            Some("Root211: number above maximum")
        } else {
            None
        };
        match problem {
            Some(msg) => Err(DeError::custom(msg)),
            None => Ok(Root211(value)),
        }
    }
}
/// Two-element tuple struct deserialized via serde's derive.
///
/// Generator note: tuple len=2 (required first 2 slots)
#[derive(Debug, Deserialize)]
pub struct Root21(
    // Slot 0: bounded number validated by `Root210`.
    pub Root210,
    // Slot 1: bounded number validated by `Root211`.
    pub Root211,
);

/// Three-element tuple struct: a `Root21` pair with a `Null` slot on either side.
///
/// Generator note: tuple len=3 (required first 3 slots)
#[derive(Debug, Deserialize)]
pub struct Root2(
    // Slot 0: must deserialize as `Null`.
    pub Null,
    // Slot 1: the nested two-number tuple.
    pub Root21,
    // Slot 2: must deserialize as `Null`.
    pub Null,
);

/// String newtype whose contents must match the pattern held in `RE_ROOT3`.
#[repr(transparent)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Root3(pub String);

impl std::ops::Deref for Root3 {
    type Target = String;

    fn deref(&self) -> &String {
        let Root3(inner) = self;
        inner
    }
}

/// Lazily initialised regex used to validate `Root3` values.
static RE_ROOT3: Lazy<Regex> = Lazy::new(|| Regex::new("^https://example\\.com.*").unwrap());

impl<'de> Deserialize<'de> for Root3 {
    /// Reads a string and wraps it only when it matches `RE_ROOT3`.
    fn deserialize<D>(de: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let raw = String::deserialize(de)?;
        if RE_ROOT3.is_match(&raw) {
            Ok(Self(raw))
        } else {
            Err(DeError::custom("Root3: string failed pattern"))
        }
    }
}
/// Finite `f64` newtype that must lie within the inclusive bounds checked by its deserializer.
#[repr(transparent)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Root4(pub f64);

impl std::ops::Deref for Root4 {
    type Target = f64;

    fn deref(&self) -> &f64 {
        let Root4(inner) = self;
        inner
    }
}

impl<'de> Deserialize<'de> for Root4 {
    /// Reads an `f64` and rejects it when it is non-finite or outside the allowed range.
    fn deserialize<D>(de: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let value = f64::deserialize(de)?;
        // Checked in order: finiteness, lower bound, upper bound. The first failure wins.
        let problem = if !value.is_finite() {
            Some("Root4: non-finite number")
        } else if value < 4.3 {
            Some("Root4: number below minimum")
        } else if value > 4.5 {
            Some("Root4: number above maximum")
        } else {
            None
        };
        match problem {
            Some(msg) => Err(DeError::custom(msg)),
            None => Ok(Root4(value)),
        }
    }
}
/// Top-level six-slot tuple struct for one sample row, deserialized via serde's derive.
///
/// Generator note: tuple len=6 (required first 5 slots)
///
/// NOTE(review): the derive carries no `#[serde(default)]` on the last field, so
/// confirm whether a five-element array is actually accepted as the note implies.
#[derive(Debug, Deserialize)]
pub struct Root(
    // Slot 0: pattern-checked string.
    pub Root0,
    // Slot 1: pattern-checked string.
    pub Root1,
    // Slot 2: nested three-slot tuple.
    pub Root2,
    // Slot 3: pattern-checked string, wrapped in `Option`.
    pub Option<Root3>,
    // Slot 4: bounded number.
    pub Root4,
    // Slot 5: boolean, wrapped in `Option`.
    pub Option<bool>,
);

Status

  • ✅ Working inference engine
  • ✅ Rust + JSON Schema emitters
  • 🔧 Future: richer pattern detection, semantic type hints, optional list bound enforcement

The corresponding Rust data models are available as a Rust crate at github.com/colbyn/autogen-serp-types.


License

Copyright © 2025 Colbyn Wadman. All Rights Reserved.

About

JSON Obfuscated Schema Inference & Code Generator

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published