Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ This is particularly useful for data validation, migration testing, and ensuring
- **Advanced String Parsing:**
- Can detect and recursively parse JSON strings embedded within other file formats (e.g., a CSV field containing a JSON object).
- Identifies field patterns using a library of built-in regex matchers and supports custom matchers.
- **NEW**: Optional AI-powered pattern detection, using built-in pattern recognition (offline) or the Claude/Anthropic API (online), to automatically generate regex patterns for data validation.
- **Intelligent Date/Time Handling:**
- Parses and compares `date`, `datetime`, and `timestamp` fields, even if their string formats differ between sources.
- Supports timestamps with variable precision.
Expand All @@ -43,6 +44,22 @@ source:
parser_config:
# Set to true to enable recursive parsing of string fields that look like JSON.
json_in_string: true

# Optional: Enable AI-powered pattern detection
pattern_detection:
enabled: true
mode: offline # or "online" for Claude/Anthropic API

# Offline mode (built-in pattern recognition)
offline_model:
# Uses built-in patterns for common data types

# Online mode configuration (for Claude/Anthropic API)
# online_api:
# provider: claude # or "anthropic"
# api_key: "your-api-key-here"
# model: "claude-3-haiku-20240307" # optional

# Optional: Define a schema to use instead of generating one.
# schema:
# key: user_id
Expand All @@ -59,6 +76,40 @@ To run a comparison, use the `compare` command and provide the paths to the two
go run ./cmd/comparator compare ./config1.yaml ./config2.yaml
```

## AI-Powered Pattern Detection

This tool includes optional AI-powered pattern detection that automatically identifies regex patterns in your data fields during schema generation. The detected patterns are added to the generated schema as matchers, which are then used for data validation.

### Offline Mode
Uses built-in pattern recognition for common data types:
- Email addresses
- Phone numbers
- URLs
- IP addresses
- UUIDs
- Numeric values
- Date/time values

### Online Mode
Integrates with AI services (Claude/Anthropic) for more sophisticated pattern detection:
- Analyzes field samples using AI
- Generates custom regex patterns
- Supports complex data patterns beyond built-in types

### Configuration
Enable pattern detection in your config file:
```yaml
pattern_detection:
enabled: true
mode: offline # or "online"

# For online mode:
online_api:
provider: claude
api_key: "your-api-key"
model: "claude-3-haiku-20240307"
```

## Testing

This project is developed using a test-driven approach. A comprehensive suite of test cases, including source data and expected outputs, can be found in the `testdata` directory. These tests cover all major features and edge cases.
70 changes: 70 additions & 0 deletions demo/pattern_detection_demo.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package main

import (
"data-comparator/internal/pkg/config"
"data-comparator/internal/pkg/datareader"
"data-comparator/internal/pkg/schema"
"fmt"
"log"

"gopkg.in/yaml.v3"
)

// main runs the demos in order, printing each demo's banner immediately
// before the demo itself executes.
func main() {
	demos := []struct {
		banner string
		run    func()
	}{
		// Demo 1: Basic schema generation (without pattern detection)
		{"=== Demo 1: Basic Schema Generation ===", demoBasicSchema},
		{"\n=== Demo 2: Schema Generation with AI Pattern Detection (Offline Mode) ===", demoPatternDetection},
	}

	for _, d := range demos {
		fmt.Println(d.banner)
		d.run()
	}
}

// demoBasicSchema generates a schema for the sample CSV source without
// pattern detection and prints it as YAML.
//
// Every failure is fatal: log.Fatalf exits the process, so the deferred
// reader.Close does not run on those paths (acceptable for a short-lived demo).
func demoBasicSchema() {
	cfg := &config.Config{
		Source: config.Source{
			Type: "csv",
			Path: "testdata/testcase1_simple_csv/source1.csv",
		},
	}

	reader, err := datareader.New(cfg.Source)
	if err != nil {
		log.Fatalf("Failed to create data reader: %v", err)
	}
	defer reader.Close()

	// Named generated (not schema) so the imported schema package is not
	// shadowed for the remainder of the function.
	generated, err := schema.Generate(reader, nil)
	if err != nil {
		log.Fatalf("Failed to generate schema: %v", err)
	}

	// Report marshalling failures instead of silently printing an empty document.
	output, err := yaml.Marshal(generated)
	if err != nil {
		log.Fatalf("Failed to marshal schema: %v", err)
	}
	fmt.Printf("Basic Schema:\n%s\n", output)
}

// demoPatternDetection generates a schema for the sample CSV source with
// pattern detection enabled in "offline" mode and prints it as YAML.
//
// Every failure is fatal: log.Fatalf exits the process, so the deferred
// reader.Close does not run on those paths (acceptable for a short-lived demo).
func demoPatternDetection() {
	cfg := &config.Config{
		Source: config.Source{
			Type: "csv",
			Path: "testdata/testcase1_simple_csv/source1.csv",
		},
		PatternDetection: &config.PatternDetection{
			Enabled: true,
			Mode:    "offline",
		},
	}

	reader, err := datareader.New(cfg.Source)
	if err != nil {
		log.Fatalf("Failed to create data reader: %v", err)
	}
	defer reader.Close()

	// Named generated (not schema) so the imported schema package is not
	// shadowed for the remainder of the function.
	generated, err := schema.GenerateWithPatternDetection(reader, nil, cfg.PatternDetection)
	if err != nil {
		log.Fatalf("Failed to generate schema with pattern detection: %v", err)
	}

	// Report marshalling failures instead of silently printing an empty document.
	output, err := yaml.Marshal(generated)
	if err != nil {
		log.Fatalf("Failed to marshal schema: %v", err)
	}
	fmt.Printf("Schema with AI Pattern Detection:\n%s\n", output)
}
21 changes: 21 additions & 0 deletions example_config_with_pattern_detection.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
source:
type: csv
path: testdata/testcase1_simple_csv/source1.csv

# Optional: Enable AI-powered pattern detection
pattern_detection:
enabled: true
mode: offline # or "online" for Claude/Anthropic API

# Offline mode configuration (using built-in patterns)
offline_model:
# No additional configuration needed for built-in patterns
# Future: model_path: "path/to/custom/model.onnx"
# Future: model_type: "onnx"

# Online mode configuration (for Claude/Anthropic API)
# online_api:
# provider: claude # or "anthropic"
# api_key: "your-api-key-here"
# model: "claude-3-haiku-20240307" # optional, defaults to fastest model
# endpoint: "https://api.anthropic.com/v1/messages" # optional, uses default
29 changes: 28 additions & 1 deletion internal/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ import (

// Config defines the structure of the user-provided YAML configuration file.
type Config struct {
	// Source describes the data source to read. It is declared exactly once;
	// a struct may not repeat a field name.
	Source Source `yaml:"source"`
	// PatternDetection is the optional pattern_detection section. A nil value
	// is treated as disabled by DetectorFactory.CreateDetector.
	PatternDetection *PatternDetection `yaml:"pattern_detection,omitempty"`
}

// Source defines the data source configuration.
Expand All @@ -30,6 +31,32 @@ type Sampler struct {
SampleSize int `yaml:"sample_size"`
}

// PatternDetection holds configuration for AI-powered pattern detection.
// DetectorFactory.CreateDetector reads Enabled and Mode to decide which
// detector to build.
type PatternDetection struct {
// Enabled turns pattern detection on; when false, CreateDetector returns a no-op detector.
Enabled bool `yaml:"enabled"`
// Mode selects the detector. CreateDetector returns an error for any value other than the two listed.
Mode string `yaml:"mode"` // "offline" or "online"

// Offline mode configuration; passed to NewOfflineDetector when Mode is "offline".
OfflineModel *OfflineModelConfig `yaml:"offline_model,omitempty"`

// Online mode configuration (Claude/Anthropic); passed to NewOnlineDetector when Mode is "online".
OnlineAPI *OnlineAPIConfig `yaml:"online_api,omitempty"`
}

// OfflineModelConfig holds configuration for an embedded AI model used in offline mode.
// Both fields are optional in the YAML.
// NOTE(review): the example config lists model_path and model_type as future
// options — confirm whether the offline detector reads them yet.
type OfflineModelConfig struct {
ModelPath string `yaml:"model_path,omitempty"` // Path to local model file
ModelType string `yaml:"model_type,omitempty"` // Type of model (e.g., "onnx", "tflite")
}

// OnlineAPIConfig holds configuration for online AI services.
// Model and Endpoint are optional in the YAML; Provider and APIKey carry no omitempty tag.
// NOTE(review): APIKey is read from the config file as plain text — confirm
// how config files containing it are expected to be stored.
type OnlineAPIConfig struct {
Provider string `yaml:"provider"` // "claude" or "anthropic"
APIKey string `yaml:"api_key"` // Credential for the selected provider
Model string `yaml:"model,omitempty"` // Model version to use
Endpoint string `yaml:"endpoint,omitempty"` // Custom endpoint if needed
}

// Load reads a YAML configuration file from the given path and returns a Config struct.
func Load(filePath string) (*Config, error) {
data, err := os.ReadFile(filePath)
Expand Down
49 changes: 49 additions & 0 deletions internal/pkg/patterndetection/detector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package patterndetection

import (
"data-comparator/internal/pkg/config"
"fmt"
)

// Matcher is a flexible map to represent matcher configurations,
// e.g., {"isNumeric": true} or {"regex": "pattern"}.
// It is the element type of the slice returned by PatternDetector.DetectPatterns.
type Matcher map[string]interface{}

// PatternDetector interface defines methods for detecting regex patterns in field values.
type PatternDetector interface {
// DetectPatterns receives a field's name, its type name, and a slice of its
// values, and returns the matchers detected for that field or an error.
// NOTE(review): presumably values is a sample of the field's data rather
// than every value — confirm against the caller.
DetectPatterns(fieldName string, fieldType string, values []interface{}) ([]Matcher, error)
}

// DetectorFactory creates pattern detectors based on configuration.
// Construct one with NewDetectorFactory, then call CreateDetector.
type DetectorFactory struct {
// config may be nil; CreateDetector then returns a NoOpDetector.
config *config.PatternDetection
}

// NewDetectorFactory returns a factory bound to cfg. The pointer is stored
// as-is (no copy, no validation); cfg may be nil.
func NewDetectorFactory(cfg *config.PatternDetection) *DetectorFactory {
	factory := new(DetectorFactory)
	factory.config = cfg
	return factory
}

// CreateDetector builds the detector selected by the factory's configuration.
//
// A nil or disabled configuration yields a NoOpDetector. Otherwise Mode picks
// the implementation: "offline" delegates to NewOfflineDetector and "online"
// to NewOnlineDetector, each returning whatever that constructor returns.
// Any other Mode produces a nil detector and an error naming the mode.
func (f *DetectorFactory) CreateDetector() (PatternDetector, error) {
	cfg := f.config
	if cfg == nil {
		return &NoOpDetector{}, nil
	}
	if !cfg.Enabled {
		return &NoOpDetector{}, nil
	}

	if cfg.Mode == "offline" {
		return NewOfflineDetector(cfg.OfflineModel)
	}
	if cfg.Mode == "online" {
		return NewOnlineDetector(cfg.OnlineAPI)
	}
	return nil, fmt.Errorf("unsupported pattern detection mode: %s", cfg.Mode)
}

// NoOpDetector is a detector that performs no detection and never reports
// any matchers.
type NoOpDetector struct{}

// DetectPatterns ignores all of its arguments and returns an empty, non-nil
// matcher slice together with a nil error.
func (d *NoOpDetector) DetectPatterns(fieldName string, fieldType string, values []interface{}) ([]Matcher, error) {
	none := make([]Matcher, 0)
	return none, nil
}
Loading