diff --git a/README.md b/README.md index 359168e..a0a2b5f 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ This is particularly useful for data validation, migration testing, and ensuring - **Advanced String Parsing:** - Can detect and recursively parse JSON strings embedded within other file formats (e.g., a CSV field containing a JSON object). - Identifies field patterns using a library of built-in regex matchers and supports custom matchers. + - **NEW**: Optional AI-powered pattern detection using built-in pattern heuristics (offline) or Claude/Anthropic APIs (online) to automatically generate regex patterns for data validation. - **Intelligent Date/Time Handling:** - Parses and compares `date`, `datetime`, and `timestamp` fields, even if their string formats differ between sources. - Supports timestamps with variable precision. @@ -43,6 +44,22 @@ source: parser_config: # Set to true to enable recursive parsing of string fields that look like JSON. json_in_string: true + +# Optional: Enable AI-powered pattern detection +pattern_detection: + enabled: true + mode: offline # or "online" for Claude/Anthropic API + + # Offline mode (built-in pattern recognition) + offline_model: + # Uses built-in patterns for common data types + + # Online mode configuration (for Claude/Anthropic API) + # online_api: + # provider: claude # or "anthropic" + # api_key: "your-api-key-here" + # model: "claude-3-haiku-20240307" # optional + # Optional: Define a schema to use instead of generating one. # schema: # key: user_id @@ -59,6 +76,40 @@ To run a comparison, use the `compare` command and provide the paths to the two go run ./cmd/comparator compare ./config1.yaml ./config2.yaml ``` +## AI-Powered Pattern Detection + +This tool includes optional AI-powered pattern detection to automatically identify regex patterns in your data fields and enhance schema generation. This feature helps complete the schema with appropriate matchers for data validation. 
+
+### Offline Mode
+Uses built-in pattern recognition for common data types:
+- Email addresses
+- Phone numbers
+- URLs
+- IP addresses
+- UUIDs
+- Numeric values
+- Date/time values
+
+### Online Mode
+Integrates with AI services (Claude/Anthropic) for more sophisticated pattern detection:
+- Analyzes field samples using AI
+- Generates custom regex patterns
+- Supports complex data patterns beyond built-in types
+
+### Configuration
+Enable pattern detection in your config file:
+```yaml
+pattern_detection:
+  enabled: true
+  mode: offline # or "online"
+
+  # For online mode:
+  online_api:
+    provider: claude
+    api_key: "your-api-key"
+    model: "claude-3-haiku-20240307"
+```
+
 ## Testing
 
 This project is developed using a test-driven approach. A comprehensive suite of test cases, including source data and expected outputs, can be found in the `testdata` directory. These tests cover all major features and edge cases.
diff --git a/demo/pattern_detection_demo.go b/demo/pattern_detection_demo.go
new file mode 100644
index 0000000..07f54da
--- /dev/null
+++ b/demo/pattern_detection_demo.go
@@ -0,0 +1,85 @@
+package main
+
+import (
+	"data-comparator/internal/pkg/config"
+	"data-comparator/internal/pkg/datareader"
+	"data-comparator/internal/pkg/schema"
+	"fmt"
+	"log"
+
+	"gopkg.in/yaml.v3"
+)
+
+// main runs both demos against the bundled CSV fixture.
+func main() {
+	// Demo 1: Basic schema generation (without pattern detection)
+	fmt.Println("=== Demo 1: Basic Schema Generation ===")
+	demoBasicSchema()
+
+	fmt.Println("\n=== Demo 2: Schema Generation with AI Pattern Detection (Offline Mode) ===")
+	demoPatternDetection()
+}
+
+// demoBasicSchema generates a schema from the sample CSV with pattern
+// detection turned off and prints it as YAML.
+func demoBasicSchema() {
+	cfg := &config.Config{
+		Source: config.Source{
+			Type: "csv",
+			Path: "testdata/testcase1_simple_csv/source1.csv",
+		},
+	}
+
+	reader, err := datareader.New(cfg.Source)
+	if err != nil {
+		log.Fatalf("Failed to create data reader: %v", err)
+	}
+
+	// Close explicitly instead of deferring: log.Fatalf exits the process
+	// without running deferred calls.
+	generated, err := schema.Generate(reader, nil)
+	reader.Close()
+	if err != nil {
+		log.Fatalf("Failed to generate schema: %v", err)
+	}
+
+	output, err := yaml.Marshal(generated)
+	if err != nil {
+		log.Fatalf("Failed to marshal schema: %v", err)
+	}
+	fmt.Printf("Basic Schema:\n%s\n", output)
+}
+
+// demoPatternDetection generates a schema from the same CSV with offline
+// pattern detection enabled and prints it as YAML.
+func demoPatternDetection() {
+	cfg := &config.Config{
+		Source: config.Source{
+			Type: "csv",
+			Path: "testdata/testcase1_simple_csv/source1.csv",
+		},
+		PatternDetection: &config.PatternDetection{
+			Enabled: true,
+			Mode:    "offline",
+		},
+	}
+
+	reader, err := datareader.New(cfg.Source)
+	if err != nil {
+		log.Fatalf("Failed to create data reader: %v", err)
+	}
+
+	// Close explicitly instead of deferring: log.Fatalf exits the process
+	// without running deferred calls.
+	generated, err := schema.GenerateWithPatternDetection(reader, nil, cfg.PatternDetection)
+	reader.Close()
+	if err != nil {
+		log.Fatalf("Failed to generate schema with pattern detection: %v", err)
+	}
+
+	output, err := yaml.Marshal(generated)
+	if err != nil {
+		log.Fatalf("Failed to marshal schema: %v", err)
+	}
+	fmt.Printf("Schema with AI Pattern Detection:\n%s\n", output)
+}
\ No newline at end of file
diff --git a/example_config_with_pattern_detection.yaml b/example_config_with_pattern_detection.yaml
new file mode 100644
index 0000000..a5d5994
--- /dev/null
+++ b/example_config_with_pattern_detection.yaml
@@ -0,0 +1,21 @@
+source:
+  type: csv
+  path: testdata/testcase1_simple_csv/source1.csv
+
+# Optional: Enable AI-powered pattern detection
+pattern_detection:
+  enabled: true
+  mode: offline # or "online" for Claude/Anthropic API
+
+  # Offline mode configuration (using built-in patterns)
+  offline_model:
+    # No additional configuration needed for built-in patterns
+    # Future: model_path: "path/to/custom/model.onnx"
+    # Future: model_type: "onnx"
+
+  # Online mode configuration (for Claude/Anthropic API)
+  # online_api:
+  #   provider: claude # or "anthropic"
+  #   api_key: "your-api-key-here"
+  #   model: "claude-3-haiku-20240307" # optional, defaults to fastest model
+  #   endpoint: "https://api.anthropic.com/v1/messages" # optional, uses default
\ No newline at end of file
diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go
index 9317fa5..2fc0c33 100644
--- a/internal/pkg/config/config.go
+++ b/internal/pkg/config/config.go
@@ -9,7 +9,8 @@ 
import (
 
 // Config defines the structure of the user-provided YAML configuration file.
 type Config struct {
-	Source Source `yaml:"source"`
+	Source           Source            `yaml:"source"`
+	PatternDetection *PatternDetection `yaml:"pattern_detection,omitempty"`
 }
 
 // Source defines the data source configuration.
@@ -30,6 +31,32 @@ type Sampler struct {
 	SampleSize int `yaml:"sample_size"`
 }
 
+// PatternDetection configures optional pattern detection; a nil value or Enabled=false selects the no-op detector.
+type PatternDetection struct {
+	Enabled bool   `yaml:"enabled"`
+	Mode    string `yaml:"mode"` // "offline" or "online"; any other value is rejected when the detector is created
+
+	// Offline mode configuration (optional; may be nil)
+	OfflineModel *OfflineModelConfig `yaml:"offline_model,omitempty"`
+
+	// Online mode configuration (Claude/Anthropic); required when Mode is "online"
+	OnlineAPI *OnlineAPIConfig `yaml:"online_api,omitempty"`
+}
+
+// OfflineModelConfig describes a local model file. NOTE(review): the offline detector in this change stores it but never reads these fields.
+type OfflineModelConfig struct {
+	ModelPath string `yaml:"model_path,omitempty"` // Path to local model file
+	ModelType string `yaml:"model_type,omitempty"` // Type of model (e.g., "onnx", "tflite")
+}
+
+// OnlineAPIConfig holds the connection settings for the online AI service.
+type OnlineAPIConfig struct {
+	Provider string `yaml:"provider"`           // "claude" or "anthropic"; empty defaults to "claude"
+	APIKey   string `yaml:"api_key"`            // Required; sent as the x-api-key request header
+	Model    string `yaml:"model,omitempty"`    // Model to use; empty defaults to "claude-3-haiku-20240307"
+	Endpoint string `yaml:"endpoint,omitempty"` // Custom endpoint; empty defaults to "https://api.anthropic.com/v1/messages"
+}
+
 // Load reads a YAML configuration file from the given path and returns a Config struct.
 func Load(filePath string) (*Config, error) {
 	data, err := os.ReadFile(filePath)
diff --git a/internal/pkg/patterndetection/detector.go b/internal/pkg/patterndetection/detector.go
new file mode 100644
index 0000000..6ebf4e5
--- /dev/null
+++ b/internal/pkg/patterndetection/detector.go
@@ -0,0 +1,49 @@
+package patterndetection
+
+import (
+	"data-comparator/internal/pkg/config"
+	"fmt"
+)
+
+// Matcher is a flexible map to represent matcher configurations,
+// e.g., {"isNumeric": true} or {"regex": "pattern"}.
+type Matcher map[string]interface{}
+
+// PatternDetector detects matchers (for example regex patterns) for the
+// sampled values of a single field.
+type PatternDetector interface {
+	// DetectPatterns returns the matchers that describe values, the sampled
+	// values of the field named fieldName whose inferred type is fieldType.
+	DetectPatterns(fieldName string, fieldType string, values []interface{}) ([]Matcher, error)
+}
+
+// DetectorFactory creates pattern detectors based on configuration.
+type DetectorFactory struct {
+	config *config.PatternDetection
+}
+
+// NewDetectorFactory creates a new detector factory with the given
+// configuration. cfg may be nil, in which case CreateDetector returns a
+// NoOpDetector.
+func NewDetectorFactory(cfg *config.PatternDetection) *DetectorFactory {
+	return &DetectorFactory{config: cfg}
+}
+
+// CreateDetector returns the detector selected by the factory's configuration.
+//
+// A nil or disabled configuration yields a NoOpDetector. Mode "offline" yields
+// an OfflineDetector and mode "online" an OnlineDetector; any other mode is an
+// error. On error the returned PatternDetector is a true nil interface value.
+func (f *DetectorFactory) CreateDetector() (PatternDetector, error) {
+	if f.config == nil || !f.config.Enabled {
+		return &NoOpDetector{}, nil
+	}
+
+	switch f.config.Mode {
+	case "offline":
+		detector, err := NewOfflineDetector(f.config.OfflineModel)
+		if err != nil {
+			// Return a literal nil: forwarding the nil *OfflineDetector would
+			// produce a non-nil interface wrapping a nil pointer.
+			return nil, err
+		}
+		return detector, nil
+	case "online":
+		detector, err := NewOnlineDetector(f.config.OnlineAPI)
+		if err != nil {
+			// Same reason as above: never wrap a nil *OnlineDetector.
+			return nil, err
+		}
+		return detector, nil
+	default:
+		return nil, fmt.Errorf("unsupported pattern detection mode: %s", f.config.Mode)
+	}
+}
+
+// NoOpDetector is a no-operation detector that returns empty matchers.
+type NoOpDetector struct{}
+
+// DetectPatterns implements PatternDetector interface with no-op behavior.
+func (d *NoOpDetector) DetectPatterns(fieldName string, fieldType string, values []interface{}) ([]Matcher, error) {
+	return []Matcher{}, nil
+}
\ No newline at end of file
diff --git a/internal/pkg/patterndetection/detector_test.go b/internal/pkg/patterndetection/detector_test.go
new file mode 100644
index 0000000..117da06
--- /dev/null
+++ b/internal/pkg/patterndetection/detector_test.go
@@ -0,0 +1,164 @@
+package patterndetection
+
+import (
+	"data-comparator/internal/pkg/config"
+	"fmt"
+	"testing"
+)
+
+// wantEmailRegex is the pattern OfflineDetector is expected to emit for fields
+// whose sampled values are predominantly email addresses.
+const wantEmailRegex = `^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$`
+
+// TestOfflineDetector_DetectEmailPattern checks that the email regex matcher is
+// emitted only when enough of the sampled values are email addresses.
+func TestOfflineDetector_DetectEmailPattern(t *testing.T) {
+	detector := &OfflineDetector{}
+
+	testCases := []struct {
+		name     string
+		values   []interface{}
+		expected bool
+	}{
+		{
+			name:     "valid emails",
+			values:   []interface{}{"alice@example.com", "bob@test.org", "charlie@domain.net"},
+			expected: true,
+		},
+		{
+			name:     "mixed valid and invalid",
+			values:   []interface{}{"alice@example.com", "not-an-email", "bob@test.org"},
+			expected: false,
+		},
+		{
+			name:     "no emails",
+			values:   []interface{}{"john doe", "123456", "not an email"},
+			expected: false,
+		},
+		{
+			name:     "no values",
+			values:   nil,
+			expected: false,
+		},
+		{
+			name:     "only nil values",
+			values:   []interface{}{nil, nil},
+			expected: false,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			matchers, err := detector.DetectPatterns("email", "string", tc.values)
+			if err != nil {
+				t.Fatalf("DetectPatterns failed: %v", err)
+			}
+
+			hasEmailRegex := false
+			for _, matcher := range matchers {
+				if matcher["regex"] == wantEmailRegex {
+					hasEmailRegex = true
+					break
+				}
+			}
+
+			if hasEmailRegex != tc.expected {
+				t.Errorf("Expected email regex detection: %v, got: %v", tc.expected, hasEmailRegex)
+			}
+		})
+	}
+}
+
+// TestDetectorFactory_CreateDetector checks which detector the factory builds
+// for each configuration, including the configurations it must reject.
+func TestDetectorFactory_CreateDetector(t *testing.T) {
+	testCases := []struct {
+		name     string
+		config   *config.PatternDetection
+		wantType string
+		wantErr  bool
+	}{
+		{
+			name:     "nil config",
+			config:   nil,
+			wantType: "*patterndetection.NoOpDetector",
+			wantErr:  false,
+		},
+		{
+			name:     "disabled",
+			config:   &config.PatternDetection{Enabled: false},
+			wantType: "*patterndetection.NoOpDetector",
+			wantErr:  false,
+		},
+		{
+			name:     "offline mode",
+			config:   &config.PatternDetection{Enabled: true, Mode: "offline"},
+			wantType: "*patterndetection.OfflineDetector",
+			wantErr:  false,
+		},
+		{
+			name: "online mode with config",
+			config: &config.PatternDetection{
+				Enabled:   true,
+				Mode:      "online",
+				OnlineAPI: &config.OnlineAPIConfig{APIKey: "test-key"},
+			},
+			wantType: "*patterndetection.OnlineDetector",
+			wantErr:  false,
+		},
+		{
+			name:     "online mode without API key",
+			config:   &config.PatternDetection{Enabled: true, Mode: "online"},
+			wantType: "",
+			wantErr:  true,
+		},
+		{
+			name:     "invalid mode",
+			config:   &config.PatternDetection{Enabled: true, Mode: "invalid"},
+			wantType: "",
+			wantErr:  true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			factory := NewDetectorFactory(tc.config)
+			detector, err := factory.CreateDetector()
+
+			if tc.wantErr {
+				if err == nil {
+					t.Error("Expected error but got none")
+				}
+				return
+			}
+
+			if err != nil {
+				t.Fatalf("Unexpected error: %v", err)
+			}
+
+			if detector == nil {
+				t.Fatal("Detector is nil")
+			}
+
+			// %T renders the dynamic type, e.g. "*patterndetection.NoOpDetector".
+			if detectorType := fmt.Sprintf("%T", detector); detectorType != tc.wantType {
+				t.Errorf("Expected detector type %s, got %s", tc.wantType, detectorType)
+			}
+		})
+	}
+}
+
+// TestNoOpDetector_DetectPatterns checks that the no-op detector reports no
+// matchers and no error.
+func TestNoOpDetector_DetectPatterns(t *testing.T) {
+	detector := &NoOpDetector{}
+
+	matchers, err := detector.DetectPatterns("test_field", "string", []interface{}{"value1", "value2"})
+	if err != nil {
+		t.Fatalf("DetectPatterns failed: %v", err)
+	}
+
+	if len(matchers) != 0 {
+		t.Errorf("Expected empty matchers, got %d", len(matchers))
+	}
+}
\ No newline at end of file
diff --git a/internal/pkg/patterndetection/offline.go b/internal/pkg/patterndetection/offline.go
new file mode 100644
index 
0000000..7804669
--- /dev/null
+++ b/internal/pkg/patterndetection/offline.go
@@ -0,0 +1,156 @@
+package patterndetection
+
+import (
+	"data-comparator/internal/pkg/config"
+	"fmt"
+	"regexp"
+)
+
+// detectionThreshold is the fraction of a field's sampled values that must be
+// exceeded before a built-in pattern is attributed to the field.
+const detectionThreshold = 0.8
+
+// minPhoneLength is the shortest value treated as a possible phone number;
+// shorter values are assumed to be plain numbers such as ages or small IDs.
+const minPhoneLength = 7
+
+// Regex strings emitted in "regex" matchers. Each is also the expression used
+// for detection, so an emitted matcher always accepts the values that
+// triggered it. ipPattern checks the dotted-quad shape only (octets are not
+// range-checked); uuidPattern accepts both lower- and upper-case hex digits.
+const (
+	emailPattern = `^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$`
+	phonePattern = `^\+?[1-9]\d{1,14}$|^\(\d{3}\)\s\d{3}-\d{4}$|^\d{3}-\d{3}-\d{4}$|^\d{10,15}$`
+	urlPattern   = `^https?://[^\s/$.?#].[^\s]*$`
+	ipPattern    = `^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$`
+	uuidPattern  = `^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$`
+)
+
+// The patterns are compiled once at package scope rather than on every call.
+var (
+	emailRegex = regexp.MustCompile(emailPattern)
+	phoneRegex = regexp.MustCompile(phonePattern)
+	urlRegex   = regexp.MustCompile(urlPattern)
+	ipRegex    = regexp.MustCompile(ipPattern)
+	uuidRegex  = regexp.MustCompile(uuidPattern)
+)
+
+// OfflineDetector uses built-in pattern recognition for detecting regex patterns.
+type OfflineDetector struct {
+	config *config.OfflineModelConfig
+}
+
+// NewOfflineDetector creates a new offline pattern detector. cfg may be nil;
+// the built-in heuristics do not read it.
+func NewOfflineDetector(cfg *config.OfflineModelConfig) (*OfflineDetector, error) {
+	return &OfflineDetector{config: cfg}, nil
+}
+
+// DetectPatterns analyzes field values and returns at most one matcher.
+//
+// Nil values are ignored and the rest are compared in their fmt "%v" form. The
+// built-in patterns are tried in priority order (email, phone, URL, IP, UUID)
+// and the first that fits yields a {"regex": pattern} matcher. If none fits,
+// fieldType "numeric" yields {"isNumeric": true} and "datetime" yields
+// {"isDateTime": true}; otherwise no matcher is returned. The error is always
+// nil.
+func (d *OfflineDetector) DetectPatterns(fieldName string, fieldType string, values []interface{}) ([]Matcher, error) {
+	// Convert values to strings for pattern analysis
+	stringValues := make([]string, 0, len(values))
+	for _, val := range values {
+		if val != nil {
+			stringValues = append(stringValues, fmt.Sprintf("%v", val))
+		}
+	}
+	if len(stringValues) == 0 {
+		return []Matcher{}, nil
+	}
+
+	// First match wins, so this order is the detection priority.
+	detectors := []func([]string) string{
+		d.detectEmailPattern,
+		d.detectPhonePattern,
+		d.detectURLPattern,
+		d.detectIPPattern,
+		d.detectUUIDPattern,
+	}
+	for _, detect := range detectors {
+		if pattern := detect(stringValues); pattern != "" {
+			return []Matcher{{"regex": pattern}}, nil
+		}
+	}
+
+	// No regex applies; fall back to a matcher derived from the inferred type.
+	switch fieldType {
+	case "numeric":
+		return []Matcher{{"isNumeric": true}}, nil
+	case "datetime":
+		return []Matcher{{"isDateTime": true}}, nil
+	default:
+		return nil, nil
+	}
+}
+
+// matchesMost reports whether strictly more than detectionThreshold of values
+// satisfy match. It reports false for an empty slice.
+func matchesMost(values []string, match func(string) bool) bool {
+	if len(values) == 0 {
+		return false
+	}
+	matched := 0
+	for _, v := range values {
+		if match(v) {
+			matched++
+		}
+	}
+	return float64(matched)/float64(len(values)) > detectionThreshold
+}
+
+// detectEmailPattern returns emailPattern if most values are email addresses,
+// or "" otherwise.
+func (d *OfflineDetector) detectEmailPattern(values []string) string {
+	if matchesMost(values, emailRegex.MatchString) {
+		return emailPattern
+	}
+	return ""
+}
+
+// detectPhonePattern returns phonePattern if most values look like phone
+// numbers, or "" otherwise. Values shorter than minPhoneLength never count as
+// matches, which keeps short numeric fields from being classified as phones.
+func (d *OfflineDetector) detectPhonePattern(values []string) string {
+	looksLikePhone := func(v string) bool {
+		return len(v) >= minPhoneLength && phoneRegex.MatchString(v)
+	}
+	if matchesMost(values, looksLikePhone) {
+		return phonePattern
+	}
+	return ""
+}
+
+// detectURLPattern returns urlPattern if most values are http(s) URLs, or ""
+// otherwise.
+func (d *OfflineDetector) detectURLPattern(values []string) string {
+	if matchesMost(values, urlRegex.MatchString) {
+		return urlPattern
+	}
+	return ""
+}
+
+// detectIPPattern returns ipPattern if most values are dotted-quad strings, or
+// "" otherwise.
+func (d *OfflineDetector) detectIPPattern(values []string) string {
+	if matchesMost(values, ipRegex.MatchString) {
+		return ipPattern
+	}
+	return ""
+}
+
+// detectUUIDPattern returns uuidPattern if most values are UUIDs (in either
+// letter case), or "" otherwise.
+func (d *OfflineDetector) detectUUIDPattern(values []string) string {
+	if matchesMost(values, uuidRegex.MatchString) {
+		return uuidPattern
+	}
+	return ""
+}
\ No newline at end of file
diff --git a/internal/pkg/patterndetection/online.go b/internal/pkg/patterndetection/online.go
new file mode 100644
index 0000000..daa3bed
--- /dev/null
+++ b/internal/pkg/patterndetection/online.go
@@ -0,0 +1,199 @@
+package patterndetection
+
+import (
+	"bytes"
+	"data-comparator/internal/pkg/config"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"regexp"
+	"strings"
+	"time"
+)
+
+// OnlineDetector uses external AI APIs (Claude/Anthropic) for pattern detection.
+type OnlineDetector struct {
+	config     *config.OnlineAPIConfig
+	httpClient *http.Client
+}
+
+// NewOnlineDetector creates a new online pattern detector.
+func NewOnlineDetector(cfg *config.OnlineAPIConfig) (*OnlineDetector, error) {
+	if cfg == nil {
+		return nil, fmt.Errorf("online API configuration is required")
+	}
+	if cfg.APIKey == "" {
+		return nil, fmt.Errorf("API key is required for online mode")
+	}
+
+	// Apply defaults to a private copy so the caller's configuration is not
+	// modified as a side effect of constructing a detector.
+	resolved := *cfg
+	if resolved.Provider == "" {
+		resolved.Provider = "claude" // Default to Claude
+	}
+
+	return &OnlineDetector{
+		config: &resolved,
+		httpClient: &http.Client{
+			Timeout: 30 * time.Second,
+		},
+	}, nil
+}
+
+// DetectPatterns uses AI API to analyze field values and generate regex patterns.
+//
+// At most 10 distinct non-nil values are sent to the provider. It returns no
+// matchers without calling the API when there is nothing to sample, and an
+// error when the provider is unsupported, the API call fails, or the reply is
+// not a valid regex.
+func (d *OnlineDetector) DetectPatterns(fieldName string, fieldType string, values []interface{}) ([]Matcher, error) {
+	if len(values) == 0 {
+		return []Matcher{}, nil
+	}
+
+	// Sample values for AI analysis (limit to avoid huge API calls)
+	sampleValues := d.sampleValues(values, 10)
+	if len(sampleValues) == 0 {
+		return []Matcher{}, nil
+	}
+
+	prompt := d.buildPrompt(fieldName, fieldType, sampleValues)
+
+	var response string
+	var err error
+
+	switch d.config.Provider {
+	case "claude", "anthropic":
+		response, err = d.callClaudeAPI(prompt)
+	default:
+		return nil, fmt.Errorf("unsupported provider: %s", d.config.Provider)
+	}
+
+	if err != nil {
+		return nil, fmt.Errorf("failed to call AI API: %w", err)
+	}
+
+	return d.parseAIResponse(response, fieldType)
+}
+
+// sampleValues returns up to maxSamples distinct non-nil values, formatted with
+// "%v", in their original order. It returns an empty slice when maxSamples is
+// not positive.
+func (d *OnlineDetector) sampleValues(values []interface{}, maxSamples int) []string {
+	if maxSamples <= 0 {
+		return []string{}
+	}
+
+	// The result can never exceed maxSamples, so do not size it to the input.
+	capacity := len(values)
+	if capacity > maxSamples {
+		capacity = maxSamples
+	}
+	samples := make([]string, 0, capacity)
+	seen := make(map[string]bool, capacity)
+
+	for _, val := range values {
+		if val == nil {
+			continue
+		}
+		str := fmt.Sprintf("%v", val)
+		if seen[str] {
+			continue
+		}
+		seen[str] = true
+		samples = append(samples, str)
+		if len(samples) == maxSamples {
+			break // enough samples; skip formatting the remaining values
+		}
+	}
+
+	return samples
+}
+
+// buildPrompt creates a prompt for the AI to analyze field patterns.
+func (d *OnlineDetector) buildPrompt(fieldName string, fieldType string, sampleValues []string) string {
+	return fmt.Sprintf(`Analyze the following data field and generate appropriate regex patterns if applicable.
+
+Field Name: %s
+Field Type: %s
+Sample Values:
+%s
+
+Please analyze these values and determine if they follow a specific pattern that can be captured with a regex.
+If a clear pattern exists (like email addresses, phone numbers, URLs, UUIDs, etc.), provide ONLY the regex pattern.
+If no clear pattern exists, respond with "NO_PATTERN".
+
+Rules:
+1. Only return a single regex pattern or "NO_PATTERN"
+2. The pattern should match at least 80%% of the provided samples
+3. Focus on common data patterns: emails, phones, URLs, IDs, codes, etc.
+4. Do not include explanations, just the regex or "NO_PATTERN"
+
+Response:`, fieldName, fieldType, strings.Join(sampleValues, "\n"))
+}
+
+// callClaudeAPI sends prompt to the configured Claude/Anthropic messages
+// endpoint and returns the trimmed text of the first text block in the reply.
+//
+// Endpoint and model fall back to built-in defaults when unset. It returns an
+// error if the request cannot be built or sent, the status is not 200, the
+// body cannot be decoded, or the reply contains no text block.
+func (d *OnlineDetector) callClaudeAPI(prompt string) (string, error) {
+	// Upper bound on how much of an error body is echoed into the error.
+	const maxErrorBodyBytes = 4 << 10
+
+	endpoint := d.config.Endpoint
+	if endpoint == "" {
+		endpoint = "https://api.anthropic.com/v1/messages"
+	}
+
+	model := d.config.Model
+	if model == "" {
+		model = "claude-3-haiku-20240307" // Use fastest, cheapest model for pattern detection
+	}
+
+	requestBody := map[string]interface{}{
+		"model":      model,
+		"max_tokens": 100, // We only need a short response
+		"messages": []map[string]interface{}{
+			{
+				"role":    "user",
+				"content": prompt,
+			},
+		},
+	}
+
+	jsonBody, err := json.Marshal(requestBody)
+	if err != nil {
+		return "", fmt.Errorf("failed to marshal request: %w", err)
+	}
+
+	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewBuffer(jsonBody))
+	if err != nil {
+		return "", fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("x-api-key", d.config.APIKey)
+	req.Header.Set("anthropic-version", "2023-06-01")
+
+	resp, err := d.httpClient.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("failed to make request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// Cap the read so a misbehaving endpoint cannot exhaust memory. The
+		// read error is ignored on purpose: the body is diagnostic detail only.
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBodyBytes))
+		return "", fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(body))
+	}
+
+	var response struct {
+		Content []struct {
+			Type string `json:"type"`
+			Text string `json:"text"`
+		} `json:"content"`
+	}
+
+	if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
+		return "", fmt.Errorf("failed to decode response: %w", err)
+	}
+
+	// Use the first text block rather than assuming block 0 is text. A block
+	// with no type is accepted for endpoints that omit the field.
+	for _, block := range response.Content {
+		if block.Type == "text" || block.Type == "" {
+			return strings.TrimSpace(block.Text), nil
+		}
+	}
+
+	return "", fmt.Errorf("empty response from API")
+}
+
+// parseAIResponse turns the model's reply into matchers.
+//
+// "NO_PATTERN" or an empty reply falls back to the type-based matcher for
+// "numeric" and "datetime" fields and to no matcher otherwise. Any other reply
+// must be a single-line expression that Go's regexp package can compile;
+// otherwise an error is returned.
+func (d *OnlineDetector) parseAIResponse(response, fieldType string) ([]Matcher, error) {
+	// Models sometimes wrap the answer in backticks despite the prompt. A
+	// backtick is an ordinary literal in a regex, so leaving it in place would
+	// pass validation below yet never match the data.
+	response = strings.TrimSpace(strings.Trim(strings.TrimSpace(response), "`"))
+
+	if response == "NO_PATTERN" || response == "" {
+		// Fall back to basic type-based matchers
+		var matchers []Matcher
+		if fieldType == "numeric" {
+			matchers = append(matchers, Matcher{"isNumeric": true})
+		} else if fieldType == "datetime" {
+			matchers = append(matchers, Matcher{"isDateTime": true})
+		}
+		return matchers, nil
+	}
+
+	// Prose or a fenced block would compile as a literal regex, so a
+	// multi-line reply is rejected instead of being emitted as a matcher.
+	if strings.ContainsAny(response, "\r\n") {
+		return nil, fmt.Errorf("invalid regex pattern from AI: %q, error: expected a single-line pattern", response)
+	}
+
+	// Validate that the response is a valid regex pattern
+	_, err := regexp.Compile(response)
+	if err != nil {
+		return nil, fmt.Errorf("invalid regex pattern from AI: %s, error: %w", response, err)
+	}
+
+	return []Matcher{{"regex": response}}, nil
+}
\ No newline at end of file
diff --git a/internal/pkg/schema/generator.go b/internal/pkg/schema/generator.go
index f06f890..94722be 100644
--- a/internal/pkg/schema/generator.go
+++ b/internal/pkg/schema/generator.go
@@ -3,6 +3,7 @@ package schema
 import (
 	"data-comparator/internal/pkg/config"
 	"data-comparator/internal/pkg/datareader"
+	"data-comparator/internal/pkg/patterndetection"
 	"fmt"
 	"io"
 	"strconv"
@@ -14,6 +15,11 @@ const DefaultSampleSize = 1000
 
 // Generate creates a schema by sampling records from a data reader.
 func Generate(reader datareader.DataReader, samplerConfig *config.Sampler) (*Schema, error) {
+	return GenerateWithPatternDetection(reader, samplerConfig, nil)
+}
+
+// GenerateWithPatternDetection creates a schema like Generate and attaches the matchers found by the detector that patternConfig selects (nil disables detection).
+func GenerateWithPatternDetection(reader datareader.DataReader, samplerConfig *config.Sampler, patternConfig *config.PatternDetection) (*Schema, error) {
 	sampleSize := DefaultSampleSize
 	if samplerConfig != nil && samplerConfig.SampleSize > 0 {
 		sampleSize = samplerConfig.SampleSize
@@ -32,7 +38,14 @@ func Generate(reader datareader.DataReader, samplerConfig *config.Sampler) (*Sch
 		CollectFieldValues(record, fieldValues)
 	}
 
-	fields := analyzeFields(fieldValues)
+	// Create the pattern detector; an invalid patternConfig fails schema generation here
+	detectorFactory := patterndetection.NewDetectorFactory(patternConfig)
+	detector, err := detectorFactory.CreateDetector()
+	if err != nil {
+		return nil, fmt.Errorf("failed to create pattern detector: %w", err)
+	}
+
+	fields := analyzeFieldsWithPatterns(fieldValues, detector)
 	schema := &Schema{
 		Fields: fields,
 	}
@@ -52,6 +65,34 @@ func analyzeFields(fieldValues map[string][]interface{}) map[string]*Field {
 	return fields
 }
 
+func analyzeFieldsWithPatterns(fieldValues map[string][]interface{}, detector patterndetection.PatternDetector) map[string]*Field {
+	fields := make(map[string]*Field)
+	for name, values := range fieldValues {
+		fieldType := inferType(values)
+
+		// Detect patterns for this field from its sampled values and inferred type
+		detectedMatchers, err := detector.DetectPatterns(name, fieldType, values)
+		if err != nil {
+			// Detection is best-effort: print a warning to stdout and keep the field with no matchers
+			fmt.Printf("Warning: failed to detect patterns for field %s: %v\n", name, err)
+			detectedMatchers = []patterndetection.Matcher{}
+		}
+
+		// Convert each patterndetection.Matcher to this package's Matcher type
+		matchers := make([]Matcher, len(detectedMatchers))
+		for i, m := range detectedMatchers {
+			matchers[i] = Matcher(m)
+		}
+
+		fields[name] = &Field{
+			Type:     fieldType,
+			Stats:    []string{}, // TODO: Calculate stats based on type
+			Matchers: matchers,
+		}
+	}
+	return fields
+}
+
 func inferType(values []interface{}) string {
 	if len(values) == 0 {
 		return "unknown"
diff --git a/internal/pkg/schema/schema_test.go b/internal/pkg/schema/schema_test.go
index 203237c..245f1f3 100644
--- a/internal/pkg/schema/schema_test.go
+++ b/internal/pkg/schema/schema_test.go
@@ -55,6 +55,73 @@ func TestGenerate_SimpleCSV(t *testing.T) {
 	}
 }
 
+func TestGenerateWithPatternDetection_OfflineMode(t *testing.T) {
+	cfg, err := config.Load("../../../testdata/testcase1_simple_csv/config1.yaml")
+	if err != nil {
+		t.Fatalf("Failed to load config: %v", err)
+	}
+	cfg.Source.Path = "../../../" + cfg.Source.Path
+
+	reader, err := datareader.New(cfg.Source)
+	if err != nil {
+		t.Fatalf("Failed to create data reader: %v", err)
+	}
+	defer reader.Close()
+
+	// Enable offline pattern detection (built-in heuristics, no network access)
+	patternConfig := &config.PatternDetection{
+		Enabled: true,
+		Mode:    "offline",
+	}
+
+	schema, err := GenerateWithPatternDetection(reader, cfg.Source.Sampler, patternConfig)
+	if err != nil {
+		t.Fatalf("GenerateWithPatternDetection() error = %v", err)
+	}
+
+	if schema == nil {
+		t.Fatal("Schema is nil")
+	}
+
+	// The email column is expected to carry the built-in email regex matcher
+	emailField, ok := schema.Fields["email"]
+	if !ok {
+		t.Fatal("Email field not found")
+	}
+
+	hasEmailRegex := false
+	for _, matcher := range emailField.Matchers {
+		if regex, ok := matcher["regex"]; ok {
+			if regex == `^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$` {
+				hasEmailRegex = true
+				break
+			}
+		}
+	}
+
+	if !hasEmailRegex {
+		t.Error("Expected email field to have email regex pattern")
+	}
+
+	// user_id is expected to get the type-based isNumeric matcher rather than a regex
+	userIdField, ok := schema.Fields["user_id"]
+	if !ok {
+		t.Fatal("user_id field not found")
+	}
+
+	hasNumericMatcher := false
+	for _, matcher := range userIdField.Matchers {
+		if isNumeric, ok := matcher["isNumeric"]; ok && isNumeric == true {
+			hasNumericMatcher = true
+			break
+		}
+	}
+
+	if !hasNumericMatcher {
+		t.Error("Expected user_id field to have isNumeric matcher")
+	}
+}
+
 func TestCollectFieldValues(t *testing.T) {
 	record := map[string]interface{}{
 		"id": float64(1),