diff --git a/args/llm.go b/args/llm.go
new file mode 100644
index 0000000..1816094
--- /dev/null
+++ b/args/llm.go
@@ -0,0 +1,81 @@
+package args
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+
+	teetypes "github.com/masa-finance/tee-types/types"
+)
+
+var (
+	ErrLLMDatasetIdRequired = errors.New("dataset id is required")
+	ErrLLMPromptRequired    = errors.New("prompt is required")
+	ErrLLMMaxTokensNegative = errors.New("max tokens must be non-negative")
+)
+
+const (
+	LLMDefaultMaxTokens       = 300
+	LLMDefaultTemperature     = "0.1"
+	LLMDefaultMultipleColumns = false
+	LLMDefaultModel           = "gemini-1.5-flash-8b"
+)
+
+type LLMProcessorArguments struct {
+	DatasetId   string `json:"dataset_id"`
+	Prompt      string `json:"prompt"`
+	MaxTokens   int    `json:"max_tokens"`
+	Temperature string `json:"temperature"`
+}
+
+// UnmarshalJSON implements custom JSON unmarshaling with validation
+func (l *LLMProcessorArguments) UnmarshalJSON(data []byte) error {
+	// Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...)
+	type Alias LLMProcessorArguments
+	aux := &struct {
+		*Alias
+	}{
+		Alias: (*Alias)(l),
+	}
+
+	if err := json.Unmarshal(data, aux); err != nil {
+		return fmt.Errorf("failed to unmarshal llm arguments: %w", err)
+	}
+
+	l.setDefaultValues()
+
+	return l.Validate()
+}
+
+func (l *LLMProcessorArguments) setDefaultValues() {
+	if l.MaxTokens == 0 {
+		l.MaxTokens = LLMDefaultMaxTokens
+	}
+	if l.Temperature == "" {
+		l.Temperature = LLMDefaultTemperature
+	}
+}
+
+func (l *LLMProcessorArguments) Validate() error {
+	if l.DatasetId == "" {
+		return ErrLLMDatasetIdRequired
+	}
+	if l.Prompt == "" {
+		return ErrLLMPromptRequired
+	}
+	if l.MaxTokens < 0 {
+		return fmt.Errorf("%w: got %v", ErrLLMMaxTokensNegative, l.MaxTokens)
+	}
+	return nil
+}
+
+func (l LLMProcessorArguments) ToLLMProcessorRequest() teetypes.LLMProcessorRequest {
+	return teetypes.LLMProcessorRequest{
+		InputDatasetId:  l.DatasetId,
+		Prompt:          l.Prompt,
+		MaxTokens:       l.MaxTokens,
+		Temperature:     l.Temperature,
+		MultipleColumns: LLMDefaultMultipleColumns, // overrides default in actor API
+		Model:           LLMDefaultModel,           // overrides default in actor API
+	}
+}
diff --git a/args/llm_test.go b/args/llm_test.go
new file mode 100644
index 0000000..3884ebf
--- /dev/null
+++ b/args/llm_test.go
@@ -0,0 +1,136 @@
+package args_test
+
+import (
+	"encoding/json"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/args" +) + +var _ = Describe("LLMProcessorArguments", func() { + Describe("Marshalling and unmarshalling", func() { + It("should set default values", func() { + llmArgs := args.LLMProcessorArguments{ + DatasetId: "ds1", + Prompt: "summarize: ${markdown}", + } + jsonData, err := json.Marshal(llmArgs) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &llmArgs) + Expect(err).ToNot(HaveOccurred()) + Expect(llmArgs.MaxTokens).To(Equal(300)) + Expect(llmArgs.Temperature).To(Equal("0.1")) + }) + + It("should override default values", func() { + llmArgs := args.LLMProcessorArguments{ + DatasetId: "ds1", + Prompt: "summarize: ${markdown}", + MaxTokens: 123, + Temperature: "0.7", + } + jsonData, err := json.Marshal(llmArgs) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &llmArgs) + Expect(err).ToNot(HaveOccurred()) + Expect(llmArgs.MaxTokens).To(Equal(123)) + Expect(llmArgs.Temperature).To(Equal("0.7")) + }) + + It("should fail unmarshal when dataset_id is missing", func() { + var llmArgs args.LLMProcessorArguments + jsonData := []byte(`{"type":"datasetprocessor","prompt":"p"}`) + err := json.Unmarshal(jsonData, &llmArgs) + Expect(errors.Is(err, args.ErrLLMDatasetIdRequired)).To(BeTrue()) + }) + + It("should fail unmarshal when prompt is missing", func() { + var llmArgs args.LLMProcessorArguments + jsonData := []byte(`{"type":"datasetprocessor","dataset_id":"ds1"}`) + err := json.Unmarshal(jsonData, &llmArgs) + Expect(errors.Is(err, args.ErrLLMPromptRequired)).To(BeTrue()) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid arguments", func() { + llmArgs := &args.LLMProcessorArguments{ + DatasetId: "ds1", + Prompt: "p", + MaxTokens: 10, + Temperature: "0.2", + } + err := llmArgs.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail when dataset_id is missing", func() { + llmArgs := &args.LLMProcessorArguments{ + Prompt: "p", + MaxTokens: 10, + Temperature: "0.2", + } + err := llmArgs.Validate() + Expect(errors.Is(err, args.ErrLLMDatasetIdRequired)).To(BeTrue()) + }) + + It("should fail when prompt is missing", func() { + llmArgs := &args.LLMProcessorArguments{ + DatasetId: "ds1", + MaxTokens: 10, + Temperature: "0.2", + } + err := llmArgs.Validate() + Expect(errors.Is(err, args.ErrLLMPromptRequired)).To(BeTrue()) + }) + + It("should fail when max tokens is negative", func() { + llmArgs := &args.LLMProcessorArguments{ + DatasetId: "ds1", + Prompt: "p", + MaxTokens: -1, + Temperature: "0.2", + } + err := llmArgs.Validate() + Expect(errors.Is(err, args.ErrLLMMaxTokensNegative)).To(BeTrue()) + Expect(err.Error()).To(ContainSubstring("got -1")) + }) + }) + + Describe("ToLLMProcessorRequest", func() { + It("should map fields and defaults correctly", func() { + llmArgs := args.LLMProcessorArguments{ + DatasetId: "ds1", + Prompt: "p", + MaxTokens: 0, // default applied in To* + Temperature: "", + } + req := llmArgs.ToLLMProcessorRequest() + Expect(req.InputDatasetId).To(Equal("ds1")) + Expect(req.Prompt).To(Equal("p")) + Expect(req.MaxTokens).To(Equal(0)) + Expect(req.Temperature).To(Equal("")) + Expect(req.MultipleColumns).To(BeFalse()) + Expect(req.Model).To(Equal("gemini-1.5-flash-8b")) + }) + + It("should map fields correctly when set", func() { + llmArgs := args.LLMProcessorArguments{ + DatasetId: "ds1", + Prompt: "p", + MaxTokens: 42, + Temperature: "0.7", + } + req := llmArgs.ToLLMProcessorRequest() + 
+			Expect(req.InputDatasetId).To(Equal("ds1"))
+			Expect(req.Prompt).To(Equal("p"))
+			Expect(req.MaxTokens).To(Equal(0))
+			Expect(req.Temperature).To(Equal(""))
+			Expect(req.MultipleColumns).To(BeFalse())
+			Expect(req.Model).To(Equal("gemini-1.5-flash-8b"))
+		})
+
+		It("should map fields correctly when set", func() {
+			llmArgs := args.LLMProcessorArguments{
+				DatasetId:   "ds1",
+				Prompt:      "p",
+				MaxTokens:   42,
+				Temperature: "0.7",
+			}
+			req := llmArgs.ToLLMProcessorRequest()
+			Expect(req.InputDatasetId).To(Equal("ds1"))
+			Expect(req.Prompt).To(Equal("p"))
+			Expect(req.MaxTokens).To(Equal(42))
+			Expect(req.Temperature).To(Equal("0.7"))
+			Expect(req.MultipleColumns).To(BeFalse())
+			Expect(req.Model).To(Equal("gemini-1.5-flash-8b"))
+		})
+	})
+})
diff --git a/args/unmarshaller.go b/args/unmarshaller.go
index ce6bb49..1d3c26d 100644
--- a/args/unmarshaller.go
+++ b/args/unmarshaller.go
@@ -10,52 +10,9 @@ import (
 
 // JobArguments defines the interface that all job arguments must implement
 type JobArguments interface {
-	Validate() error
 	GetCapability() types.Capability
 }
 
-// TwitterJobArguments extends JobArguments for Twitter-specific methods
-type TwitterJobArguments interface {
-	JobArguments
-	ValidateForJobType(jobType types.JobType) error
-	IsSingleTweetOperation() bool
-	IsMultipleTweetOperation() bool
-	IsSingleProfileOperation() bool
-	IsMultipleProfileOperation() bool
-	IsSingleSpaceOperation() bool
-	IsTrendsOperation() bool
-}
-
-// WebJobArguments extends JobArguments for Web-specific methods
-type WebJobArguments interface {
-	JobArguments
-	ValidateForJobType(jobType types.JobType) error
-	IsDeepScrape() bool
-	HasSelector() bool
-	GetEffectiveMaxDepth() int
-}
-
-// TikTokJobArguments extends JobArguments for TikTok-specific methods
-type TikTokJobArguments interface {
-	JobArguments
-	ValidateForJobType(jobType types.JobType) error
-	HasLanguagePreference() bool
-	GetVideoURL() string
-	GetLanguageCode() string
-}
-
-// LinkedInJobArguments extends JobArguments for LinkedIn-specific methods
-type LinkedInJobArguments interface {
-	JobArguments
-	ValidateForJobType(jobType types.JobType) error
-}
-
-// RedditJobArguments extends JobArguments for Reddit-specific methods
-type RedditJobArguments interface {
-	JobArguments
-	ValidateForJobType(jobType types.JobType) error
-}
-
 // UnmarshalJobArguments unmarshals job arguments from a generic map into the appropriate typed struct
 // This works with both tee-indexer and tee-worker JobArguments types
 func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArguments, error) {
@@ -84,8 +41,8 @@ func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgum
 }
 
 // Helper functions for unmarshaling specific argument types
-func unmarshalWebArguments(args map[string]any) (*WebSearchArguments, error) {
-	webArgs := &WebSearchArguments{}
+func unmarshalWebArguments(args map[string]any) (*WebArguments, error) {
+	webArgs := &WebArguments{}
 	if err := unmarshalToStruct(args, webArgs); err != nil {
 		return nil, fmt.Errorf("failed to unmarshal web job arguments: %w", err)
 	}
diff --git a/args/unmarshaller_test.go b/args/unmarshaller_test.go
index 04e784f..4231cbd 100644
--- a/args/unmarshaller_test.go
+++ b/args/unmarshaller_test.go
@@ -14,15 +14,13 @@ var _ = Describe("Unmarshaller", func() {
 		It("should unmarshal the arguments correctly", func() {
 			argsMap := map[string]any{
 				"url":       "https://example.com",
-				"selector":  "h1",
 				"max_depth": 2,
 			}
 			jobArgs, err := args.UnmarshalJobArguments(types.WebJob, argsMap)
 			Expect(err).ToNot(HaveOccurred())
-			webArgs, ok := jobArgs.(*args.WebSearchArguments)
+			webArgs, ok := jobArgs.(*args.WebArguments)
 			Expect(ok).To(BeTrue())
 			Expect(webArgs.URL).To(Equal("https://example.com"))
-			Expect(webArgs.Selector).To(Equal("h1"))
 			Expect(webArgs.MaxDepth).To(Equal(2))
 		})
 	})
diff --git a/args/web.go b/args/web.go
index 33a466d..561aa59 100644
--- a/args/web.go
+++ b/args/web.go
@@ -2,24 +2,39 @@ package args
 
 import (
 	"encoding/json"
+	"errors"
 	"fmt"
 	"net/url"
 
"github.com/masa-finance/tee-types/pkg/util" teetypes "github.com/masa-finance/tee-types/types" ) -type WebSearchArguments struct { - URL string `json:"url"` - Selector string `json:"selector"` - Depth int `json:"depth"` - MaxDepth int `json:"max_depth"` +var ( + ErrWebURLRequired = errors.New("url is required") + ErrWebURLInvalid = errors.New("invalid URL format") + ErrWebURLSchemeMissing = errors.New("url must include a scheme (http:// or https://)") + ErrWebMaxDepth = errors.New("max depth must be non-negative") + ErrWebMaxPages = errors.New("max pages must be at least 1") +) + +const ( + WebDefaultMaxPages = 1 + WebDefaultMethod = "GET" + WebDefaultRespectRobotsTxtFile = false + WebDefaultSaveMarkdown = true +) + +type WebArguments struct { + QueryType teetypes.WebQueryType `json:"type"` + URL string `json:"url"` + MaxDepth int `json:"max_depth"` + MaxPages int `json:"max_pages"` } // UnmarshalJSON implements custom JSON unmarshaling with validation -func (w *WebSearchArguments) UnmarshalJSON(data []byte) error { +func (w *WebArguments) UnmarshalJSON(data []byte) error { // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) - type Alias WebSearchArguments + type Alias WebArguments aux := &struct { *Alias }{ @@ -27,46 +42,50 @@ func (w *WebSearchArguments) UnmarshalJSON(data []byte) error { } if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal Web arguments: %w", err) + return fmt.Errorf("failed to unmarshal web arguments: %w", err) } + w.setDefaultValues() + return w.Validate() } +func (w *WebArguments) setDefaultValues() { + if w.MaxPages == 0 { + w.MaxPages = WebDefaultMaxPages + } +} + // Validate validates the Web arguments -func (w *WebSearchArguments) Validate() error { +func (w *WebArguments) Validate() error { if w.URL == "" { - return fmt.Errorf("url is required") + return ErrWebURLRequired } // Validate URL format parsedURL, err := url.Parse(w.URL) if err != nil { - return fmt.Errorf("invalid URL format: %w", err) + return fmt.Errorf("%w: %v", ErrWebURLInvalid, err) } // Ensure URL has a scheme if parsedURL.Scheme == "" { - return fmt.Errorf("URL must include a scheme (http:// or https://)") + return ErrWebURLSchemeMissing } if w.MaxDepth < 0 { - return fmt.Errorf("max_depth must be non-negative, got: %d", w.MaxDepth) + return fmt.Errorf("%w: got %v", ErrWebMaxDepth, w.MaxDepth) } - if w.Depth < 0 { - return fmt.Errorf("depth must be non-negative, got: %d", w.Depth) - } - - if w.Depth > w.MaxDepth && w.MaxDepth > 0 { - return fmt.Errorf("depth (%d) cannot exceed max_depth (%d)", w.Depth, w.MaxDepth) + if w.MaxPages < 1 { + return fmt.Errorf("%w: got %v", ErrWebMaxPages, w.MaxPages) } return nil } // ValidateForJobType validates Web arguments for a specific job type -func (w *WebSearchArguments) ValidateForJobType(jobType teetypes.JobType) error { +func (w *WebArguments) ValidateForJobType(jobType teetypes.JobType) error { if err := w.Validate(); err != nil { return err } @@ -76,21 +95,18 @@ func (w *WebSearchArguments) ValidateForJobType(jobType teetypes.JobType) error } // GetCapability returns the capability for web operations (always scraper) -func (w *WebSearchArguments) GetCapability() teetypes.Capability { +func (w *WebArguments) GetCapability() teetypes.Capability { return teetypes.CapScraper } -// IsDeepScrape returns true if this is a deep scraping operation -func (w *WebSearchArguments) IsDeepScrape() bool { - return w.MaxDepth > 1 || w.Depth > 0 -} - -// 
-// HasSelector returns true if a CSS selector is specified
-func (w *WebSearchArguments) HasSelector() bool {
-	return w.Selector != ""
-}
-
-// GetEffectiveMaxDepth returns the effective maximum depth for scraping
-func (w *WebSearchArguments) GetEffectiveMaxDepth() int {
-	return util.Max(w.MaxDepth, 1)
+func (w WebArguments) ToWebScraperRequest() teetypes.WebScraperRequest {
+	return teetypes.WebScraperRequest{
+		StartUrls: []teetypes.WebStartURL{
+			{URL: w.URL, Method: WebDefaultMethod},
+		},
+		MaxCrawlDepth:        w.MaxDepth,
+		MaxCrawlPages:        w.MaxPages,
+		RespectRobotsTxtFile: WebDefaultRespectRobotsTxtFile,
+		SaveMarkdown:         WebDefaultSaveMarkdown,
+	}
 }
diff --git a/args/web_test.go b/args/web_test.go
new file mode 100644
index 0000000..77e771f
--- /dev/null
+++ b/args/web_test.go
@@ -0,0 +1,158 @@
+package args_test
+
+import (
+	"encoding/json"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/masa-finance/tee-types/args"
+	"github.com/masa-finance/tee-types/types"
+)
+
+var _ = Describe("WebArguments", func() {
+	Describe("Marshalling and unmarshalling", func() {
+		It("should set default values", func() {
+			webArgs := args.WebArguments{
+				QueryType: types.WebScraper,
+				URL:       "https://example.com",
+				MaxDepth:  0,
+				MaxPages:  0,
+			}
+			jsonData, err := json.Marshal(webArgs)
+			Expect(err).ToNot(HaveOccurred())
+			err = json.Unmarshal(jsonData, &webArgs)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(webArgs.MaxPages).To(Equal(1))
+		})
+
+		It("should override default values", func() {
+			webArgs := args.WebArguments{
+				QueryType: types.WebScraper,
+				URL:       "https://example.com",
+				MaxDepth:  2,
+				MaxPages:  5,
+			}
+			jsonData, err := json.Marshal(webArgs)
+			Expect(err).ToNot(HaveOccurred())
+			err = json.Unmarshal(jsonData, &webArgs)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(webArgs.MaxPages).To(Equal(5))
+		})
+
+		It("should fail unmarshal when url is missing", func() {
+			var webArgs args.WebArguments
+			jsonData := []byte(`{"type":"scraper","max_depth":1,"max_pages":1}`)
+			err := json.Unmarshal(jsonData, &webArgs)
+			Expect(errors.Is(err, args.ErrWebURLRequired)).To(BeTrue())
+		})
+	})
+
+	Describe("Validation", func() {
+		It("should succeed with valid arguments", func() {
+			webArgs := &args.WebArguments{
+				QueryType: types.WebScraper,
+				URL:       "https://example.com",
+				MaxDepth:  2,
+				MaxPages:  3,
+			}
+			err := webArgs.Validate()
+			Expect(err).ToNot(HaveOccurred())
+		})
+
+		It("should fail when url is missing", func() {
+			webArgs := &args.WebArguments{
+				QueryType: types.WebScraper,
+				MaxDepth:  0,
+				MaxPages:  1,
+			}
+			err := webArgs.Validate()
+			Expect(errors.Is(err, args.ErrWebURLRequired)).To(BeTrue())
+		})
+
+		It("should fail with an invalid URL format", func() {
+			webArgs := &args.WebArguments{
+				QueryType: types.WebScraper,
+				URL:       "http:// invalid.com",
+				MaxDepth:  0,
+				MaxPages:  1,
+			}
+			err := webArgs.Validate()
+			Expect(errors.Is(err, args.ErrWebURLInvalid)).To(BeTrue())
+			Expect(err.Error()).To(ContainSubstring("invalid URL format"))
+		})
+
+		It("should fail when scheme is missing", func() {
+			webArgs := &args.WebArguments{
+				QueryType: types.WebScraper,
+				URL:       "example.com",
+				MaxDepth:  0,
+				MaxPages:  1,
+			}
+			err := webArgs.Validate()
+			Expect(errors.Is(err, args.ErrWebURLSchemeMissing)).To(BeTrue())
+		})
+
+		It("should fail when max depth is negative", func() {
+			webArgs := &args.WebArguments{
+				QueryType: types.WebScraper,
+				URL:       "https://example.com",
+				MaxDepth:  -1,
+				MaxPages:  1,
+			}
+			err := webArgs.Validate()
+			Expect(errors.Is(err, args.ErrWebMaxDepth)).To(BeTrue())
+			Expect(err.Error()).To(ContainSubstring("got -1"))
+		})
+
+		It("should fail when max pages is less than 1", func() {
+			webArgs := &args.WebArguments{
+				QueryType: types.WebScraper,
+				URL:       "https://example.com",
+				MaxDepth:  0,
+				MaxPages:  0,
+			}
+			err := webArgs.Validate()
+			Expect(errors.Is(err, args.ErrWebMaxPages)).To(BeTrue())
+			Expect(err.Error()).To(ContainSubstring("got 0"))
+		})
+	})
+
+	Describe("Job capability", func() {
+		It("should return the scraper capability", func() {
+			webArgs := &args.WebArguments{}
+			Expect(webArgs.GetCapability()).To(Equal(types.CapScraper))
+		})
+
+		It("should validate capability for WebJob", func() {
+			webArgs := &args.WebArguments{
+				QueryType: types.WebScraper,
+				URL:       "https://example.com",
+				MaxDepth:  1,
+				MaxPages:  1,
+			}
+			err := webArgs.ValidateForJobType(types.WebJob)
+			Expect(err).ToNot(HaveOccurred())
+		})
+	})
+
+	Describe("ToWebScraperRequest", func() {
+		It("should map fields correctly", func() {
+			webArgs := args.WebArguments{
+				QueryType: types.WebScraper,
+				URL:       "https://example.com",
+				MaxDepth:  2,
+				MaxPages:  3,
+			}
+			req := webArgs.ToWebScraperRequest()
+			Expect(req.StartUrls).To(HaveLen(1))
+			Expect(req.StartUrls[0].URL).To(Equal("https://example.com"))
+			Expect(req.StartUrls[0].Method).To(Equal("GET"))
+			Expect(req.MaxCrawlDepth).To(Equal(2))
+			Expect(req.MaxCrawlPages).To(Equal(3))
+			Expect(req.RespectRobotsTxtFile).To(BeFalse())
+			Expect(req.SaveMarkdown).To(BeTrue())
+		})
+	})
+})
diff --git a/types/jobs.go b/types/jobs.go
index fe66292..1a7d46b 100644
--- a/types/jobs.go
+++ b/types/jobs.go
@@ -24,7 +24,7 @@ func (j JobType) ValidateCapability(capability Capability) error {
 	}
 
 	if !slices.Contains(validCaps, capability) {
-		return fmt.Errorf("capability '%s' is not valid for job type '%s'. Valid capabilities: %v",
+		return fmt.Errorf("capability '%s' is not valid for job type '%s'; valid capabilities: %v",
valid capabilities: %v", capability, j, validCaps) } @@ -74,7 +74,7 @@ const ( CapGetFollowing Capability = "getfollowing" CapGetFollowers Capability = "getfollowers" CapGetSpace Capability = "getspace" - CapGetProfile Capability = "getprofile" // LinkedIn get profile capability + CapGetProfile Capability = "getprofile" // Reddit capabilities CapScrapeUrls Capability = "scrapeurls" CapSearchPosts Capability = "searchposts" @@ -86,14 +86,12 @@ const ( // Capability group constants for easy reuse var ( - AlwaysAvailableWebCaps = []Capability{CapScraper, CapEmpty} AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry, CapEmpty} AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapEmpty} AlwaysAvailableLinkedInCaps = []Capability{CapSearchByQuery, CapGetProfile, CapEmpty} // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ - WebJob: AlwaysAvailableWebCaps, TelemetryJob: AlwaysAvailableTelemetryCaps, TiktokJob: AlwaysAvailableTiktokCaps, } @@ -118,6 +116,9 @@ var ( // RedditCaps are all the Reddit capabilities (only available with Apify) RedditCaps = []Capability{CapScrapeUrls, CapSearchPosts, CapSearchUsers, CapSearchCommunities} + + // WebCaps are all the Web capabilities (only available with Apify) + WebCaps = []Capability{CapScraper, CapEmpty} ) // JobCapabilityMap defines which capabilities are valid for each job type @@ -137,7 +138,7 @@ var JobCapabilityMap = map[JobType][]Capability{ TwitterApifyJob: TwitterApifyCaps, // Web job capabilities - WebJob: AlwaysAvailableWebCaps, + WebJob: WebCaps, // TikTok job capabilities TiktokJob: combineCapabilities( diff --git a/types/llm.go b/types/llm.go new file mode 100644 index 0000000..fb67693 --- /dev/null +++ b/types/llm.go @@ -0,0 +1,15 @@ +package types + +type LLMProcessorRequest struct { + InputDatasetId string `json:"inputDatasetId"` + LLMProviderApiKey string `json:"llmProviderApiKey"` // encrypted api key by miner + Model string `json:"model"` + MultipleColumns bool `json:"multipleColumns"` + Prompt string `json:"prompt"` // example: summarize the content of this webpage: ${markdown} + Temperature string `json:"temperature"` + MaxTokens int `json:"maxTokens"` +} + +type LLMProcessorResult struct { + LLMResponse string `json:"llmresponse"` +} diff --git a/types/web.go b/types/web.go new file mode 100644 index 0000000..dda1cea --- /dev/null +++ b/types/web.go @@ -0,0 +1,55 @@ +package types + +import ( + "time" +) + +// WebStartURL represents a single start URL configuration for web scraping +type WebStartURL struct { + URL string `json:"url"` + Method string `json:"method"` +} + +type WebQueryType string + +const ( + WebScraper WebQueryType = "scraper" +) + +// WebScraperRequest represents the customizable configuration for web scraping operations +type WebScraperRequest struct { + StartUrls []WebStartURL `json:"startUrls"` + MaxCrawlDepth int `json:"maxCrawlDepth"` + MaxCrawlPages int `json:"maxCrawlPages"` + RespectRobotsTxtFile bool `json:"respectRobotsTxtFile"` + SaveMarkdown bool `json:"saveMarkdown"` +} + +// WebCrawlInfo contains information about the crawling process +type WebCrawlInfo struct { + LoadedURL string `json:"loadedUrl"` + LoadedTime time.Time `json:"loadedTime"` + ReferrerURL string `json:"referrerUrl"` + Depth int `json:"depth"` + HTTPStatusCode int `json:"httpStatusCode"` +} + +// WebMetadata contains metadata extracted from the scraped page +type WebMetadata struct { + CanonicalURL string 
`json:"canonicalUrl"` + Title string `json:"title"` + Description *string `json:"description"` + Author *string `json:"author"` + Keywords *string `json:"keywords"` + LanguageCode *string `json:"languageCode"` +} + +// WebScraperResult represents the complete result from web scraping a single page +type WebScraperResult struct { + URL string `json:"url"` + Crawl WebCrawlInfo `json:"crawl"` + Metadata WebMetadata `json:"metadata"` + Text string `json:"text"` + Markdown string `json:"markdown"` + LLMResponse string `json:"llmresponse,omitempty"` // populated by LLM processor +}