From 741a19bc5e1e96b04467d4d61eb8d84d555ab865 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 5 Sep 2025 20:43:24 +0200 Subject: [PATCH 01/43] chore: bump tee types --- go.mod | 3 ++- go.sum | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index ad14f922..448d4ca6 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,8 @@ require ( github.com/joho/godotenv v1.5.1 github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 - github.com/masa-finance/tee-types v1.1.13 + // FIXME: replace when new version is released + github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 6cf96c43..630512d4 100644 --- a/go.sum +++ b/go.sum @@ -59,6 +59,8 @@ github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0 github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= github.com/masa-finance/tee-types v1.1.13 h1:bVXUEF8nXT3bhJE4kcDwcuzfQopid9BbIp0/OucClL4= github.com/masa-finance/tee-types v1.1.13/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06 h1:VPJyZ5M55OjEObOyQq330xBLJ8eyHSfDAQaA7ZC9vec= +github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= From 7e465b42385ac653c955ce5219029afd91628c66 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 5 Sep 2025 22:02:27 +0200 Subject: [PATCH 02/43] feat: adds web scraping client and test --- internal/capabilities/detector.go | 15 ++ internal/config/config.go | 23 +++ internal/jobs/stats/stats.go | 6 +- internal/jobs/telemetry_test.go | 3 +- internal/jobs/web.go | 94 ++++++++++ internal/jobs/web_test.go | 127 ++++++++++++++ internal/jobs/webapify/client.go | 81 +++++++++ internal/jobs/webscraper.go | 282 ------------------------------ internal/jobs/webscraper_test.go | 102 ----------- 9 files changed, 345 insertions(+), 388 deletions(-) create mode 100644 internal/jobs/web.go create mode 100644 internal/jobs/web_test.go create mode 100644 internal/jobs/webapify/client.go delete mode 100644 internal/jobs/webscraper.go delete mode 100644 internal/jobs/webscraper_test.go diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index 005c60c0..b0514a5c 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -38,10 +38,12 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface accounts := jc.GetStringSlice("twitter_accounts", nil) apiKeys := jc.GetStringSlice("twitter_api_keys", nil) apifyApiKey := jc.GetString("apify_api_key", "") + geminiApiKey := jc.GetString("gemini_api_key", "") hasAccounts := len(accounts) > 0 hasApiKeys := len(apiKeys) > 0 hasApifyKey := hasValidApifyKey(apifyApiKey) + hasGeminiKey := hasValidGeminiKey(geminiApiKey) // Add Twitter-specific capabilities based on available authentication if hasAccounts { @@ -73,6 +75,10 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface s.Add(teetypes.TiktokSearchCaps...) capabilities[teetypes.TiktokJob] = s.Items() + if hasGeminiKey { + capabilities[teetypes.WebJob] = teetypes.WebCaps + capabilities[teetypes.LLMJob] = teetypes.LLMCaps + } } // Add general TwitterJob capability if any Twitter auth is available @@ -157,3 +163,12 @@ func hasValidApifyKey(apifyApiKey string) bool { logrus.Infof("Apify API key validated successfully during capability detection") return true } + +func hasValidGeminiKey(geminiApiKey string) bool { + if geminiApiKey == "" { + return false + } + + // TODO validate the gemini key with a handler + return true +} diff --git a/internal/config/config.go b/internal/config/config.go index e9c1b14c..c8292dd4 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -159,6 +159,14 @@ func ReadConfig() JobConfiguration { jc["apify_api_key"] = "" } + geminiApiKey := os.Getenv("GEMINI_API_KEY") + if geminiApiKey != "" { + logrus.Info("Gemini API key found") + jc["gemini_api_key"] = geminiApiKey + } else { + jc["gemini_api_key"] = "" + } + tikTokLang := os.Getenv("TIKTOK_DEFAULT_LANGUAGE") if tikTokLang == "" { tikTokLang = "eng-US" @@ -295,6 +303,21 @@ func (jc JobConfiguration) GetRedditConfig() RedditConfig { } } +// WebConfig represents the configuration needed for Web scraping via Apify +type WebConfig struct { + ApifyApiKey string + GeminiApiKey string +} + +// GetWebConfig constructs a WebConfig directly from the JobConfiguration +// This eliminates the need for JSON marshaling/unmarshaling +func (jc JobConfiguration) GetWebConfig() WebConfig { + return WebConfig{ + ApifyApiKey: jc.GetString("apify_api_key", ""), + GeminiApiKey: jc.GetString("gemini_api_key", ""), + } +} + // ParseLogLevel parses a string and returns the corresponding logrus.Level. func ParseLogLevel(logLevel string) logrus.Level { switch strings.ToLower(logLevel) { diff --git a/internal/jobs/stats/stats.go b/internal/jobs/stats/stats.go index ff7d3044..44946460 100644 --- a/internal/jobs/stats/stats.go +++ b/internal/jobs/stats/stats.go @@ -25,9 +25,9 @@ const ( TwitterAuthErrors StatType = "twitter_auth_errors" TwitterRateErrors StatType = "twitter_ratelimit_errors" TwitterXSearchQueries StatType = "twitterx_search" // TODO: investigate if this is needed or used... - WebSuccess StatType = "web_success" + WebQueries StatType = "web_queries" + WebScrapedPages StatType = "web_scraped_pages" WebErrors StatType = "web_errors" - WebInvalid StatType = "web_invalid" TikTokTranscriptionSuccess StatType = "tiktok_transcription_success" TikTokTranscriptionErrors StatType = "tiktok_transcription_errors" TikTokVideos StatType = "tiktok_returned_videos" @@ -65,7 +65,7 @@ type StatsCollector struct { Stats *Stats Chan chan AddStat jobServer capabilities.JobServerInterface - jobConfiguration config.JobConfiguration + jobConfiguration config.JobConfiguration } // StartCollector starts a goroutine that listens to a channel for AddStat messages and updates the stats accordingly. diff --git a/internal/jobs/telemetry_test.go b/internal/jobs/telemetry_test.go index e3175045..7c2b4732 100644 --- a/internal/jobs/telemetry_test.go +++ b/internal/jobs/telemetry_test.go @@ -33,7 +33,8 @@ var _ = Describe("Telemetry Job", func() { Context("Telemetry Data Fetching", func() { It("should fetch telemetry data and log it", func() { // Add some test stats to the collector - statsCollector.Add("test-worker-1", stats.WebSuccess, 5) + statsCollector.Add("test-worker-1", stats.WebQueries, 5) + statsCollector.Add("test-worker-1", stats.WebScrapedPages, 10) statsCollector.Add("test-worker-1", stats.WebErrors, 2) statsCollector.Add("test-worker-2", stats.TwitterScrapes, 10) statsCollector.Add("test-worker-2", stats.TwitterTweets, 50) diff --git a/internal/jobs/web.go b/internal/jobs/web.go new file mode 100644 index 00000000..3e3ced01 --- /dev/null +++ b/internal/jobs/web.go @@ -0,0 +1,94 @@ +package jobs + +import ( + "encoding/json" + "errors" + "fmt" + + "github.com/sirupsen/logrus" + + "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/internal/config" + "github.com/masa-finance/tee-worker/internal/jobs/stats" + "github.com/masa-finance/tee-worker/internal/jobs/webapify" + "github.com/masa-finance/tee-worker/pkg/client" + + teeargs "github.com/masa-finance/tee-types/args" + teetypes "github.com/masa-finance/tee-types/types" +) + +// WebApifyClient defines the interface for the Web Apify client to allow mocking in tests +type WebApifyClient interface { + Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, client.Cursor, error) +} + +// NewWebApifyClient is a function variable that can be replaced in tests. +// It defaults to the actual implementation. +var NewWebApifyClient = func(apiKey string, statsCollector *stats.StatsCollector) (WebApifyClient, error) { + return webapify.NewClient(apiKey, statsCollector) +} + +type WebScraper struct { + configuration config.WebConfig + statsCollector *stats.StatsCollector + capabilities []teetypes.Capability +} + +func NewWebScraper(jc config.JobConfiguration, statsCollector *stats.StatsCollector) *WebScraper { + cfg := jc.GetWebConfig() + logrus.Info("Web scraper via Apify initialized") + return &WebScraper{ + configuration: cfg, + statsCollector: statsCollector, + capabilities: teetypes.WebCaps, + } +} + +func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { + logrus.WithField("job_uuid", j.UUID).Info("Starting ExecuteJob for Web scrape") + + jobArgs, err := teeargs.UnmarshalJobArguments(teetypes.JobType(j.Type), map[string]any(j.Arguments)) + if err != nil { + msg := fmt.Errorf("failed to unmarshal job arguments: %w", err) + return types.JobResult{Error: msg.Error()}, msg + } + + webArgs, ok := jobArgs.(*teeargs.WebArguments) + if !ok { + return types.JobResult{Error: "invalid argument type for Web job"}, errors.New("invalid argument type") + } + logrus.Debugf("web job args: %+v", *webArgs) + + webClient, err := NewWebApifyClient(w.configuration.ApifyApiKey, w.statsCollector) + if err != nil { + return types.JobResult{Error: "error while scraping Web"}, fmt.Errorf("error creating Web Apify client: %w", err) + } + + resp, cursor, err := webClient.Scrape(j.WorkerID, *webArgs, client.EmptyCursor) + if err != nil { + return types.JobResult{Error: fmt.Sprintf("error while scraping Web: %s", err.Error())}, fmt.Errorf("error scraping Web: %w", err) + } + + data, err := json.Marshal(resp) + if err != nil { + return types.JobResult{Error: fmt.Sprintf("error marshalling Web response")}, fmt.Errorf("error marshalling Web response: %w", err) + } + + return types.JobResult{ + Data: data, + Job: j, + NextCursor: cursor.String(), + }, nil +} + +// GetStructuredCapabilities returns the structured capabilities supported by the Web scraper +// based on the available credentials and API keys +func (ws *WebScraper) GetStructuredCapabilities() teetypes.WorkerCapabilities { + capabilities := make(teetypes.WorkerCapabilities) + + if ws.configuration.ApifyApiKey != "" && ws.configuration.GeminiApiKey != "" { + capabilities[teetypes.WebJob] = teetypes.WebCaps + } + + return capabilities +} diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go new file mode 100644 index 00000000..598f7c99 --- /dev/null +++ b/internal/jobs/web_test.go @@ -0,0 +1,127 @@ +package jobs_test + +import ( + "encoding/json" + "errors" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/internal/config" + "github.com/masa-finance/tee-worker/internal/jobs" + "github.com/masa-finance/tee-worker/internal/jobs/stats" + "github.com/masa-finance/tee-worker/pkg/client" + + teeargs "github.com/masa-finance/tee-types/args" + teetypes "github.com/masa-finance/tee-types/types" +) + +// MockWebApifyClient is a mock implementation of the WebApifyClient. +type MockWebApifyClient struct { + ScrapeFunc func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, client.Cursor, error) +} + +func (m *MockWebApifyClient) Scrape(_ string, args teeargs.WebArguments, _ client.Cursor) ([]*teetypes.WebScraperResult, client.Cursor, error) { + if m != nil && m.ScrapeFunc != nil { + res, next, err := m.ScrapeFunc(args) + return res, next, err + } + return nil, client.EmptyCursor, nil +} + +var _ = Describe("WebScraper", func() { + var ( + scraper *jobs.WebScraper + statsCollector *stats.StatsCollector + job types.Job + mockClient *MockWebApifyClient + ) + + BeforeEach(func() { + statsCollector = stats.StartCollector(128, config.JobConfiguration{}) + cfg := config.JobConfiguration{ + "apify_api_key": "test-key", + } + scraper = jobs.NewWebScraper(cfg, statsCollector) + mockClient = &MockWebApifyClient{} + + // Replace the client creation function with one that returns the mock + jobs.NewWebApifyClient = func(apiKey string, _ *stats.StatsCollector) (jobs.WebApifyClient, error) { + return mockClient, nil + } + + job = types.Job{ + UUID: "test-uuid", + Type: teetypes.WebJob, + } + }) + + Context("ExecuteJob", func() { + It("should return an error for invalid arguments", func() { + job.Arguments = map[string]any{"invalid": "args"} + result, err := scraper.ExecuteJob(job) + Expect(err).To(HaveOccurred()) + Expect(result.Error).To(ContainSubstring("failed to unmarshal job arguments")) + }) + + It("should call Scrape and return data and next cursor", func() { + job.Arguments = map[string]any{ + "type": teetypes.WebScraper, + "url": "https://example.com", + "max_depth": 1, + "max_pages": 2, + } + + mockClient.ScrapeFunc = func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, client.Cursor, error) { + Expect(args.URL).To(Equal("https://example.com")) + return []*teetypes.WebScraperResult{{URL: "https://example.com", Markdown: "# Hello"}}, client.Cursor("next-cursor"), nil + } + + result, err := scraper.ExecuteJob(job) + Expect(err).NotTo(HaveOccurred()) + Expect(result.NextCursor).To(Equal("next-cursor")) + var resp []*teetypes.WebScraperResult + err = json.Unmarshal(result.Data, &resp) + Expect(err).NotTo(HaveOccurred()) + Expect(resp).To(HaveLen(1)) + Expect(resp[0]).NotTo(BeNil()) + Expect(resp[0].URL).To(Equal("https://example.com")) + }) + + It("should handle errors from the web client", func() { + job.Arguments = map[string]any{ + "type": teetypes.WebScraper, + "url": "https://example.com", + "max_depth": 0, + "max_pages": 1, + } + + expectedErr := errors.New("client error") + mockClient.ScrapeFunc = func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, client.Cursor, error) { + return nil, client.EmptyCursor, expectedErr + } + + result, err := scraper.ExecuteJob(job) + Expect(err).To(HaveOccurred()) + Expect(err).To(MatchError(ContainSubstring("client error"))) + Expect(result.Error).To(ContainSubstring("error while scraping Web: client error")) + }) + + It("should handle errors when creating the client", func() { + jobs.NewWebApifyClient = func(apiKey string, _ *stats.StatsCollector) (jobs.WebApifyClient, error) { + return nil, errors.New("client creation failed") + } + job.Arguments = map[string]any{ + "type": teetypes.WebScraper, + "url": "https://example.com", + "max_depth": 0, + "max_pages": 1, + } + + result, err := scraper.ExecuteJob(job) + Expect(err).To(HaveOccurred()) + Expect(result.Error).To(Equal("error while scraping Web")) + }) + }) +}) diff --git a/internal/jobs/webapify/client.go b/internal/jobs/webapify/client.go new file mode 100644 index 00000000..ecd423f0 --- /dev/null +++ b/internal/jobs/webapify/client.go @@ -0,0 +1,81 @@ +package webapify + +import ( + "encoding/json" + "fmt" + + teeargs "github.com/masa-finance/tee-types/args" + teetypes "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/internal/jobs/stats" + "github.com/masa-finance/tee-worker/pkg/client" + "github.com/sirupsen/logrus" +) + +const ( + WebActorID = "apify~website-content-crawler" +) + +type WebApifyClient struct { + apifyClient client.Apify + statsCollector *stats.StatsCollector +} + +// NewInternalClient is a function variable that can be replaced in tests. +// It defaults to the actual implementation. +var NewInternalClient = func(apiKey string) (client.Apify, error) { + return client.NewApifyClient(apiKey) +} + +// NewClient creates a new Reddit Apify client +func NewClient(apiToken string, statsCollector *stats.StatsCollector) (*WebApifyClient, error) { + apifyClient, err := NewInternalClient(apiToken) + if err != nil { + return nil, fmt.Errorf("failed to create apify client: %w", err) + } + + return &WebApifyClient{ + apifyClient: apifyClient, + statsCollector: statsCollector, + }, nil +} + +// ValidateApiKey tests if the Apify API token is valid +func (c *WebApifyClient) ValidateApiKey() error { + return c.apifyClient.ValidateApiKey() +} + +func (c *WebApifyClient) Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, client.Cursor, error) { + if c.statsCollector != nil { + c.statsCollector.Add(workerID, stats.WebQueries, 1) + } + + input := args.ToWebScraperRequest() + + // TODO: limit could be greater than max pages if max depth is greater than 0? + // TODO: need to test this more thoroughly with various request types + limit := uint(args.MaxPages) + dataset, nextCursor, err := c.apifyClient.RunActorAndGetResponse(WebActorID, input, cursor, limit) + if err != nil { + if c.statsCollector != nil { + c.statsCollector.Add(workerID, stats.WebErrors, 1) + } + return nil, client.EmptyCursor, err + } + + response := make([]*teetypes.WebScraperResult, 0, len(dataset.Data.Items)) + + for i, item := range dataset.Data.Items { + var resp teetypes.WebScraperResult + if err := json.Unmarshal(item, &resp); err != nil { + logrus.Warnf("Failed to unmarshal scrape result at index %d: %v", i, err) + continue + } + response = append(response, &resp) + } + + if c.statsCollector != nil { + c.statsCollector.Add(workerID, stats.WebScrapedPages, uint(len(response))) + } + + return response, nextCursor, nil +} diff --git a/internal/jobs/webscraper.go b/internal/jobs/webscraper.go deleted file mode 100644 index e6b49fc9..00000000 --- a/internal/jobs/webscraper.go +++ /dev/null @@ -1,282 +0,0 @@ -package jobs - -import ( - "encoding/json" - "fmt" - "net/http" - "strconv" - "strings" - "time" - - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" - - "github.com/cenkalti/backoff" - "github.com/gocolly/colly" - "github.com/masa-finance/tee-worker/api/types" - "github.com/masa-finance/tee-worker/internal/config" - "github.com/masa-finance/tee-worker/internal/jobs/stats" - "github.com/sirupsen/logrus" -) - -type WebScraper struct { - configuration WebScraperConfiguration - stats *stats.StatsCollector -} - -type WebScraperConfiguration struct { - Blacklist []string `json:"webscraper_blacklist"` -} - -func NewWebScraper(jc config.JobConfiguration, statsCollector *stats.StatsCollector) *WebScraper { - config := WebScraperConfiguration{} - jc.Unmarshal(&config) - return &WebScraper{ - configuration: config, - stats: statsCollector, - } -} - -// GetStructuredCapabilities returns the structured capabilities supported by the web scraper -func (ws *WebScraper) GetStructuredCapabilities() teetypes.WorkerCapabilities { - return teetypes.WorkerCapabilities{ - teetypes.WebJob: teetypes.AlwaysAvailableWebCaps, - } -} - -func (ws *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { - logrus.Info("Starting ExecuteJob for web scraper") - - // Step 1: Use centralized type-safe unmarshaller - jobArgs, err := teeargs.UnmarshalJobArguments(teetypes.JobType(j.Type), map[string]any(j.Arguments)) - if err != nil { - logrus.Warnf("Failed to unmarshal job arguments: %v", err) - ws.stats.Add(j.WorkerID, stats.WebInvalid, 1) - return types.JobResult{Error: fmt.Sprintf("Invalid arguments: %v", err)}, nil - } - - // Type assert to Web arguments - args, ok := jobArgs.(*teeargs.WebSearchArguments) - if !ok { - logrus.Errorf("Expected Web arguments for job ID %s, type %s", j.UUID, j.Type) - return types.JobResult{Error: "invalid argument type for Web job"}, nil - } - logrus.Debugf("Job arguments unmarshaled and validated successfully: %+v", args) - - // Step 2: Validate URL against blacklist - logrus.Debug("Validating URL against blacklist") - for _, u := range ws.configuration.Blacklist { - logrus.Debugf("Checking if URL contains blacklisted term: %s", u) - if strings.Contains(args.URL, u) { - logrus.Warnf("URL %s is blacklisted due to term: %s", args.URL, u) - ws.stats.Add(j.WorkerID, stats.WebInvalid, 1) - return types.JobResult{ - Error: fmt.Sprintf("URL blacklisted: %s", args.URL), - }, nil - } - } - logrus.Infof("URL %s passed blacklist validation", args.URL) - - // Step 3: Use enhanced methods for cleaner logic and validation - logrus.Debugf("Initiating web scraping for URL: %s (max_depth: %d, has_selector: %t, is_deep_scrape: %t)", - args.URL, args.GetEffectiveMaxDepth(), args.HasSelector(), args.IsDeepScrape()) - - // Perform web scraping using the effective max depth - result, err := scrapeWeb([]string{args.URL}, args.GetEffectiveMaxDepth()) - if err != nil { - logrus.Warnf("Web scraping failed for URL %s: %v", args.URL, err) - ws.stats.Add(j.WorkerID, stats.WebErrors, 1) - return types.JobResult{Error: err.Error()}, err - } - logrus.Debugf("Web scraping succeeded for URL %s: %v", args.URL, string(result)) - - // Step 4: Process result and return - logrus.Debugf("Updating statistics for successful web scraping") - ws.stats.Add(j.WorkerID, stats.WebSuccess, 1) - logrus.Infof("Returning web scraping result for URL %s", args.URL) - return types.JobResult{ - Data: result, - }, nil -} - -// Section represents a distinct part of a scraped webpage, typically defined by a heading. -// It contains a Title, representing the heading of the section, and Paragraphs, a slice of strings -// containing the text content found within that section. -type Section struct { - Title string `json:"title"` // Title is the heading text of the section. - Paragraphs []string `json:"paragraphs"` // Paragraphs contains all the text content of the section. - Images []string `json:"images"` // Images storing base64 - maybe!!? -} - -// CollectedData represents the aggregated result of the scraping process. -// It contains a slice of Section structs, each representing a distinct part of a scraped webpage. -type CollectedData struct { - Sections []Section `json:"sections"` // Sections is a collection of webpage sections that have been scraped. - Pages []string `json:"pages"` -} - -// scrapeWeb initiates the scraping process for the given list of URIs. -// It returns a CollectedData struct containing the scraped sections from each URI, -// and an error if any occurred during the scraping process. -// -// Parameters: -// - uri: []string - list of URLs to scrape -// - depth: int - depth of how many subpages to scrape -// -// Returns: -// - []byte - JSON representation of the collected data -// - error - any error that occurred during the scraping process -// -// Example usage: -// -// go func() { -// res, err := scraper.scrapeWeb([]string{"https://en.wikipedia.org/wiki/Maize"}, 5) -// if err != nil { -// logrus.WithError(err).Error("Error collecting data") -// return -// } -// logrus.WithField("result", string(res)).Info("Scraping completed") -// }() -func scrapeWeb(uri []string, depth int) ([]byte, error) { - logrus.Infof("Starting scrapeWeb with parameters: URIs=%v, Depth=%d", uri, depth) - // Set default depth to 1 if 0 is provided - if depth <= 0 { - logrus.Infof("Invalid depth (%d) provided, setting default depth to 1", depth) - depth = 1 - } - - logrus.Info("Initializing CollectedData struct") - var collectedData CollectedData - - logrus.Info("Creating new Colly collector") - c := colly.NewCollector( - colly.Async(true), // Enable asynchronous requests - colly.AllowURLRevisit(), - colly.IgnoreRobotsTxt(), - colly.MaxDepth(depth), - ) - logrus.Info("Colly collector created successfully") - - // Adjust the parallelism and delay based on your needs and server capacity - logrus.Info("Setting scraping limits with parallelism and delay") - limitRule := colly.LimitRule{ - DomainGlob: "*", - Parallelism: 4, // Increased parallelism - Delay: 500 * time.Millisecond, // Reduced delay - } - logrus.Info("Applying scraping limits to the collector") - if err := c.Limit(&limitRule); err != nil { - logrus.Errorf("[-] Unable to set scraper limit. Using default. Error: %v", err) - } - - // Increase the timeout slightly if necessary - logrus.Info("Setting request timeout to 240 seconds") - c.SetRequestTimeout(240 * time.Second) - - // Initialize a backoff strategy - logrus.Info("Initializing exponential backoff strategy") - backoffStrategy := backoff.NewExponentialBackOff() - - logrus.Info("Registering OnError callback to handle request errors") - c.OnError(func(r *colly.Response, err error) { - logrus.Errorf("Error occurred during request to URL: %s. StatusCode: %d, Error: %v", r.Request.URL, r.StatusCode, err) - if r.StatusCode == http.StatusTooManyRequests { - // Parse the Retry-After header (in seconds) - retryAfter, convErr := strconv.Atoi(r.Headers.Get("Retry-After")) - if convErr != nil { - // If not in seconds, it might be a date. Handle accordingly. - logrus.Warnf("Retry-After header is present but unrecognized format: %s", r.Headers.Get("Retry-After")) - } - // Calculate the next delay - nextDelay := backoffStrategy.NextBackOff() - if retryAfter > 0 { - nextDelay = time.Duration(retryAfter) * time.Second - } - logrus.Warnf("Rate limited for URL: %s. Retrying after %v", r.Request.URL, nextDelay) - time.Sleep(nextDelay) - // Retry the request - logrus.Info("Retrying the request") - _ = r.Request.Retry() - - } else { - logrus.Errorf("Request failed for URL: %s with error: %v", r.Request.URL, err) - logrus.Errorf("[-] Request URL: %s failed with error: %v", r.Request.URL, err) - } - }) - - logrus.Info("Registering OnHTML callback for h1, h2 elements (titles)") - c.OnHTML("h1, h2", func(e *colly.HTMLElement) { - logrus.Infof("Title (h1/h2) found: %s", e.Text) - // Directly append a new Section to collectedData.Sections - collectedData.Sections = append(collectedData.Sections, Section{Title: e.Text}) - }) - - logrus.Info("Registering OnHTML callback for paragraph elements") - c.OnHTML("p", func(e *colly.HTMLElement) { - logrus.Infof("Paragraph detected: %s", e.Text) - // Check if there are any sections to append paragraphs to - if len(collectedData.Sections) > 0 { - // Get a reference to the last section - lastSection := &collectedData.Sections[len(collectedData.Sections)-1] - // Append the paragraph to the last section - // Check for duplicate paragraphs before appending - isDuplicate := false - for _, paragraph := range lastSection.Paragraphs { - if paragraph == e.Text { - isDuplicate = true - break - } - } - // Handle dupes - if !isDuplicate { - lastSection.Paragraphs = append(lastSection.Paragraphs, e.Text) - } - } - }) - - logrus.Info("Registering OnHTML callback for image elements") - c.OnHTML("img", func(e *colly.HTMLElement) { - logrus.Infof("Image detected with source URL: %s", e.Attr("src")) - imageURL := e.Request.AbsoluteURL(e.Attr("src")) - if len(collectedData.Sections) > 0 { - lastSection := &collectedData.Sections[len(collectedData.Sections)-1] - lastSection.Images = append(lastSection.Images, imageURL) - } - }) - - logrus.Info("Registering OnHTML callback for anchor elements") - c.OnHTML("a", func(e *colly.HTMLElement) { - logrus.Infof("Link detected: %s", e.Attr("href")) - pageURL := e.Request.AbsoluteURL(e.Attr("href")) - // Check if the URL protocol is supported (http or https) - if strings.HasPrefix(pageURL, "http://") || strings.HasPrefix(pageURL, "https://") { - collectedData.Pages = append(collectedData.Pages, pageURL) - _ = e.Request.Visit(pageURL) - } - }) - - logrus.Infof("Starting to visit URLs: %v", uri) - for _, u := range uri { - err := c.Visit(u) - if err != nil { - logrus.Errorf("Failed to visit URL: %s. Error: %v", u, err) - continue - } - logrus.Infof("Visiting URL: %s", u) - err = c.Visit(u) - if err != nil { - logrus.Errorf("Failed to visit URL: %s. Error: %v", u, err) - return nil, err - } - } - - // Wait for all requests to finish - logrus.Info("Waiting for all requests to complete") - c.Wait() - - logrus.Info("Scraping completed, marshaling collected data into JSON format") - j, _ := json.Marshal(collectedData) - - logrus.Infof("Scraping successful. Returning data for URIs: %v", uri) - return j, nil -} diff --git a/internal/jobs/webscraper_test.go b/internal/jobs/webscraper_test.go deleted file mode 100644 index 88da18ee..00000000 --- a/internal/jobs/webscraper_test.go +++ /dev/null @@ -1,102 +0,0 @@ -package jobs_test - -import ( - "time" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - - teetypes "github.com/masa-finance/tee-types/types" - "github.com/masa-finance/tee-worker/api/types" - "github.com/masa-finance/tee-worker/internal/config" - . "github.com/masa-finance/tee-worker/internal/jobs" - "github.com/masa-finance/tee-worker/internal/jobs/stats" -) - -var statsCollector *stats.StatsCollector - -var _ = Describe("Webscraper", func() { - BeforeEach(func() { - statsCollector = stats.StartCollector(128, config.JobConfiguration{}) - }) - - It("should scrape now", func() { - webScraper := NewWebScraper(config.JobConfiguration{}, statsCollector) - - j := types.Job{ - Type: teetypes.WebJob, - Arguments: map[string]interface{}{ - "url": "https://www.google.com", - }, - WorkerID: "test", - } - res, err := webScraper.ExecuteJob(j) - Expect(err).NotTo(HaveOccurred()) - Expect(res.Error).To(BeEmpty()) - - var scrapedData CollectedData - err = res.Unmarshal(&scrapedData) - Expect(err).NotTo(HaveOccurred()) - - Expect(scrapedData.Pages).ToNot(BeEmpty()) - - Eventually(func() uint { - return statsCollector.Stats.Stats[j.WorkerID][stats.WebSuccess] - }, 5*time.Second, 10*time.Millisecond).Should(BeNumerically("==", 1)) - Eventually(func() uint { - return statsCollector.Stats.Stats[j.WorkerID][stats.WebErrors] - }, 5*time.Second, 10*time.Millisecond).Should(BeNumerically("==", 0)) - }) - - It("does not return data with invalid hosts", func() { - webScraper := NewWebScraper(config.JobConfiguration{}, statsCollector) - - j := types.Job{ - Type: teetypes.WebJob, - Arguments: map[string]interface{}{ - "url": "google", - }, - WorkerID: "test", - } - res, err := webScraper.ExecuteJob(j) - Expect(err).NotTo(HaveOccurred()) - Expect(res.Error).To(Equal("Invalid arguments: failed to unmarshal web job arguments: failed to unmarshal arguments: URL must include a scheme (http:// or https://)")) - - // Don't attempt to unmarshal since the job failed - Eventually(func() uint { - return statsCollector.Stats.Stats[j.WorkerID][stats.WebSuccess] - }, 5*time.Second, 10*time.Millisecond).Should(BeNumerically("==", 0)) - Eventually(func() uint { - return statsCollector.Stats.Stats[j.WorkerID][stats.WebErrors] - }, 5*time.Second, 10*time.Millisecond).Should(BeNumerically("==", 0)) - Eventually(func() uint { - return statsCollector.Stats.Stats[j.WorkerID][stats.WebInvalid] - }, 5*time.Second, 10*time.Millisecond).Should(BeNumerically("==", 1)) - }) - - It("should allow to blacklist urls", func() { - webScraper := NewWebScraper(config.JobConfiguration{ - "webscraper_blacklist": []string{"https://google.com"}, - }, statsCollector) - - j := types.Job{ - Type: teetypes.WebJob, - Arguments: map[string]interface{}{ - "url": "https://google.com", - }, - WorkerID: "test", - } - res, err := webScraper.ExecuteJob(j) - Expect(err).ToNot(HaveOccurred()) - Expect(res.Error).To(Equal("URL blacklisted: https://google.com")) - Eventually(func() uint { - return statsCollector.Stats.Stats[j.WorkerID][stats.WebSuccess] - }, 5*time.Second, 10*time.Millisecond).Should(BeNumerically("==", 0)) - Eventually(func() uint { - return statsCollector.Stats.Stats[j.WorkerID][stats.WebErrors] - }, 5*time.Second, 10*time.Millisecond).Should(BeNumerically("==", 0)) - Eventually(func() uint { - return statsCollector.Stats.Stats[j.WorkerID][stats.WebInvalid] - }, 5*time.Second, 10*time.Millisecond).Should(BeNumerically("==", 1)) - }) -}) From 448840fd8f34f69d53416fc914f9fff4d9043cb8 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Sun, 7 Sep 2025 17:44:19 +0200 Subject: [PATCH 03/43] chore: fix api test --- internal/api/api_test.go | 5 ++++- internal/jobs/web.go | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 3b1c2edb..006db298 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -10,6 +10,9 @@ import ( . "github.com/onsi/gomega" "github.com/sirupsen/logrus" + "errors" + + "github.com/masa-finance/tee-types/args" teetypes "github.com/masa-finance/tee-types/types" "github.com/masa-finance/tee-worker/api/types" . "github.com/masa-finance/tee-worker/internal/api" @@ -95,7 +98,7 @@ var _ = Describe("API", func() { Expect(encryptedResult).To(BeEmpty()) // The error should be about URL scheme validation - Expect(err.Error()).To(ContainSubstring("URL must include a scheme")) + Expect(errors.Is(err, args.ErrWebURLInvalid)).To(BeTrue()) }) It("should submit a job and get the correct result", func() { diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 3e3ced01..3402ee06 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -74,6 +74,8 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: fmt.Sprintf("error marshalling Web response")}, fmt.Errorf("error marshalling Web response: %w", err) } + // TODO is this where we add the LLM processor? + return types.JobResult{ Data: data, Job: j, From 14a5d5b3753592ade0a1c72c68949abee51e4a2a Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 05:17:18 +0200 Subject: [PATCH 04/43] fix: actually fixes api test suite --- Makefile | 7 +++++-- internal/api/api_test.go | 41 +++++----------------------------------- 2 files changed, 10 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 1417711c..26027bab 100644 --- a/Makefile +++ b/Makefile @@ -70,8 +70,11 @@ test: docker-build-test test-capabilities: docker-build-test @docker run --user root $(ENV_FILE_ARG) -e LOG_LEVEL=debug -v $(PWD)/coverage:/app/coverage --rm --workdir /app $(TEST_IMAGE) go test -coverprofile=coverage/coverage-capabilities.txt -covermode=atomic -v ./internal/capabilities +test-api: docker-build-test + @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/api + test-jobs: docker-build-test - @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -coverprofile=coverage/coverage-jobs.txt -covermode=atomic -v ./internal/jobs + @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs test-twitter: docker-build-test @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/twitter_test.go ./internal/jobs/jobs_suite_test.go @@ -86,4 +89,4 @@ test-web: docker-build-test @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/webscraper_test.go ./internal/jobs/jobs_suite_test.go test-telemetry: docker-build-test - @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/telemetry_test.go ./internal/jobs/jobs_suite_test.go + @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/telemetry_test.go ./internal/jobs/jobs_suite_test.go \ No newline at end of file diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 006db298..a2e2caab 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -10,9 +10,6 @@ import ( . "github.com/onsi/gomega" "github.com/sirupsen/logrus" - "errors" - - "github.com/masa-finance/tee-types/args" teetypes "github.com/masa-finance/tee-types/types" "github.com/masa-finance/tee-worker/api/types" . "github.com/masa-finance/tee-worker/internal/api" @@ -73,41 +70,15 @@ var _ = Describe("API", func() { cancel() }) - It("should submit an invalid job, and fail because of the malformed URL. no results containing google", func() { - // Step 1: Create the job request - job := types.Job{ - Type: teetypes.WebJob, - Arguments: map[string]interface{}{ - "url": "google", - }, - } - - // Step 2: Get a Job signature - jobSignature, err := clientInstance.CreateJobSignature(job) - Expect(err).NotTo(HaveOccurred()) - Expect(jobSignature).NotTo(BeEmpty()) - - // Step 3: Submit the job - jobResult, err := clientInstance.SubmitJob(jobSignature) - Expect(err).NotTo(HaveOccurred()) - Expect(jobResult.UUID).NotTo(BeEmpty()) - - // Step 4: Wait for the job result - should fail due to invalid URL - encryptedResult, err := jobResult.Get() - Expect(err).To(HaveOccurred()) - Expect(encryptedResult).To(BeEmpty()) - - // The error should be about URL scheme validation - Expect(errors.Is(err, args.ErrWebURLInvalid)).To(BeTrue()) - }) - It("should submit a job and get the correct result", func() { // Step 1: Create the job request + // we use TikTok transcription here as it's supported by all workers without any unique config job := types.Job{ - Type: teetypes.WebJob, + Type: teetypes.TiktokJob, Arguments: map[string]interface{}{ - "url": "https://google.com", - "depth": 1, + "type": "transcription", + "video_url": "https://www.tiktok.com/@.jake.ai/video/7516694182245813509", + "language": "eng-US", }, } // Step 2: Get a Job signature @@ -130,12 +101,10 @@ var _ = Describe("API", func() { Expect(err).NotTo(HaveOccurred()) Expect(decryptedResult).NotTo(BeEmpty()) - Expect(decryptedResult).To(ContainSubstring("google")) result, err := jobResult.GetDecrypted(jobSignature) Expect(err).NotTo(HaveOccurred()) Expect(result).NotTo(BeEmpty()) - Expect(result).To(ContainSubstring("google")) }) It("bubble up errors", func() { From e92cba713b4d7c967eb9166090f0c8759e1b3df4 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 05:56:45 +0200 Subject: [PATCH 05/43] fix: web tests --- Makefile | 6 +- internal/jobs/webapify/client_test.go | 200 ++++++++++++++++++++++++++ 2 files changed, 203 insertions(+), 3 deletions(-) create mode 100644 internal/jobs/webapify/client_test.go diff --git a/Makefile b/Makefile index 26027bab..81860607 100644 --- a/Makefile +++ b/Makefile @@ -68,7 +68,7 @@ test: docker-build-test docker run --user root $(ENV_FILE_ARG) -e LOG_LEVEL=debug -v $(PWD)/coverage:/app/coverage --rm --workdir /app $(TEST_IMAGE) go test -coverprofile=coverage/coverage.txt -covermode=atomic -v $(TEST_ARGS) test-capabilities: docker-build-test - @docker run --user root $(ENV_FILE_ARG) -e LOG_LEVEL=debug -v $(PWD)/coverage:/app/coverage --rm --workdir /app $(TEST_IMAGE) go test -coverprofile=coverage/coverage-capabilities.txt -covermode=atomic -v ./internal/capabilities + @docker run --user root $(ENV_FILE_ARG) -e LOG_LEVEL=debug -v $(PWD)/coverage:/app/coverage --rm --workdir /app $(TEST_IMAGE) go test -v ./internal/capabilities test-api: docker-build-test @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/api @@ -86,7 +86,7 @@ test-reddit: docker-build-test @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/reddit_test.go ./internal/jobs/redditapify/client_test.go ./api/types/reddit/reddit_suite_test.go test-web: docker-build-test - @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/webscraper_test.go ./internal/jobs/jobs_suite_test.go + @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/web_test.go ./internal/jobs/webapify/client_test.go ./internal/jobs/jobs_suite_test.go test-telemetry: docker-build-test - @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/telemetry_test.go ./internal/jobs/jobs_suite_test.go \ No newline at end of file + @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/telemetry_test.go \ No newline at end of file diff --git a/internal/jobs/webapify/client_test.go b/internal/jobs/webapify/client_test.go new file mode 100644 index 00000000..8d5cf802 --- /dev/null +++ b/internal/jobs/webapify/client_test.go @@ -0,0 +1,200 @@ +package webapify_test + +import ( + "encoding/json" + "errors" + "os" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/masa-finance/tee-worker/internal/jobs/webapify" + "github.com/masa-finance/tee-worker/pkg/client" + + teeargs "github.com/masa-finance/tee-types/args" +) + +// MockApifyClient is a mock implementation of the ApifyClient. +type MockApifyClient struct { + RunActorAndGetResponseFunc func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) + ValidateApiKeyFunc func() error +} + +func (m *MockApifyClient) RunActorAndGetResponse(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + if m.RunActorAndGetResponseFunc != nil { + return m.RunActorAndGetResponseFunc(actorID, input, cursor, limit) + } + return nil, "", errors.New("RunActorAndGetResponseFunc not defined") +} + +func (m *MockApifyClient) ValidateApiKey() error { + if m.ValidateApiKeyFunc != nil { + return m.ValidateApiKeyFunc() + } + return errors.New("ValidateApiKeyFunc not defined") +} + +var _ = Describe("WebApifyClient", func() { + var ( + mockClient *MockApifyClient + webClient *webapify.WebApifyClient + apifyKey string + ) + + BeforeEach(func() { + apifyKey = os.Getenv("APIFY_API_KEY") + mockClient = &MockApifyClient{} + // Replace the client creation function with one that returns the mock + webapify.NewInternalClient = func(apiKey string) (client.Apify, error) { + return mockClient, nil + } + var err error + webClient, err = webapify.NewClient("test-token", nil) + Expect(err).NotTo(HaveOccurred()) + }) + + Describe("Scrape", func() { + It("should construct the correct actor input", func() { + args := teeargs.WebArguments{ + URL: "https://example.com", + MaxDepth: 1, + MaxPages: 2, + } + + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + Expect(actorID).To(Equal(webapify.WebActorID)) + Expect(limit).To(Equal(uint(2))) + return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil + } + + _, _, err := webClient.Scrape("test-worker", args, client.EmptyCursor) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should handle errors from the apify client", func() { + expectedErr := errors.New("apify error") + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + return nil, "", expectedErr + } + + args := teeargs.WebArguments{ + URL: "https://example.com", + MaxDepth: 0, + MaxPages: 1, + } + _, _, err := webClient.Scrape("test-worker", args, client.EmptyCursor) + Expect(err).To(MatchError(expectedErr)) + }) + + It("should handle JSON unmarshalling errors gracefully", func() { + invalidJSON := []byte(`{"url": "test", "markdown": 123}`) // markdown should be a string + dataset := &client.DatasetResponse{ + Data: client.ApifyDatasetData{ + Items: []json.RawMessage{invalidJSON}, + }, + } + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + return dataset, "next", nil + } + + args := teeargs.WebArguments{ + URL: "https://example.com", + MaxDepth: 0, + MaxPages: 1, + } + results, _, err := webClient.Scrape("test-worker", args, client.EmptyCursor) + Expect(err).NotTo(HaveOccurred()) + Expect(results).To(BeEmpty()) // The invalid item should be skipped + }) + + It("should correctly unmarshal valid items", func() { + webResultJSON, _ := json.Marshal(map[string]any{ + "url": "https://example.com", + "markdown": "# Hello World", + "title": "Example", + }) + dataset := &client.DatasetResponse{ + Data: client.ApifyDatasetData{ + Items: []json.RawMessage{webResultJSON}, + }, + } + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + return dataset, "next", nil + } + + args := teeargs.WebArguments{ + URL: "https://example.com", + MaxDepth: 0, + MaxPages: 1, + } + results, cursor, err := webClient.Scrape("test-worker", args, client.EmptyCursor) + Expect(err).NotTo(HaveOccurred()) + Expect(cursor).To(Equal(client.Cursor("next"))) + Expect(results).To(HaveLen(1)) + Expect(results[0].URL).To(Equal("https://example.com")) + Expect(results[0].Markdown).To(Equal("# Hello World")) + }) + }) + + Describe("ValidateApiKey", func() { + It("should validate the API key", func() { + mockClient.ValidateApiKeyFunc = func() error { + return nil + } + Expect(webClient.ValidateApiKey()).To(Succeed()) + }) + + It("should return error when validation fails", func() { + expectedErr := errors.New("invalid key") + mockClient.ValidateApiKeyFunc = func() error { + return expectedErr + } + Expect(webClient.ValidateApiKey()).To(MatchError(expectedErr)) + }) + }) + + // Integration tests that use the real client + Context("Integration tests", func() { + It("should validate API key with real client when APIFY_API_KEY is set", func() { + if apifyKey == "" { + Skip("APIFY_API_KEY is not set") + } + + // Reset to use real client + webapify.NewInternalClient = func(apiKey string) (client.Apify, error) { + return client.NewApifyClient(apiKey) + } + + realClient, err := webapify.NewClient(apifyKey, nil) + Expect(err).NotTo(HaveOccurred()) + Expect(realClient.ValidateApiKey()).To(Succeed()) + }) + + It("should scrape a real URL when APIFY_API_KEY is set", func() { + if apifyKey == "" { + Skip("APIFY_API_KEY is not set") + } + + // Reset to use real client + webapify.NewInternalClient = func(apiKey string) (client.Apify, error) { + return client.NewApifyClient(apiKey) + } + + realClient, err := webapify.NewClient(apifyKey, nil) + Expect(err).NotTo(HaveOccurred()) + + args := teeargs.WebArguments{ + URL: "https://example.com", + MaxDepth: 0, + MaxPages: 1, + } + + results, cursor, err := realClient.Scrape("test-worker", args, client.EmptyCursor) + Expect(err).NotTo(HaveOccurred()) + Expect(results).NotTo(BeEmpty()) + Expect(results[0]).NotTo(BeNil()) + Expect(results[0].URL).To(Equal("https://example.com")) + Expect(cursor).NotTo(BeEmpty()) + }) + }) +}) From ba5cdc2e172d2771a96b85862491f10ed75da6f6 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 06:22:44 +0200 Subject: [PATCH 06/43] fix: web tests --- Makefile | 3 +- internal/jobs/web_test.go | 74 +++++++++++++++++++ internal/jobs/webapify/client_test.go | 2 +- internal/jobs/webapify/webapify_suite_test.go | 13 ++++ 4 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 internal/jobs/webapify/webapify_suite_test.go diff --git a/Makefile b/Makefile index 81860607..5e51a264 100644 --- a/Makefile +++ b/Makefile @@ -86,7 +86,8 @@ test-reddit: docker-build-test @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/reddit_test.go ./internal/jobs/redditapify/client_test.go ./api/types/reddit/reddit_suite_test.go test-web: docker-build-test - @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/web_test.go ./internal/jobs/webapify/client_test.go ./internal/jobs/jobs_suite_test.go + @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) sh -c "cd /app && go test -v ./internal/jobs/web_test.go ./internal/jobs/jobs_suite_test.go" + @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) sh -c "cd /app && go test -v ./internal/jobs/webapify" test-telemetry: docker-build-test @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/telemetry_test.go \ No newline at end of file diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 598f7c99..7d15956f 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -3,6 +3,7 @@ package jobs_test import ( "encoding/json" "errors" + "os" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -11,6 +12,7 @@ import ( "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs" "github.com/masa-finance/tee-worker/internal/jobs/stats" + "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" teeargs "github.com/masa-finance/tee-types/args" @@ -124,4 +126,76 @@ var _ = Describe("WebScraper", func() { Expect(result.Error).To(Equal("error while scraping Web")) }) }) + + // Integration tests that use the real client + Context("Integration tests", func() { + var ( + apifyKey string + geminiKey string + ) + + BeforeEach(func() { + apifyKey = os.Getenv("APIFY_API_KEY") + geminiKey = os.Getenv("GEMINI_API_KEY") + + // Reset to use real client for integration tests + jobs.NewWebApifyClient = func(apiKey string, s *stats.StatsCollector) (jobs.WebApifyClient, error) { + return webapify.NewClient(apiKey, s) + } + }) + + It("should execute a real web scraping job when APIFY_API_KEY is set", func() { + if apifyKey == "" { + Skip("APIFY_API_KEY is not set") + } + + cfg := config.JobConfiguration{ + "apify_api_key": apifyKey, + "gemini_api_key": geminiKey, + } + integrationStatsCollector := stats.StartCollector(128, cfg) + integrationScraper := jobs.NewWebScraper(cfg, integrationStatsCollector) + + job := types.Job{ + UUID: "integration-test-uuid", + Type: teetypes.WebJob, + Arguments: map[string]any{ + "type": teetypes.WebScraper, + "url": "https://example.com", + "max_depth": 0, + "max_pages": 1, + }, + } + + result, err := integrationScraper.ExecuteJob(job) + Expect(err).NotTo(HaveOccurred()) + Expect(result.Error).To(BeEmpty()) + Expect(result.Data).NotTo(BeEmpty()) + + var resp []*teetypes.WebScraperResult + err = json.Unmarshal(result.Data, &resp) + Expect(err).NotTo(HaveOccurred()) + Expect(resp).NotTo(BeEmpty()) + Expect(resp[0]).NotTo(BeNil()) + Expect(resp[0].URL).To(Equal("https://example.com/")) + }) + + It("should expose capabilities only when both APIFY and GEMINI keys are present", func() { + cfg := config.JobConfiguration{ + "apify_api_key": apifyKey, + "gemini_api_key": geminiKey, + } + integrationStatsCollector := stats.StartCollector(128, cfg) + integrationScraper := jobs.NewWebScraper(cfg, integrationStatsCollector) + + caps := integrationScraper.GetStructuredCapabilities() + if apifyKey != "" && geminiKey != "" { + Expect(caps[teetypes.WebJob]).NotTo(BeEmpty()) + } else { + // Expect no capabilities when either key is missing + _, ok := caps[teetypes.WebJob] + Expect(ok).To(BeFalse()) + } + }) + }) }) diff --git a/internal/jobs/webapify/client_test.go b/internal/jobs/webapify/client_test.go index 8d5cf802..f34d8001 100644 --- a/internal/jobs/webapify/client_test.go +++ b/internal/jobs/webapify/client_test.go @@ -193,7 +193,7 @@ var _ = Describe("WebApifyClient", func() { Expect(err).NotTo(HaveOccurred()) Expect(results).NotTo(BeEmpty()) Expect(results[0]).NotTo(BeNil()) - Expect(results[0].URL).To(Equal("https://example.com")) + Expect(results[0].URL).To(Equal("https://example.com/")) Expect(cursor).NotTo(BeEmpty()) }) }) diff --git a/internal/jobs/webapify/webapify_suite_test.go b/internal/jobs/webapify/webapify_suite_test.go new file mode 100644 index 00000000..285fb346 --- /dev/null +++ b/internal/jobs/webapify/webapify_suite_test.go @@ -0,0 +1,13 @@ +package webapify_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestWebApifyClient(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "WebApify Client Suite") +} From 402cdd87f2a87e21196896ab83bab2c4cf9b51dc Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 15:56:06 +0200 Subject: [PATCH 07/43] chore: fix capabilities test --- internal/capabilities/detector_test.go | 37 ++++++++++++++++++++------ internal/jobs/web_test.go | 3 +++ 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/internal/capabilities/detector_test.go b/internal/capabilities/detector_test.go index 40b71d40..0df6c14f 100644 --- a/internal/capabilities/detector_test.go +++ b/internal/capabilities/detector_test.go @@ -8,8 +8,8 @@ import ( . "github.com/onsi/gomega" teetypes "github.com/masa-finance/tee-types/types" - "github.com/masa-finance/tee-worker/internal/config" . "github.com/masa-finance/tee-worker/internal/capabilities" + "github.com/masa-finance/tee-worker/internal/config" ) // MockJobServer implements JobServerInterface for testing @@ -65,7 +65,6 @@ var _ = Describe("DetectCapabilities", func() { config.JobConfiguration{}, nil, teetypes.WorkerCapabilities{ - teetypes.WebJob: {teetypes.CapScraper}, teetypes.TelemetryJob: {teetypes.CapTelemetry}, teetypes.TiktokJob: {teetypes.CapTranscription}, }, @@ -76,7 +75,6 @@ var _ = Describe("DetectCapabilities", func() { }, nil, teetypes.WorkerCapabilities{ - teetypes.WebJob: {teetypes.CapScraper}, teetypes.TelemetryJob: {teetypes.CapTelemetry}, teetypes.TiktokJob: {teetypes.CapTranscription}, teetypes.TwitterCredentialJob: teetypes.TwitterCredentialCaps, @@ -89,7 +87,6 @@ var _ = Describe("DetectCapabilities", func() { }, nil, teetypes.WorkerCapabilities{ - teetypes.WebJob: {teetypes.CapScraper}, teetypes.TelemetryJob: {teetypes.CapTelemetry}, teetypes.TiktokJob: {teetypes.CapTranscription}, teetypes.TwitterApiJob: teetypes.TwitterAPICaps, @@ -102,7 +99,6 @@ var _ = Describe("DetectCapabilities", func() { }, nil, teetypes.WorkerCapabilities{ - teetypes.WebJob: {teetypes.CapScraper}, teetypes.TelemetryJob: {teetypes.CapTelemetry}, teetypes.TiktokJob: {teetypes.CapTranscription}, // Note: Mock elevated keys will be detected as basic since we can't make real API calls in tests @@ -133,19 +129,19 @@ var _ = Describe("DetectCapabilities", func() { }, Entry("Basic scrapers only", config.JobConfiguration{}, - []string{"web", "telemetry", "tiktok"}, + []string{"telemetry", "tiktok"}, ), Entry("With Twitter accounts", config.JobConfiguration{ "twitter_accounts": []string{"user1:pass1"}, }, - []string{"web", "telemetry", "tiktok", "twitter", "twitter-credential"}, + []string{"telemetry", "tiktok", "twitter", "twitter-credential"}, ), Entry("With Twitter API keys", config.JobConfiguration{ "twitter_api_keys": []string{"key1"}, }, - []string{"web", "telemetry", "tiktok", "twitter", "twitter-api"}, + []string{"telemetry", "tiktok", "twitter", "twitter-api"}, ), ) }) @@ -179,6 +175,31 @@ var _ = Describe("DetectCapabilities", func() { _, hasReddit := caps[teetypes.RedditJob] Expect(hasReddit).To(BeTrue(), "expected reddit capabilities to be present") }) + It("should add enhanced capabilities when valid Apify API key is provided alongside a Gemini API key", func() { + apifyKey := os.Getenv("APIFY_API_KEY") + if apifyKey == "" { + Skip("APIFY_API_KEY is not set") + } + + geminiKey := os.Getenv("GEMINI_API_KEY") + if geminiKey == "" { + Skip("GEMINI_API_KEY is not set") + } + + jc := config.JobConfiguration{ + "apify_api_key": apifyKey, + "gemini_api_key": geminiKey, + } + caps := DetectCapabilities(jc, nil) + + // Web should be present + _, hasWeb := caps[teetypes.WebJob] + Expect(hasWeb).To(BeTrue(), "expected web capabilities to be present") + + // LLM should be present + _, hasLLM := caps[teetypes.LLMJob] + Expect(hasLLM).To(BeTrue(), "expected LLM capabilities to be present") + }) }) }) diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 7d15956f..e46015fe 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -178,6 +178,9 @@ var _ = Describe("WebScraper", func() { Expect(resp).NotTo(BeEmpty()) Expect(resp[0]).NotTo(BeNil()) Expect(resp[0].URL).To(Equal("https://example.com/")) + + // TODO verify stats are increased via the client + }) It("should expose capabilities only when both APIFY and GEMINI keys are present", func() { From 1fab0af990397bfdd1c29a15c51dd70fe7a0f79f Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 21:34:43 +0200 Subject: [PATCH 08/43] feat: adds llm client and tests --- Makefile | 3 + go.mod | 2 +- go.sum | 2 + internal/jobs/llmapify/client.go | 92 ++++++ internal/jobs/llmapify/client_test.go | 272 ++++++++++++++++++ internal/jobs/llmapify/llmapify_suite_test.go | 13 + internal/jobs/stats/stats.go | 3 + 7 files changed, 386 insertions(+), 1 deletion(-) create mode 100644 internal/jobs/llmapify/client.go create mode 100644 internal/jobs/llmapify/client_test.go create mode 100644 internal/jobs/llmapify/llmapify_suite_test.go diff --git a/Makefile b/Makefile index 5e51a264..a695ab73 100644 --- a/Makefile +++ b/Makefile @@ -89,5 +89,8 @@ test-web: docker-build-test @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) sh -c "cd /app && go test -v ./internal/jobs/web_test.go ./internal/jobs/jobs_suite_test.go" @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) sh -c "cd /app && go test -v ./internal/jobs/webapify" +test-llm: docker-build-test + @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) sh -c "cd /app && go test -v ./internal/jobs/llmapify" + test-telemetry: docker-build-test @docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/telemetry_test.go \ No newline at end of file diff --git a/go.mod b/go.mod index 448d4ca6..354f01e4 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 // FIXME: replace when new version is released - github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06 + github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 630512d4..07e8597b 100644 --- a/go.sum +++ b/go.sum @@ -61,6 +61,8 @@ github.com/masa-finance/tee-types v1.1.13 h1:bVXUEF8nXT3bhJE4kcDwcuzfQopid9BbIp0 github.com/masa-finance/tee-types v1.1.13/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06 h1:VPJyZ5M55OjEObOyQq330xBLJ8eyHSfDAQaA7ZC9vec= github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c h1:W0/19aE993pG31cqBzdxvJ5WwNmjtWs8nLC6ZTvtLO4= +github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go new file mode 100644 index 00000000..9a793780 --- /dev/null +++ b/internal/jobs/llmapify/client.go @@ -0,0 +1,92 @@ +package llmapify + +import ( + "encoding/json" + "errors" + "fmt" + + teeargs "github.com/masa-finance/tee-types/args" + teetypes "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/internal/jobs/stats" + "github.com/masa-finance/tee-worker/pkg/client" + "github.com/sirupsen/logrus" +) + +const ( + LLMActorID = "dusan.vystrcil~llm-dataset-processor" +) + +var ( + ErrLlmProviderKeyRequired = errors.New("llm provider key is required") + ErrFailedToCreateApifyClient = errors.New("failed to create apify client") +) + +type LLMApifyClient struct { + apifyClient client.Apify + statsCollector *stats.StatsCollector + llmProviderKey string +} + +// NewInternalClient is a function variable that can be replaced in tests. +// It defaults to the actual implementation. +var NewInternalClient = func(apiKey string) (client.Apify, error) { + return client.NewApifyClient(apiKey) +} + +// NewClient creates a new LLM Apify client +func NewClient(apiToken string, llmProviderKey string, statsCollector *stats.StatsCollector) (*LLMApifyClient, error) { + apifyClient, err := NewInternalClient(apiToken) + if err != nil { + return nil, fmt.Errorf("%w: %v", ErrFailedToCreateApifyClient, err) + } + + if llmProviderKey == "" { + return nil, ErrLlmProviderKeyRequired + } + + return &LLMApifyClient{ + apifyClient: apifyClient, + statsCollector: statsCollector, + llmProviderKey: llmProviderKey, + }, nil +} + +// ValidateApiKey tests if the Apify API token is valid +func (c *LLMApifyClient) ValidateApiKey() error { + return c.apifyClient.ValidateApiKey() +} + +func (c *LLMApifyClient) Process(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) { + if c.statsCollector != nil { + c.statsCollector.Add(workerID, stats.LLMQueries, 1) + } + + input := args.ToLLMProcessorRequest() + input.LLMProviderApiKey = c.llmProviderKey + + limit := uint(1) // TODO, verify you can only ever operate on one dataset at a time + dataset, nextCursor, err := c.apifyClient.RunActorAndGetResponse(LLMActorID, input, cursor, limit) + if err != nil { + if c.statsCollector != nil { + c.statsCollector.Add(workerID, stats.LLMErrors, 1) + } + return nil, client.EmptyCursor, err + } + + response := make([]*teetypes.LLMProcessorResult, 0, len(dataset.Data.Items)) + + for i, item := range dataset.Data.Items { + var resp teetypes.LLMProcessorResult + if err := json.Unmarshal(item, &resp); err != nil { + logrus.Warnf("Failed to unmarshal llm result at index %d: %v", i, err) + continue + } + response = append(response, &resp) + } + + if c.statsCollector != nil { + c.statsCollector.Add(workerID, stats.LLMProcessedItems, uint(len(response))) + } + + return response, nextCursor, nil +} diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go new file mode 100644 index 00000000..0d960f19 --- /dev/null +++ b/internal/jobs/llmapify/client_test.go @@ -0,0 +1,272 @@ +package llmapify_test + +import ( + "encoding/json" + "errors" + "fmt" + "os" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/masa-finance/tee-worker/internal/jobs/llmapify" + "github.com/masa-finance/tee-worker/pkg/client" + + teeargs "github.com/masa-finance/tee-types/args" + teetypes "github.com/masa-finance/tee-types/types" +) + +// MockApifyClient is a mock implementation of the ApifyClient. +type MockApifyClient struct { + RunActorAndGetResponseFunc func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) + ValidateApiKeyFunc func() error +} + +func (m *MockApifyClient) RunActorAndGetResponse(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + if m.RunActorAndGetResponseFunc != nil { + return m.RunActorAndGetResponseFunc(actorID, input, cursor, limit) + } + return nil, "", errors.New("RunActorAndGetResponseFunc not defined") +} + +func (m *MockApifyClient) ValidateApiKey() error { + if m.ValidateApiKeyFunc != nil { + return m.ValidateApiKeyFunc() + } + return errors.New("ValidateApiKeyFunc not defined") +} + +var _ = Describe("LLMApifyClient", func() { + var ( + mockClient *MockApifyClient + llmClient *llmapify.LLMApifyClient + apifyKey string + ) + + BeforeEach(func() { + apifyKey = os.Getenv("APIFY_API_KEY") + mockClient = &MockApifyClient{} + // Replace the client creation function with one that returns the mock + llmapify.NewInternalClient = func(apiKey string) (client.Apify, error) { + return mockClient, nil + } + var err error + llmClient, err = llmapify.NewClient("test-token", "test-llm-key", nil) + Expect(err).NotTo(HaveOccurred()) + }) + + Describe("Process", func() { + It("should construct the correct actor input", func() { + args := teeargs.LLMProcessorArguments{ + DatasetId: "test-dataset-id", + Prompt: "test-prompt", + } + + // Marshal and unmarshal to apply defaults + jsonData, err := json.Marshal(args) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + Expect(actorID).To(Equal(llmapify.LLMActorID)) + Expect(limit).To(Equal(uint(1))) + + // Verify the input is correctly converted to LLMProcessorRequest + request, ok := input.(teetypes.LLMProcessorRequest) + Expect(ok).To(BeTrue()) + Expect(request.InputDatasetId).To(Equal("test-dataset-id")) + Expect(request.Prompt).To(Equal("test-prompt")) + Expect(request.LLMProviderApiKey).To(Equal("test-llm-key")) // should be set from constructor + Expect(request.Model).To(Equal(teeargs.LLMDefaultModel)) // default model + Expect(request.MultipleColumns).To(Equal(teeargs.LLMDefaultMultipleColumns)) // default value + Expect(request.MaxTokens).To(Equal(teeargs.LLMDefaultMaxTokens)) // default value + Expect(request.Temperature).To(Equal(teeargs.LLMDefaultTemperature)) // default value + + return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil + } + + _, _, processErr := llmClient.Process("test-worker", args, client.EmptyCursor) + Expect(processErr).NotTo(HaveOccurred()) + }) + + It("should handle errors from the apify client", func() { + expectedErr := errors.New("apify error") + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + return nil, "", expectedErr + } + + args := teeargs.LLMProcessorArguments{ + DatasetId: "test-dataset-id", + Prompt: "test-prompt", + } + _, _, err := llmClient.Process("test-worker", args, client.EmptyCursor) + Expect(err).To(MatchError(expectedErr)) + }) + + It("should handle JSON unmarshalling errors gracefully", func() { + invalidJSON := []byte(`{"llmresponse": 123}`) // llmresponse should be a string + dataset := &client.DatasetResponse{ + Data: client.ApifyDatasetData{ + Items: []json.RawMessage{invalidJSON}, + }, + } + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + return dataset, "next", nil + } + + args := teeargs.LLMProcessorArguments{ + DatasetId: "test-dataset-id", + Prompt: "test-prompt", + } + results, _, err := llmClient.Process("test-worker", args, client.EmptyCursor) + Expect(err).NotTo(HaveOccurred()) + Expect(results).To(BeEmpty()) // The invalid item should be skipped + }) + + It("should correctly unmarshal valid items", func() { + llmResultJSON, _ := json.Marshal(map[string]any{ + "llmresponse": "This is a summary of the webpage content.", + }) + dataset := &client.DatasetResponse{ + Data: client.ApifyDatasetData{ + Items: []json.RawMessage{llmResultJSON}, + }, + } + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + return dataset, "next", nil + } + + args := teeargs.LLMProcessorArguments{ + DatasetId: "test-dataset-id", + Prompt: "test-prompt", + } + results, cursor, err := llmClient.Process("test-worker", args, client.EmptyCursor) + Expect(err).NotTo(HaveOccurred()) + Expect(cursor).To(Equal(client.Cursor("next"))) + Expect(results).To(HaveLen(1)) + Expect(results[0].LLMResponse).To(Equal("This is a summary of the webpage content.")) + }) + + It("should handle multiple valid results", func() { + llmResult1, _ := json.Marshal(map[string]any{ + "llmresponse": "First summary.", + }) + llmResult2, _ := json.Marshal(map[string]any{ + "llmresponse": "Second summary.", + }) + dataset := &client.DatasetResponse{ + Data: client.ApifyDatasetData{ + Items: []json.RawMessage{llmResult1, llmResult2}, + }, + } + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + return dataset, "next", nil + } + + args := teeargs.LLMProcessorArguments{ + DatasetId: "test-dataset-id", + Prompt: "test-prompt", + } + results, _, err := llmClient.Process("test-worker", args, client.EmptyCursor) + Expect(err).NotTo(HaveOccurred()) + Expect(results).To(HaveLen(2)) + Expect(results[0].LLMResponse).To(Equal("First summary.")) + Expect(results[1].LLMResponse).To(Equal("Second summary.")) + }) + + It("should use custom values when provided", func() { + args := teeargs.LLMProcessorArguments{ + DatasetId: "test-dataset-id", + Prompt: "test-prompt", + MaxTokens: 500, + Temperature: "0.5", + } + + mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { + request, ok := input.(teetypes.LLMProcessorRequest) + Expect(ok).To(BeTrue()) + Expect(request.MaxTokens).To(Equal(500)) + Expect(request.Temperature).To(Equal("0.5")) + Expect(request.LLMProviderApiKey).To(Equal("test-llm-key")) // should be set from constructor + + return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil + } + + _, _, err := llmClient.Process("test-worker", args, client.EmptyCursor) + Expect(err).NotTo(HaveOccurred()) + }) + }) + + Describe("ValidateApiKey", func() { + It("should validate the API key", func() { + mockClient.ValidateApiKeyFunc = func() error { + return nil + } + Expect(llmClient.ValidateApiKey()).To(Succeed()) + }) + + It("should return error when validation fails", func() { + expectedErr := errors.New("invalid key") + mockClient.ValidateApiKeyFunc = func() error { + return expectedErr + } + Expect(llmClient.ValidateApiKey()).To(MatchError(expectedErr)) + }) + }) + + // Integration tests that use the real client + Context("Integration tests", func() { + It("should validate API key with real client when both APIFY_API_KEY and GEMINI_API_KEY are set", func() { + geminiKey := os.Getenv("GEMINI_API_KEY") + if apifyKey == "" || geminiKey == "" { + Skip("Both APIFY_API_KEY and GEMINI_API_KEY must be set for integration tests") + } + + // Reset to use real client + llmapify.NewInternalClient = func(apiKey string) (client.Apify, error) { + return client.NewApifyClient(apiKey) + } + + realClient, err := llmapify.NewClient(apifyKey, geminiKey, nil) + Expect(err).NotTo(HaveOccurred()) + Expect(realClient.ValidateApiKey()).To(Succeed()) + }) + + It("should process a real dataset when both APIFY_API_KEY and GEMINI_API_KEY are set", func() { + geminiKey := os.Getenv("GEMINI_API_KEY") + if apifyKey == "" || geminiKey == "" { + Skip("Both APIFY_API_KEY and GEMINI_API_KEY must be set for integration tests") + } + + // Reset to use real client + llmapify.NewInternalClient = func(apiKey string) (client.Apify, error) { + return client.NewApifyClient(apiKey) + } + + realClient, err := llmapify.NewClient(apifyKey, geminiKey, nil) + Expect(err).NotTo(HaveOccurred()) + + args := teeargs.LLMProcessorArguments{ + DatasetId: "V6tyuuZIgfiETl1cl", + Prompt: "summarize the content of this webpage ${markdown}", + } + // Marshal and unmarshal to apply defaults + jsonData, err := json.Marshal(args) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + + results, cursor, err := realClient.Process("test-worker", args, client.EmptyCursor) + Expect(err).NotTo(HaveOccurred()) + Expect(results).NotTo(BeEmpty()) + Expect(results[0]).NotTo(BeNil()) + Expect(results[0].LLMResponse).NotTo(BeEmpty()) + Expect(cursor).NotTo(BeEmpty()) + + prettyJSON, err := json.MarshalIndent(results, "", " ") + Expect(err).NotTo(HaveOccurred()) + fmt.Println(string(prettyJSON)) + }) + }) +}) diff --git a/internal/jobs/llmapify/llmapify_suite_test.go b/internal/jobs/llmapify/llmapify_suite_test.go new file mode 100644 index 00000000..1533e4e1 --- /dev/null +++ b/internal/jobs/llmapify/llmapify_suite_test.go @@ -0,0 +1,13 @@ +package llmapify_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestWebApifyClient(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "WebApify Client Suite") +} diff --git a/internal/jobs/stats/stats.go b/internal/jobs/stats/stats.go index 44946460..5c107ab1 100644 --- a/internal/jobs/stats/stats.go +++ b/internal/jobs/stats/stats.go @@ -28,6 +28,9 @@ const ( WebQueries StatType = "web_queries" WebScrapedPages StatType = "web_scraped_pages" WebErrors StatType = "web_errors" + LLMQueries StatType = "llm_queries" + LLMProcessedItems StatType = "llm_processed_items" + LLMErrors StatType = "llm_errors" TikTokTranscriptionSuccess StatType = "tiktok_transcription_success" TikTokTranscriptionErrors StatType = "tiktok_transcription_errors" TikTokVideos StatType = "tiktok_returned_videos" From 6babf81f1d4b06279feb51be129b1c32ee24ab46 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 21:39:19 +0200 Subject: [PATCH 09/43] fix: tiktok test --- internal/jobs/tiktok_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/jobs/tiktok_test.go b/internal/jobs/tiktok_test.go index c4f63730..016dfd02 100644 --- a/internal/jobs/tiktok_test.go +++ b/internal/jobs/tiktok_test.go @@ -44,7 +44,7 @@ var _ = Describe("TikTok", func() { Context("when a valid TikTok URL is provided", func() { It("should successfully transcribe the video and record success stats", func(ctx SpecContext) { - videoURL := "https://www.tiktok.com/@.jake.ai/video/7516694182245813509" + videoURL := "https://www.tiktok.com/@theblockrunner.com/video/7227579907361066282" jobArguments := map[string]interface{}{ "type": teetypes.CapTranscription, "video_url": videoURL, From 4bb59b90d1c7ef31c344b4d2b77274c18bc22070 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 21:43:48 +0200 Subject: [PATCH 10/43] fix: update masa tee worker json with gemini key --- tee/masa-tee-worker.json | 1 + 1 file changed, 1 insertion(+) diff --git a/tee/masa-tee-worker.json b/tee/masa-tee-worker.json index ef2a245c..00678377 100644 --- a/tee/masa-tee-worker.json +++ b/tee/masa-tee-worker.json @@ -39,6 +39,7 @@ {"name": "TWITTER_ACCOUNTS", "fromHost":true}, {"name": "TWITTER_API_KEYS", "fromHost":true}, {"name": "APIFY_API_KEY", "fromHost":true}, + {"name": "GEMINI_API_KEY", "fromHost":true}, {"name": "TWITTER_SKIP_LOGIN_VERIFICATION", "fromHost":true}, {"name": "WEBSCRAPER_BLACKLIST", "fromHost":true} ], From ac2a249c93c7e03a4ed71c7222d9802e6ee6d720 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 21:53:16 +0200 Subject: [PATCH 11/43] fix: updates capabilities without explicity llm --- go.mod | 2 +- go.sum | 2 ++ internal/capabilities/detector.go | 9 ++++----- internal/capabilities/detector_test.go | 4 ---- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 354f01e4..bb12fa25 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 // FIXME: replace when new version is released - github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c + github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 07e8597b..251c1ff8 100644 --- a/go.sum +++ b/go.sum @@ -63,6 +63,8 @@ github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06 h1:VPJyZ github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c h1:W0/19aE993pG31cqBzdxvJ5WwNmjtWs8nLC6ZTvtLO4= github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4 h1:KmBzY5J3nf7bbvFNS/f2U0hGTZcwBrwsBahVXfj9uOI= +github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index b0514a5c..c2e1583f 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -43,7 +43,7 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface hasAccounts := len(accounts) > 0 hasApiKeys := len(apiKeys) > 0 hasApifyKey := hasValidApifyKey(apifyApiKey) - hasGeminiKey := hasValidGeminiKey(geminiApiKey) + hasLLMKey := hasValidLLMKey(geminiApiKey) // Add Twitter-specific capabilities based on available authentication if hasAccounts { @@ -75,9 +75,8 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface s.Add(teetypes.TiktokSearchCaps...) capabilities[teetypes.TiktokJob] = s.Items() - if hasGeminiKey { + if hasLLMKey { capabilities[teetypes.WebJob] = teetypes.WebCaps - capabilities[teetypes.LLMJob] = teetypes.LLMCaps } } @@ -164,8 +163,8 @@ func hasValidApifyKey(apifyApiKey string) bool { return true } -func hasValidGeminiKey(geminiApiKey string) bool { - if geminiApiKey == "" { +func hasValidLLMKey(llmApiKey string) bool { + if llmApiKey == "" { return false } diff --git a/internal/capabilities/detector_test.go b/internal/capabilities/detector_test.go index 0df6c14f..5b0bdb4e 100644 --- a/internal/capabilities/detector_test.go +++ b/internal/capabilities/detector_test.go @@ -195,10 +195,6 @@ var _ = Describe("DetectCapabilities", func() { // Web should be present _, hasWeb := caps[teetypes.WebJob] Expect(hasWeb).To(BeTrue(), "expected web capabilities to be present") - - // LLM should be present - _, hasLLM := caps[teetypes.LLMJob] - Expect(hasLLM).To(BeTrue(), "expected LLM capabilities to be present") }) }) }) From 47285c2218275e4d9204a93a266e10f0a4593cfd Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 22:21:26 +0200 Subject: [PATCH 12/43] chore: update tee types --- go.mod | 2 +- go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index bb12fa25..407ef4c0 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 // FIXME: replace when new version is released - github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4 + github.com/masa-finance/tee-types v1.1.14-0.20250909201803-1a8ae3d979e8 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 251c1ff8..d1f6b30e 100644 --- a/go.sum +++ b/go.sum @@ -65,6 +65,8 @@ github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c h1:W0/19 github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4 h1:KmBzY5J3nf7bbvFNS/f2U0hGTZcwBrwsBahVXfj9uOI= github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.14-0.20250909201803-1a8ae3d979e8 h1:HxcGeqrBkTPP3UAQ9xO1yeYdFWdP1qAkuWzeyyz+Rj4= +github.com/masa-finance/tee-types v1.1.14-0.20250909201803-1a8ae3d979e8/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= From c0221f662761b050c11da47e68eff38166f3ed2f Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 22:50:22 +0200 Subject: [PATCH 13/43] feat: incorporate llm --- go.mod | 2 +- go.sum | 2 ++ internal/jobs/web.go | 55 +++++++++++++++++++++++++++++--- internal/jobs/web_test.go | 32 +++++++++++-------- internal/jobs/webapify/client.go | 6 ++-- pkg/client/apify_client.go | 6 +++- 6 files changed, 80 insertions(+), 23 deletions(-) diff --git a/go.mod b/go.mod index 407ef4c0..b76b5f3c 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 // FIXME: replace when new version is released - github.com/masa-finance/tee-types v1.1.14-0.20250909201803-1a8ae3d979e8 + github.com/masa-finance/tee-types v1.1.14-0.20250909203048-4de7bc7ab191 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index d1f6b30e..139f0bcd 100644 --- a/go.sum +++ b/go.sum @@ -67,6 +67,8 @@ github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4 h1:KmBzY github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14-0.20250909201803-1a8ae3d979e8 h1:HxcGeqrBkTPP3UAQ9xO1yeYdFWdP1qAkuWzeyyz+Rj4= github.com/masa-finance/tee-types v1.1.14-0.20250909201803-1a8ae3d979e8/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.14-0.20250909203048-4de7bc7ab191 h1:zzd94bBcRhfUUqhzcgRDOUFwgJRlVR5rTUqAaocjfw4= +github.com/masa-finance/tee-types v1.1.14-0.20250909203048-4de7bc7ab191/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 3402ee06..1d525dd1 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -9,6 +9,7 @@ import ( "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" + "github.com/masa-finance/tee-worker/internal/jobs/llmapify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" @@ -19,7 +20,7 @@ import ( // WebApifyClient defines the interface for the Web Apify client to allow mocking in tests type WebApifyClient interface { - Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, client.Cursor, error) + Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, string, client.Cursor, error) } // NewWebApifyClient is a function variable that can be replaced in tests. @@ -28,6 +29,17 @@ var NewWebApifyClient = func(apiKey string, statsCollector *stats.StatsCollector return webapify.NewClient(apiKey, statsCollector) } +// LLMApify is the interface for the LLM processor client +// Only the Process method is required for this flow +type LLMApify interface { + Process(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) +} + +// NewLLMApifyClient is a function variable to allow injection in tests +var NewLLMApifyClient = func(apiKey string, llmKey string, statsCollector *stats.StatsCollector) (LLMApify, error) { + return llmapify.NewClient(apiKey, llmKey, statsCollector) +} + type WebScraper struct { configuration config.WebConfig statsCollector *stats.StatsCollector @@ -47,6 +59,12 @@ func NewWebScraper(jc config.JobConfiguration, statsCollector *stats.StatsCollec func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { logrus.WithField("job_uuid", j.UUID).Info("Starting ExecuteJob for Web scrape") + // Require Gemini key for LLM processing in Web flow + if w.configuration.GeminiApiKey == "" { + msg := errors.New("Gemini API key is required for Web job") + return types.JobResult{Error: msg.Error()}, msg + } + jobArgs, err := teeargs.UnmarshalJobArguments(teetypes.JobType(j.Type), map[string]any(j.Arguments)) if err != nil { msg := fmt.Errorf("failed to unmarshal job arguments: %w", err) @@ -64,18 +82,47 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: "error while scraping Web"}, fmt.Errorf("error creating Web Apify client: %w", err) } - resp, cursor, err := webClient.Scrape(j.WorkerID, *webArgs, client.EmptyCursor) + resp, datasetId, cursor, err := webClient.Scrape(j.WorkerID, *webArgs, client.EmptyCursor) if err != nil { return types.JobResult{Error: fmt.Sprintf("error while scraping Web: %s", err.Error())}, fmt.Errorf("error scraping Web: %w", err) } + // Run LLM processing and inject into results (Gemini key already validated) + if datasetId == "" { + return types.JobResult{Error: "missing dataset id from web scraping"}, errors.New("missing dataset id from web scraping") + } + + llmClient, err := NewLLMApifyClient(w.configuration.ApifyApiKey, w.configuration.GeminiApiKey, w.statsCollector) + if err != nil { + return types.JobResult{Error: "error creating LLM Apify client"}, fmt.Errorf("failed to create LLM Apify client: %w", err) + } + + llmArgs := teeargs.LLMProcessorArguments{ + DatasetId: datasetId, + Prompt: "summarize the content of this webpage, focusing on keywords and topics: ${markdown}", + MaxTokens: teeargs.LLMDefaultMaxTokens, + Temperature: teeargs.LLMDefaultTemperature, + } + llmResp, _, llmErr := llmClient.Process(j.WorkerID, llmArgs, client.EmptyCursor) + if llmErr != nil { + return types.JobResult{Error: fmt.Sprintf("error while processing LLM: %s", llmErr.Error())}, fmt.Errorf("error processing LLM: %w", llmErr) + } + + max := len(resp) + if len(llmResp) < max { + max = len(llmResp) + } + for i := 0; i < max; i++ { + if resp[i] != nil { + resp[i].LLMResponse = llmResp[i].LLMResponse + } + } + data, err := json.Marshal(resp) if err != nil { return types.JobResult{Error: fmt.Sprintf("error marshalling Web response")}, fmt.Errorf("error marshalling Web response: %w", err) } - // TODO is this where we add the LLM processor? - return types.JobResult{ Data: data, Job: j, diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index e46015fe..06ba4919 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -3,6 +3,7 @@ package jobs_test import ( "encoding/json" "errors" + "fmt" "os" . "github.com/onsi/ginkgo/v2" @@ -21,15 +22,15 @@ import ( // MockWebApifyClient is a mock implementation of the WebApifyClient. type MockWebApifyClient struct { - ScrapeFunc func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, client.Cursor, error) + ScrapeFunc func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, string, client.Cursor, error) } -func (m *MockWebApifyClient) Scrape(_ string, args teeargs.WebArguments, _ client.Cursor) ([]*teetypes.WebScraperResult, client.Cursor, error) { +func (m *MockWebApifyClient) Scrape(_ string, args teeargs.WebArguments, _ client.Cursor) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { if m != nil && m.ScrapeFunc != nil { - res, next, err := m.ScrapeFunc(args) - return res, next, err + res, datasetId, next, err := m.ScrapeFunc(args) + return res, datasetId, next, err } - return nil, client.EmptyCursor, nil + return nil, "", client.EmptyCursor, nil } var _ = Describe("WebScraper", func() { @@ -75,14 +76,15 @@ var _ = Describe("WebScraper", func() { "max_pages": 2, } - mockClient.ScrapeFunc = func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, client.Cursor, error) { + mockClient.ScrapeFunc = func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { Expect(args.URL).To(Equal("https://example.com")) - return []*teetypes.WebScraperResult{{URL: "https://example.com", Markdown: "# Hello"}}, client.Cursor("next-cursor"), nil + return []*teetypes.WebScraperResult{{URL: "https://example.com", Markdown: "# Hello"}}, "dataset-123", client.Cursor("next-cursor"), nil } result, err := scraper.ExecuteJob(job) Expect(err).NotTo(HaveOccurred()) Expect(result.NextCursor).To(Equal("next-cursor")) + var resp []*teetypes.WebScraperResult err = json.Unmarshal(result.Data, &resp) Expect(err).NotTo(HaveOccurred()) @@ -100,8 +102,8 @@ var _ = Describe("WebScraper", func() { } expectedErr := errors.New("client error") - mockClient.ScrapeFunc = func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, client.Cursor, error) { - return nil, client.EmptyCursor, expectedErr + mockClient.ScrapeFunc = func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { + return nil, "", client.EmptyCursor, expectedErr } result, err := scraper.ExecuteJob(job) @@ -144,7 +146,7 @@ var _ = Describe("WebScraper", func() { } }) - It("should execute a real web scraping job when APIFY_API_KEY is set", func() { + FIt("should execute a real web scraping job when APIFY_API_KEY is set", func() { if apifyKey == "" { Skip("APIFY_API_KEY is not set") } @@ -161,7 +163,7 @@ var _ = Describe("WebScraper", func() { Type: teetypes.WebJob, Arguments: map[string]any{ "type": teetypes.WebScraper, - "url": "https://example.com", + "url": "https://en.wikipedia.org/wiki/Bitcoin", "max_depth": 0, "max_pages": 1, }, @@ -177,10 +179,12 @@ var _ = Describe("WebScraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(resp).NotTo(BeEmpty()) Expect(resp[0]).NotTo(BeNil()) - Expect(resp[0].URL).To(Equal("https://example.com/")) - - // TODO verify stats are increased via the client + Expect(resp[0].URL).To(Equal("https://en.wikipedia.org/wiki/Bitcoin/")) + Expect(resp[0].LLMResponse).NotTo(BeEmpty()) + prettyJSON, err := json.MarshalIndent(resp, "", " ") + Expect(err).NotTo(HaveOccurred()) + fmt.Println(string(prettyJSON)) }) It("should expose capabilities only when both APIFY and GEMINI keys are present", func() { diff --git a/internal/jobs/webapify/client.go b/internal/jobs/webapify/client.go index ecd423f0..c40041da 100644 --- a/internal/jobs/webapify/client.go +++ b/internal/jobs/webapify/client.go @@ -44,7 +44,7 @@ func (c *WebApifyClient) ValidateApiKey() error { return c.apifyClient.ValidateApiKey() } -func (c *WebApifyClient) Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, client.Cursor, error) { +func (c *WebApifyClient) Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.WebQueries, 1) } @@ -59,7 +59,7 @@ func (c *WebApifyClient) Scrape(workerID string, args teeargs.WebArguments, curs if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.WebErrors, 1) } - return nil, client.EmptyCursor, err + return nil, "", client.EmptyCursor, err } response := make([]*teetypes.WebScraperResult, 0, len(dataset.Data.Items)) @@ -77,5 +77,5 @@ func (c *WebApifyClient) Scrape(workerID string, args teeargs.WebArguments, curs c.statsCollector.Add(workerID, stats.WebScrapedPages, uint(len(response))) } - return response, nextCursor, nil + return response, dataset.DatasetId, nextCursor, nil } diff --git a/pkg/client/apify_client.go b/pkg/client/apify_client.go index 6e0f615c..64413420 100644 --- a/pkg/client/apify_client.go +++ b/pkg/client/apify_client.go @@ -56,7 +56,8 @@ type ApifyDatasetData struct { // DatasetResponse represents the response from getting dataset items type DatasetResponse struct { - Data ApifyDatasetData `json:"data"` + Data ApifyDatasetData `json:"data"` + DatasetId string `json:"dataset_id"` } // CursorData represents the pagination data stored in cursor @@ -341,6 +342,9 @@ PollLoop: return nil, "", fmt.Errorf("failed to get dataset items: %w", err) } + // Propagate dataset id for downstream consumers + dataset.DatasetId = runResp.Data.DefaultDatasetId + // 4. Generate next cursor if more data may be available var nextCursor Cursor if uint(len(dataset.Data.Items)) == limit { From 9cc72853e80dd9359cf77f1ba4d791c1b6881461 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 22:53:38 +0200 Subject: [PATCH 14/43] fix: web test --- internal/jobs/web_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 06ba4919..a838993f 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -179,7 +179,7 @@ var _ = Describe("WebScraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(resp).NotTo(BeEmpty()) Expect(resp[0]).NotTo(BeNil()) - Expect(resp[0].URL).To(Equal("https://en.wikipedia.org/wiki/Bitcoin/")) + Expect(resp[0].URL).To(Equal("https://en.wikipedia.org/wiki/Bitcoin")) Expect(resp[0].LLMResponse).NotTo(BeEmpty()) prettyJSON, err := json.MarshalIndent(resp, "", " ") From 3d12a1f71f694cc0eff5a520c1b773cd31d9f71b Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 23:13:18 +0200 Subject: [PATCH 15/43] fix: web testing to integrate apify client --- internal/jobs/web_test.go | 53 +++++++++++++++++++++------ internal/jobs/webapify/client_test.go | 21 ++++++----- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index a838993f..4299dabb 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -3,7 +3,6 @@ package jobs_test import ( "encoding/json" "errors" - "fmt" "os" . "github.com/onsi/ginkgo/v2" @@ -12,6 +11,7 @@ import ( "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs" + "github.com/masa-finance/tee-worker/internal/jobs/llmapify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" @@ -33,26 +33,54 @@ func (m *MockWebApifyClient) Scrape(_ string, args teeargs.WebArguments, _ clien return nil, "", client.EmptyCursor, nil } +// MockLLMApifyClient is a mock implementation of the LLMApify interface +// used to prevent external calls during unit tests. +type MockLLMApifyClient struct { + ProcessFunc func(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) +} + +func (m *MockLLMApifyClient) Process(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) { + if m != nil && m.ProcessFunc != nil { + return m.ProcessFunc(workerID, args, cursor) + } + return []*teetypes.LLMProcessorResult{}, client.EmptyCursor, nil +} + var _ = Describe("WebScraper", func() { var ( scraper *jobs.WebScraper statsCollector *stats.StatsCollector job types.Job mockClient *MockWebApifyClient + mockLLM *MockLLMApifyClient ) + // Keep originals to restore after each test to avoid leaking globals + originalNewWebApifyClient := jobs.NewWebApifyClient + originalNewLLMApifyClient := jobs.NewLLMApifyClient + BeforeEach(func() { statsCollector = stats.StartCollector(128, config.JobConfiguration{}) cfg := config.JobConfiguration{ - "apify_api_key": "test-key", + "apify_api_key": "test-key", + "gemini_api_key": "test-gemini-key", } scraper = jobs.NewWebScraper(cfg, statsCollector) mockClient = &MockWebApifyClient{} + mockLLM = &MockLLMApifyClient{ + ProcessFunc: func(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) { + // Return a single empty summary to avoid changing expectations + return []*teetypes.LLMProcessorResult{{LLMResponse: ""}}, client.EmptyCursor, nil + }, + } - // Replace the client creation function with one that returns the mock + // Replace the client creation function with one that returns the mocks jobs.NewWebApifyClient = func(apiKey string, _ *stats.StatsCollector) (jobs.WebApifyClient, error) { return mockClient, nil } + jobs.NewLLMApifyClient = func(apiKey string, llmKey string, _ *stats.StatsCollector) (jobs.LLMApify, error) { + return mockLLM, nil + } job = types.Job{ UUID: "test-uuid", @@ -60,6 +88,11 @@ var _ = Describe("WebScraper", func() { } }) + AfterEach(func() { + jobs.NewWebApifyClient = originalNewWebApifyClient + jobs.NewLLMApifyClient = originalNewLLMApifyClient + }) + Context("ExecuteJob", func() { It("should return an error for invalid arguments", func() { job.Arguments = map[string]any{"invalid": "args"} @@ -144,9 +177,12 @@ var _ = Describe("WebScraper", func() { jobs.NewWebApifyClient = func(apiKey string, s *stats.StatsCollector) (jobs.WebApifyClient, error) { return webapify.NewClient(apiKey, s) } + jobs.NewLLMApifyClient = func(apiKey string, llmKey string, s *stats.StatsCollector) (jobs.LLMApify, error) { + return llmapify.NewClient(apiKey, llmKey, s) + } }) - FIt("should execute a real web scraping job when APIFY_API_KEY is set", func() { + It("should execute a real web scraping job when APIFY_API_KEY is set", func() { if apifyKey == "" { Skip("APIFY_API_KEY is not set") } @@ -163,7 +199,7 @@ var _ = Describe("WebScraper", func() { Type: teetypes.WebJob, Arguments: map[string]any{ "type": teetypes.WebScraper, - "url": "https://en.wikipedia.org/wiki/Bitcoin", + "url": "https://example.com", "max_depth": 0, "max_pages": 1, }, @@ -179,12 +215,7 @@ var _ = Describe("WebScraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(resp).NotTo(BeEmpty()) Expect(resp[0]).NotTo(BeNil()) - Expect(resp[0].URL).To(Equal("https://en.wikipedia.org/wiki/Bitcoin")) - Expect(resp[0].LLMResponse).NotTo(BeEmpty()) - - prettyJSON, err := json.MarshalIndent(resp, "", " ") - Expect(err).NotTo(HaveOccurred()) - fmt.Println(string(prettyJSON)) + Expect(resp[0].URL).To(Equal("https://example.com/")) }) It("should expose capabilities only when both APIFY and GEMINI keys are present", func() { diff --git a/internal/jobs/webapify/client_test.go b/internal/jobs/webapify/client_test.go index f34d8001..e585fa89 100644 --- a/internal/jobs/webapify/client_test.go +++ b/internal/jobs/webapify/client_test.go @@ -39,10 +39,12 @@ var _ = Describe("WebApifyClient", func() { mockClient *MockApifyClient webClient *webapify.WebApifyClient apifyKey string + geminiKey string ) BeforeEach(func() { apifyKey = os.Getenv("APIFY_API_KEY") + geminiKey = os.Getenv("GEMINI_API_KEY") mockClient = &MockApifyClient{} // Replace the client creation function with one that returns the mock webapify.NewInternalClient = func(apiKey string) (client.Apify, error) { @@ -67,7 +69,7 @@ var _ = Describe("WebApifyClient", func() { return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil } - _, _, err := webClient.Scrape("test-worker", args, client.EmptyCursor) + _, _, _, err := webClient.Scrape("test-worker", args, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) }) @@ -82,7 +84,7 @@ var _ = Describe("WebApifyClient", func() { MaxDepth: 0, MaxPages: 1, } - _, _, err := webClient.Scrape("test-worker", args, client.EmptyCursor) + _, _, _, err := webClient.Scrape("test-worker", args, client.EmptyCursor) Expect(err).To(MatchError(expectedErr)) }) @@ -102,7 +104,7 @@ var _ = Describe("WebApifyClient", func() { MaxDepth: 0, MaxPages: 1, } - results, _, err := webClient.Scrape("test-worker", args, client.EmptyCursor) + results, _, _, err := webClient.Scrape("test-worker", args, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(results).To(BeEmpty()) // The invalid item should be skipped }) @@ -127,7 +129,7 @@ var _ = Describe("WebApifyClient", func() { MaxDepth: 0, MaxPages: 1, } - results, cursor, err := webClient.Scrape("test-worker", args, client.EmptyCursor) + results, _, cursor, err := webClient.Scrape("test-worker", args, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(cursor).To(Equal(client.Cursor("next"))) Expect(results).To(HaveLen(1)) @@ -156,8 +158,8 @@ var _ = Describe("WebApifyClient", func() { // Integration tests that use the real client Context("Integration tests", func() { It("should validate API key with real client when APIFY_API_KEY is set", func() { - if apifyKey == "" { - Skip("APIFY_API_KEY is not set") + if apifyKey == "" || geminiKey == "" { + Skip("APIFY_API_KEY and GEMINI_API_KEY required to run web integration tests") } // Reset to use real client @@ -171,8 +173,8 @@ var _ = Describe("WebApifyClient", func() { }) It("should scrape a real URL when APIFY_API_KEY is set", func() { - if apifyKey == "" { - Skip("APIFY_API_KEY is not set") + if apifyKey == "" || geminiKey == "" { + Skip("APIFY_API_KEY and GEMINI_API_KEY required to run web integration tests") } // Reset to use real client @@ -189,8 +191,9 @@ var _ = Describe("WebApifyClient", func() { MaxPages: 1, } - results, cursor, err := realClient.Scrape("test-worker", args, client.EmptyCursor) + results, datasetId, cursor, err := realClient.Scrape("test-worker", args, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) + Expect(datasetId).NotTo(BeEmpty()) Expect(results).NotTo(BeEmpty()) Expect(results[0]).NotTo(BeNil()) Expect(results[0].URL).To(Equal("https://example.com/")) From b22cb10a1c83a32a4b658c2826b79257c264075b Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 23:29:36 +0200 Subject: [PATCH 16/43] fix: adds processed pages --- internal/jobs/stats/stats.go | 1 + internal/jobs/web.go | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/internal/jobs/stats/stats.go b/internal/jobs/stats/stats.go index 5c107ab1..ada7ab98 100644 --- a/internal/jobs/stats/stats.go +++ b/internal/jobs/stats/stats.go @@ -27,6 +27,7 @@ const ( TwitterXSearchQueries StatType = "twitterx_search" // TODO: investigate if this is needed or used... WebQueries StatType = "web_queries" WebScrapedPages StatType = "web_scraped_pages" + WebProcessedPages StatType = "web_processed_pages" WebErrors StatType = "web_errors" LLMQueries StatType = "llm_queries" LLMProcessedItems StatType = "llm_processed_items" diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 1d525dd1..13be96d1 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -123,6 +123,10 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: fmt.Sprintf("error marshalling Web response")}, fmt.Errorf("error marshalling Web response: %w", err) } + if w.statsCollector != nil { + w.statsCollector.Add(j.WorkerID, stats.WebProcessedPages, uint(len(llmResp))) + } + return types.JobResult{ Data: data, Job: j, From b0c7350c262bca6ee8590c557504d8587f201004 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 10 Sep 2025 03:02:18 +0200 Subject: [PATCH 17/43] chore: go mod tidy --- go.mod | 14 -------------- go.sum | 54 ------------------------------------------------------ 2 files changed, 68 deletions(-) diff --git a/go.mod b/go.mod index b76b5f3c..9f6506d1 100644 --- a/go.mod +++ b/go.mod @@ -5,9 +5,7 @@ go 1.23.0 toolchain go1.24.3 require ( - github.com/cenkalti/backoff v2.2.1+incompatible github.com/edgelesssys/ego v1.7.2 - github.com/gocolly/colly v1.2.0 github.com/google/uuid v1.6.0 github.com/imperatrona/twitter-scraper v0.0.18 github.com/joho/godotenv v1.5.1 @@ -29,24 +27,13 @@ require ( ) require ( - github.com/PuerkitoBio/goquery v1.10.3 // indirect - github.com/andybalholm/cascadia v1.3.3 // indirect - github.com/antchfx/htmlquery v1.3.4 // indirect - github.com/antchfx/xmlquery v1.4.4 // indirect - github.com/antchfx/xpath v1.3.4 // indirect github.com/go-jose/go-jose/v4 v4.1.2 // indirect github.com/go-logr/logr v1.4.3 // indirect - github.com/gobwas/glob v0.2.3 // indirect - github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect - github.com/golang/protobuf v1.5.4 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20250630185457-6e76a2b096b5 // indirect - github.com/kennygrant/sanitize v1.2.4 // indirect github.com/labstack/gommon v0.4.2 github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect - github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect - github.com/temoto/robotstxt v1.1.2 // indirect github.com/valyala/bytebufferpool v1.0.0 // indirect github.com/valyala/fasttemplate v1.2.2 // indirect golang.org/x/crypto v0.41.0 // indirect @@ -56,7 +43,6 @@ require ( golang.org/x/text v0.28.0 // indirect golang.org/x/time v0.12.0 // indirect golang.org/x/tools v0.35.0 // indirect - google.golang.org/appengine v1.6.8 // indirect google.golang.org/protobuf v1.36.7 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 139f0bcd..5b11c75f 100644 --- a/go.sum +++ b/go.sum @@ -1,18 +1,5 @@ github.com/AlexEidt/Vidio v1.5.1 h1:tovwvtgQagUz1vifiL9OeWkg1fP/XUzFazFKh7tFtaE= github.com/AlexEidt/Vidio v1.5.1/go.mod h1:djhIMnWMqPrC3X6nB6ymGX6uWWlgw+VayYGKE1bNwmI= -github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= -github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= -github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= -github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= -github.com/antchfx/htmlquery v1.3.4 h1:Isd0srPkni2iNTWCwVj/72t7uCphFeor5Q8nCzj1jdQ= -github.com/antchfx/htmlquery v1.3.4/go.mod h1:K9os0BwIEmLAvTqaNSua8tXLWRWZpocZIH73OzWQbwM= -github.com/antchfx/xmlquery v1.4.4 h1:mxMEkdYP3pjKSftxss4nUHfjBhnMk4imGoR96FRY2dg= -github.com/antchfx/xmlquery v1.4.4/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fusrx9b12fc= -github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= -github.com/antchfx/xpath v1.3.4 h1:1ixrW1VnXd4HurCj7qnqnR0jo14g8JMe20Fshg1Vgz4= -github.com/antchfx/xpath v1.3.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= -github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= -github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -24,18 +11,6 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= -github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= -github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= -github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= -github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ= -github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= -github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= @@ -45,8 +20,6 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= -github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= -github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -57,16 +30,6 @@ github.com/labstack/echo/v4 v4.13.4 h1:oTZZW+T3s9gAu5L8vmzihV7/lkXGZuITzTQkTEhcX github.com/labstack/echo/v4 v4.13.4/go.mod h1:g63b33BZ5vZzcIUF8AtRH40DrTlXnx4UMC8rBdndmjQ= github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0= github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= -github.com/masa-finance/tee-types v1.1.13 h1:bVXUEF8nXT3bhJE4kcDwcuzfQopid9BbIp0/OucClL4= -github.com/masa-finance/tee-types v1.1.13/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= -github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06 h1:VPJyZ5M55OjEObOyQq330xBLJ8eyHSfDAQaA7ZC9vec= -github.com/masa-finance/tee-types v1.1.14-0.20250905184213-79aa76bbab06/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= -github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c h1:W0/19aE993pG31cqBzdxvJ5WwNmjtWs8nLC6ZTvtLO4= -github.com/masa-finance/tee-types v1.1.14-0.20250909185444-70e19b68717c/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= -github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4 h1:KmBzY5J3nf7bbvFNS/f2U0hGTZcwBrwsBahVXfj9uOI= -github.com/masa-finance/tee-types v1.1.14-0.20250909194949-898e896b17a4/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= -github.com/masa-finance/tee-types v1.1.14-0.20250909201803-1a8ae3d979e8 h1:HxcGeqrBkTPP3UAQ9xO1yeYdFWdP1qAkuWzeyyz+Rj4= -github.com/masa-finance/tee-types v1.1.14-0.20250909201803-1a8ae3d979e8/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14-0.20250909203048-4de7bc7ab191 h1:zzd94bBcRhfUUqhzcgRDOUFwgJRlVR5rTUqAaocjfw4= github.com/masa-finance/tee-types v1.1.14-0.20250909203048-4de7bc7ab191/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= @@ -83,17 +46,12 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= -github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= -github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo= @@ -107,7 +65,6 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= -golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= @@ -126,7 +83,6 @@ golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= -golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -136,7 +92,6 @@ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -150,7 +105,6 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= @@ -162,18 +116,15 @@ golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8= -golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= @@ -187,11 +138,6 @@ golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxb golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= -google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From 025c75c6ada247ff69546500eb9afc09665bd0f9 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 10 Sep 2025 17:10:33 +0200 Subject: [PATCH 18/43] chore: improve web test --- internal/jobs/web_test.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 4299dabb..d1ee6ac5 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -182,9 +182,9 @@ var _ = Describe("WebScraper", func() { } }) - It("should execute a real web scraping job when APIFY_API_KEY is set", func() { - if apifyKey == "" { - Skip("APIFY_API_KEY is not set") + It("should execute a real web scraping job when keys is set", func() { + if apifyKey == "" || geminiKey == "" { + Skip("APIFY_API_KEY and GEMINI_API_KEY required for integration web integration tests") } cfg := config.JobConfiguration{ @@ -199,7 +199,7 @@ var _ = Describe("WebScraper", func() { Type: teetypes.WebJob, Arguments: map[string]any{ "type": teetypes.WebScraper, - "url": "https://example.com", + "url": "https://example.com/", "max_depth": 0, "max_pages": 1, }, @@ -216,6 +216,7 @@ var _ = Describe("WebScraper", func() { Expect(resp).NotTo(BeEmpty()) Expect(resp[0]).NotTo(BeNil()) Expect(resp[0].URL).To(Equal("https://example.com/")) + Expect(resp[0].LLMResponse).NotTo(BeEmpty()) }) It("should expose capabilities only when both APIFY and GEMINI keys are present", func() { From 50a937dc643234a7debfbf9972df84cb75117241 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 12 Sep 2025 20:16:28 +0200 Subject: [PATCH 19/43] feat: without tiktok lang preference, attempt default, then attempt first transcription --- internal/jobs/tiktok.go | 52 +++++++++++++++++++++--------------- internal/jobs/tiktok_test.go | 1 - 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/internal/jobs/tiktok.go b/internal/jobs/tiktok.go index e702426a..2eff0820 100644 --- a/internal/jobs/tiktok.go +++ b/internal/jobs/tiktok.go @@ -152,14 +152,6 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *teeargs.TikTo return types.JobResult{Error: "VideoURL is required"}, fmt.Errorf("videoURL is required") } - // Use the enhanced language selection logic - selectedLanguageKey := tiktokArgs.GetLanguageCode() // This handles defaults automatically - if tiktokArgs.HasLanguagePreference() { - logrus.WithField("job_uuid", j.UUID).Infof("Using custom language preference: %s", selectedLanguageKey) - } else { - logrus.WithField("job_uuid", j.UUID).Infof("Using default language: %s", selectedLanguageKey) - } - // Sub-Step 3.1: Call TikTok Transcription API apiRequestBody := map[string]string{"url": tiktokArgs.GetVideoURL()} jsonBody, err := json.Marshal(apiRequestBody) @@ -229,28 +221,44 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *teeargs.TikTo } vttText := "" + languageCode := tiktokArgs.GetLanguageCode() // either requested or default - // Directly use the requested/default language; if missing, return an error - if transcript, ok := parsedAPIResponse.Transcripts[selectedLanguageKey]; ok && strings.TrimSpace(transcript) != "" { - vttText = transcript + if tiktokArgs.HasLanguagePreference() { + if transcript, ok := parsedAPIResponse.Transcripts[tiktokArgs.Language]; ok && strings.TrimSpace(transcript) != "" { + vttText = transcript + } } else { - errMsg := fmt.Sprintf("Transcript for requested language %s not found in API response", selectedLanguageKey) - logrus.WithFields(logrus.Fields{ - "job_uuid": j.UUID, - "requested_lang": selectedLanguageKey, - }).Error(errMsg) - ttt.stats.Add(j.WorkerID, stats.TikTokTranscriptionErrors, 1) - return types.JobResult{Error: errMsg}, fmt.Errorf(errMsg) + // Attempt to use the default language + if transcript, ok := parsedAPIResponse.Transcripts[languageCode]; ok && strings.TrimSpace(transcript) != "" { + vttText = transcript + } else { + // No preference and default not found - return the first available transcript + for langCode, transcript := range parsedAPIResponse.Transcripts { + if strings.TrimSpace(transcript) != "" { + vttText = transcript + languageCode = langCode + break + } + } + } } if vttText == "" { - errMsg := "Suitable transcript could not be extracted from API response" - logrus.WithField("job_uuid", j.UUID).Error(errMsg) + errMsg := "" + if tiktokArgs.HasLanguagePreference() { + errMsg = fmt.Sprintf("Transcript for requested language %s not found in API response", languageCode) + } else { + errMsg = fmt.Sprintf("No transcripts found in API response") + } + logrus.WithFields(logrus.Fields{ + "job_uuid": j.UUID, + "requested_lang": languageCode, + }).Error(errMsg) ttt.stats.Add(j.WorkerID, stats.TikTokTranscriptionErrors, 1) return types.JobResult{Error: errMsg}, fmt.Errorf(errMsg) } - logrus.Debugf("Job %s: Raw VTT content for language %s:\n%s", j.UUID, selectedLanguageKey, vttText) + logrus.Debugf("Job %s: Raw VTT content for language %s:\n%s", j.UUID, languageCode, vttText) // Convert VTT to Plain Text plainTextTranscription, err := convertVTTToPlainText(vttText) @@ -265,7 +273,7 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *teeargs.TikTo // Process Result & Return resultData := teetypes.TikTokTranscriptionResult{ TranscriptionText: plainTextTranscription, - DetectedLanguage: selectedLanguageKey, + DetectedLanguage: languageCode, VideoTitle: parsedAPIResponse.VideoTitle, OriginalURL: tiktokArgs.GetVideoURL(), ThumbnailURL: parsedAPIResponse.ThumbnailURL, diff --git a/internal/jobs/tiktok_test.go b/internal/jobs/tiktok_test.go index 016dfd02..cb8973c0 100644 --- a/internal/jobs/tiktok_test.go +++ b/internal/jobs/tiktok_test.go @@ -48,7 +48,6 @@ var _ = Describe("TikTok", func() { jobArguments := map[string]interface{}{ "type": teetypes.CapTranscription, "video_url": videoURL, - // default language is eng-US from tee types } job := types.Job{ From ae3388c43bfa71279abac2ce295e9c1459a64324 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 12 Sep 2025 20:49:18 +0200 Subject: [PATCH 20/43] fix: config refactor and variable naming in web apify client --- internal/config/config.go | 14 +++++++--- internal/jobs/llmapify/client.go | 38 +++++++++++++-------------- internal/jobs/llmapify/client_test.go | 4 +-- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/internal/config/config.go b/internal/config/config.go index c8292dd4..483e35bc 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -303,18 +303,24 @@ func (jc JobConfiguration) GetRedditConfig() RedditConfig { } } +type LlmConfig struct { + GeminiApiKey string +} + // WebConfig represents the configuration needed for Web scraping via Apify type WebConfig struct { - ApifyApiKey string - GeminiApiKey string + LlmConfig + ApifyApiKey string } // GetWebConfig constructs a WebConfig directly from the JobConfiguration // This eliminates the need for JSON marshaling/unmarshaling func (jc JobConfiguration) GetWebConfig() WebConfig { return WebConfig{ - ApifyApiKey: jc.GetString("apify_api_key", ""), - GeminiApiKey: jc.GetString("gemini_api_key", ""), + LlmConfig: LlmConfig{ + GeminiApiKey: jc.GetString("gemini_api_key", ""), + }, + ApifyApiKey: jc.GetString("apify_api_key", ""), } } diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go index 9a793780..ec346a59 100644 --- a/internal/jobs/llmapify/client.go +++ b/internal/jobs/llmapify/client.go @@ -13,18 +13,18 @@ import ( ) const ( - LLMActorID = "dusan.vystrcil~llm-dataset-processor" + ActorID = "dusan.vystrcil~llm-dataset-processor" ) var ( - ErrLlmProviderKeyRequired = errors.New("llm provider key is required") - ErrFailedToCreateApifyClient = errors.New("failed to create apify client") + ErrProviderKeyRequired = errors.New("llm provider key is required") + ErrFailedToCreateClient = errors.New("failed to create apify client") ) -type LLMApifyClient struct { - apifyClient client.Apify +type ApifyClient struct { + client client.Apify statsCollector *stats.StatsCollector - llmProviderKey string + providerKey string } // NewInternalClient is a function variable that can be replaced in tests. @@ -34,38 +34,38 @@ var NewInternalClient = func(apiKey string) (client.Apify, error) { } // NewClient creates a new LLM Apify client -func NewClient(apiToken string, llmProviderKey string, statsCollector *stats.StatsCollector) (*LLMApifyClient, error) { - apifyClient, err := NewInternalClient(apiToken) +func NewClient(apiToken string, providerKey string, statsCollector *stats.StatsCollector) (*ApifyClient, error) { + client, err := NewInternalClient(apiToken) if err != nil { - return nil, fmt.Errorf("%w: %v", ErrFailedToCreateApifyClient, err) + return nil, fmt.Errorf("%w: %v", ErrFailedToCreateClient, err) } - if llmProviderKey == "" { - return nil, ErrLlmProviderKeyRequired + if providerKey == "" { + return nil, ErrProviderKeyRequired } - return &LLMApifyClient{ - apifyClient: apifyClient, + return &ApifyClient{ + client: client, statsCollector: statsCollector, - llmProviderKey: llmProviderKey, + providerKey: providerKey, }, nil } // ValidateApiKey tests if the Apify API token is valid -func (c *LLMApifyClient) ValidateApiKey() error { - return c.apifyClient.ValidateApiKey() +func (c *ApifyClient) ValidateApiKey() error { + return c.client.ValidateApiKey() } -func (c *LLMApifyClient) Process(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) { +func (c *ApifyClient) Process(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) { if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.LLMQueries, 1) } input := args.ToLLMProcessorRequest() - input.LLMProviderApiKey = c.llmProviderKey + input.LLMProviderApiKey = c.providerKey limit := uint(1) // TODO, verify you can only ever operate on one dataset at a time - dataset, nextCursor, err := c.apifyClient.RunActorAndGetResponse(LLMActorID, input, cursor, limit) + dataset, nextCursor, err := c.client.RunActorAndGetResponse(ActorID, input, cursor, limit) if err != nil { if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.LLMErrors, 1) diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index 0d960f19..ca450323 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -39,7 +39,7 @@ func (m *MockApifyClient) ValidateApiKey() error { var _ = Describe("LLMApifyClient", func() { var ( mockClient *MockApifyClient - llmClient *llmapify.LLMApifyClient + llmClient *llmapify.ApifyClient apifyKey string ) @@ -69,7 +69,7 @@ var _ = Describe("LLMApifyClient", func() { Expect(err).ToNot(HaveOccurred()) mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { - Expect(actorID).To(Equal(llmapify.LLMActorID)) + Expect(actorID).To(Equal(llmapify.ActorID)) Expect(limit).To(Equal(uint(1))) // Verify the input is correctly converted to LLMProcessorRequest From 72fed289a3b469b5bd5c4ad886ac420216ec00dd Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 12 Sep 2025 20:58:24 +0200 Subject: [PATCH 21/43] fix: use llm config in tests and clients --- internal/jobs/llmapify/client.go | 11 ++++++----- internal/jobs/llmapify/client_test.go | 7 ++++--- internal/jobs/web.go | 6 +++--- internal/jobs/web_test.go | 6 +++--- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go index ec346a59..060f4508 100644 --- a/internal/jobs/llmapify/client.go +++ b/internal/jobs/llmapify/client.go @@ -7,6 +7,7 @@ import ( teeargs "github.com/masa-finance/tee-types/args" teetypes "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" "github.com/sirupsen/logrus" @@ -24,7 +25,7 @@ var ( type ApifyClient struct { client client.Apify statsCollector *stats.StatsCollector - providerKey string + llmConfig config.LlmConfig } // NewInternalClient is a function variable that can be replaced in tests. @@ -34,20 +35,20 @@ var NewInternalClient = func(apiKey string) (client.Apify, error) { } // NewClient creates a new LLM Apify client -func NewClient(apiToken string, providerKey string, statsCollector *stats.StatsCollector) (*ApifyClient, error) { +func NewClient(apiToken string, llmConfig config.LlmConfig, statsCollector *stats.StatsCollector) (*ApifyClient, error) { client, err := NewInternalClient(apiToken) if err != nil { return nil, fmt.Errorf("%w: %v", ErrFailedToCreateClient, err) } - if providerKey == "" { + if llmConfig.GeminiApiKey == "" { return nil, ErrProviderKeyRequired } return &ApifyClient{ client: client, statsCollector: statsCollector, - providerKey: providerKey, + llmConfig: llmConfig, }, nil } @@ -62,7 +63,7 @@ func (c *ApifyClient) Process(workerID string, args teeargs.LLMProcessorArgument } input := args.ToLLMProcessorRequest() - input.LLMProviderApiKey = c.providerKey + input.LLMProviderApiKey = c.llmConfig.GeminiApiKey limit := uint(1) // TODO, verify you can only ever operate on one dataset at a time dataset, nextCursor, err := c.client.RunActorAndGetResponse(ActorID, input, cursor, limit) diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index ca450323..e7f47c64 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -9,6 +9,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/llmapify" "github.com/masa-finance/tee-worker/pkg/client" @@ -51,7 +52,7 @@ var _ = Describe("LLMApifyClient", func() { return mockClient, nil } var err error - llmClient, err = llmapify.NewClient("test-token", "test-llm-key", nil) + llmClient, err = llmapify.NewClient("test-token", config.LlmConfig{GeminiApiKey: "test-llm-key"}, nil) Expect(err).NotTo(HaveOccurred()) }) @@ -228,7 +229,7 @@ var _ = Describe("LLMApifyClient", func() { return client.NewApifyClient(apiKey) } - realClient, err := llmapify.NewClient(apifyKey, geminiKey, nil) + realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: geminiKey}, nil) Expect(err).NotTo(HaveOccurred()) Expect(realClient.ValidateApiKey()).To(Succeed()) }) @@ -244,7 +245,7 @@ var _ = Describe("LLMApifyClient", func() { return client.NewApifyClient(apiKey) } - realClient, err := llmapify.NewClient(apifyKey, geminiKey, nil) + realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: geminiKey}, nil) Expect(err).NotTo(HaveOccurred()) args := teeargs.LLMProcessorArguments{ diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 13be96d1..d83cb8a7 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -36,8 +36,8 @@ type LLMApify interface { } // NewLLMApifyClient is a function variable to allow injection in tests -var NewLLMApifyClient = func(apiKey string, llmKey string, statsCollector *stats.StatsCollector) (LLMApify, error) { - return llmapify.NewClient(apiKey, llmKey, statsCollector) +var NewLLMApifyClient = func(apiKey string, llmConfig config.LlmConfig, statsCollector *stats.StatsCollector) (LLMApify, error) { + return llmapify.NewClient(apiKey, llmConfig, statsCollector) } type WebScraper struct { @@ -92,7 +92,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: "missing dataset id from web scraping"}, errors.New("missing dataset id from web scraping") } - llmClient, err := NewLLMApifyClient(w.configuration.ApifyApiKey, w.configuration.GeminiApiKey, w.statsCollector) + llmClient, err := NewLLMApifyClient(w.configuration.ApifyApiKey, w.configuration.LlmConfig, w.statsCollector) if err != nil { return types.JobResult{Error: "error creating LLM Apify client"}, fmt.Errorf("failed to create LLM Apify client: %w", err) } diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index d1ee6ac5..925bb01d 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -78,7 +78,7 @@ var _ = Describe("WebScraper", func() { jobs.NewWebApifyClient = func(apiKey string, _ *stats.StatsCollector) (jobs.WebApifyClient, error) { return mockClient, nil } - jobs.NewLLMApifyClient = func(apiKey string, llmKey string, _ *stats.StatsCollector) (jobs.LLMApify, error) { + jobs.NewLLMApifyClient = func(apiKey string, llmConfig config.LlmConfig, _ *stats.StatsCollector) (jobs.LLMApify, error) { return mockLLM, nil } @@ -177,8 +177,8 @@ var _ = Describe("WebScraper", func() { jobs.NewWebApifyClient = func(apiKey string, s *stats.StatsCollector) (jobs.WebApifyClient, error) { return webapify.NewClient(apiKey, s) } - jobs.NewLLMApifyClient = func(apiKey string, llmKey string, s *stats.StatsCollector) (jobs.LLMApify, error) { - return llmapify.NewClient(apiKey, llmKey, s) + jobs.NewLLMApifyClient = func(apiKey string, llmConfig config.LlmConfig, s *stats.StatsCollector) (jobs.LLMApify, error) { + return llmapify.NewClient(apiKey, llmConfig, s) } }) From bf4399e83542e454ab1f4611d76b2b55da18daff Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 12 Sep 2025 21:00:36 +0200 Subject: [PATCH 22/43] chore: remove web var names from web client --- internal/jobs/webapify/client.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/internal/jobs/webapify/client.go b/internal/jobs/webapify/client.go index c40041da..fef37202 100644 --- a/internal/jobs/webapify/client.go +++ b/internal/jobs/webapify/client.go @@ -12,11 +12,11 @@ import ( ) const ( - WebActorID = "apify~website-content-crawler" + ActorID = "apify~website-content-crawler" ) -type WebApifyClient struct { - apifyClient client.Apify +type ApifyClient struct { + client client.Apify statsCollector *stats.StatsCollector } @@ -27,24 +27,24 @@ var NewInternalClient = func(apiKey string) (client.Apify, error) { } // NewClient creates a new Reddit Apify client -func NewClient(apiToken string, statsCollector *stats.StatsCollector) (*WebApifyClient, error) { - apifyClient, err := NewInternalClient(apiToken) +func NewClient(apiToken string, statsCollector *stats.StatsCollector) (*ApifyClient, error) { + client, err := NewInternalClient(apiToken) if err != nil { return nil, fmt.Errorf("failed to create apify client: %w", err) } - return &WebApifyClient{ - apifyClient: apifyClient, + return &ApifyClient{ + client: client, statsCollector: statsCollector, }, nil } // ValidateApiKey tests if the Apify API token is valid -func (c *WebApifyClient) ValidateApiKey() error { - return c.apifyClient.ValidateApiKey() +func (c *ApifyClient) ValidateApiKey() error { + return c.client.ValidateApiKey() } -func (c *WebApifyClient) Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { +func (c *ApifyClient) Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.WebQueries, 1) } @@ -54,7 +54,7 @@ func (c *WebApifyClient) Scrape(workerID string, args teeargs.WebArguments, curs // TODO: limit could be greater than max pages if max depth is greater than 0? // TODO: need to test this more thoroughly with various request types limit := uint(args.MaxPages) - dataset, nextCursor, err := c.apifyClient.RunActorAndGetResponse(WebActorID, input, cursor, limit) + dataset, nextCursor, err := c.client.RunActorAndGetResponse(ActorID, input, cursor, limit) if err != nil { if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.WebErrors, 1) From 1c894452b444cd4e86109731edafa1d64a3bef7f Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 12 Sep 2025 21:01:07 +0200 Subject: [PATCH 23/43] chore: fix web tests --- internal/jobs/webapify/client_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/jobs/webapify/client_test.go b/internal/jobs/webapify/client_test.go index e585fa89..b377ae53 100644 --- a/internal/jobs/webapify/client_test.go +++ b/internal/jobs/webapify/client_test.go @@ -37,7 +37,7 @@ func (m *MockApifyClient) ValidateApiKey() error { var _ = Describe("WebApifyClient", func() { var ( mockClient *MockApifyClient - webClient *webapify.WebApifyClient + webClient *webapify.ApifyClient apifyKey string geminiKey string ) @@ -64,7 +64,7 @@ var _ = Describe("WebApifyClient", func() { } mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { - Expect(actorID).To(Equal(webapify.WebActorID)) + Expect(actorID).To(Equal(webapify.ActorID)) Expect(limit).To(Equal(uint(2))) return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil } From c7ac898382c736f7b81fd7d0a034eac2ccc7a689 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 12 Sep 2025 21:12:12 +0200 Subject: [PATCH 24/43] chore: add validation method to llmapikey --- internal/capabilities/detector.go | 11 +---------- internal/config/config.go | 18 ++++++++++++++++-- internal/jobs/llmapify/client.go | 4 ++-- internal/jobs/llmapify/client_test.go | 4 ++-- internal/jobs/web.go | 4 ++-- 5 files changed, 23 insertions(+), 18 deletions(-) diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index c2e1583f..d5c310e3 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -43,7 +43,7 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface hasAccounts := len(accounts) > 0 hasApiKeys := len(apiKeys) > 0 hasApifyKey := hasValidApifyKey(apifyApiKey) - hasLLMKey := hasValidLLMKey(geminiApiKey) + hasLLMKey := config.LlmApiKey(geminiApiKey).IsValid() // Add Twitter-specific capabilities based on available authentication if hasAccounts { @@ -162,12 +162,3 @@ func hasValidApifyKey(apifyApiKey string) bool { logrus.Infof("Apify API key validated successfully during capability detection") return true } - -func hasValidLLMKey(llmApiKey string) bool { - if llmApiKey == "" { - return false - } - - // TODO validate the gemini key with a handler - return true -} diff --git a/internal/config/config.go b/internal/config/config.go index 483e35bc..bfe05915 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -303,8 +303,22 @@ func (jc JobConfiguration) GetRedditConfig() RedditConfig { } } +// LlmApiKey represents an LLM API key with validation capabilities +type LlmApiKey string + +// IsValid checks if the LLM API key is valid +func (k LlmApiKey) IsValid() bool { + if k == "" { + return false + } + + // TODO: Add actual Gemini API key validation with a handler + // For now, just check if it's not empty + return true +} + type LlmConfig struct { - GeminiApiKey string + GeminiApiKey LlmApiKey } // WebConfig represents the configuration needed for Web scraping via Apify @@ -318,7 +332,7 @@ type WebConfig struct { func (jc JobConfiguration) GetWebConfig() WebConfig { return WebConfig{ LlmConfig: LlmConfig{ - GeminiApiKey: jc.GetString("gemini_api_key", ""), + GeminiApiKey: LlmApiKey(jc.GetString("gemini_api_key", "")), }, ApifyApiKey: jc.GetString("apify_api_key", ""), } diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go index 060f4508..961e8a03 100644 --- a/internal/jobs/llmapify/client.go +++ b/internal/jobs/llmapify/client.go @@ -41,7 +41,7 @@ func NewClient(apiToken string, llmConfig config.LlmConfig, statsCollector *stat return nil, fmt.Errorf("%w: %v", ErrFailedToCreateClient, err) } - if llmConfig.GeminiApiKey == "" { + if !llmConfig.GeminiApiKey.IsValid() { return nil, ErrProviderKeyRequired } @@ -63,7 +63,7 @@ func (c *ApifyClient) Process(workerID string, args teeargs.LLMProcessorArgument } input := args.ToLLMProcessorRequest() - input.LLMProviderApiKey = c.llmConfig.GeminiApiKey + input.LLMProviderApiKey = string(c.llmConfig.GeminiApiKey) limit := uint(1) // TODO, verify you can only ever operate on one dataset at a time dataset, nextCursor, err := c.client.RunActorAndGetResponse(ActorID, input, cursor, limit) diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index e7f47c64..1b2072aa 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -229,7 +229,7 @@ var _ = Describe("LLMApifyClient", func() { return client.NewApifyClient(apiKey) } - realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: geminiKey}, nil) + realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: config.LlmApiKey(geminiKey)}, nil) Expect(err).NotTo(HaveOccurred()) Expect(realClient.ValidateApiKey()).To(Succeed()) }) @@ -245,7 +245,7 @@ var _ = Describe("LLMApifyClient", func() { return client.NewApifyClient(apiKey) } - realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: geminiKey}, nil) + realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: config.LlmApiKey(geminiKey)}, nil) Expect(err).NotTo(HaveOccurred()) args := teeargs.LLMProcessorArguments{ diff --git a/internal/jobs/web.go b/internal/jobs/web.go index d83cb8a7..f903dd4e 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -60,7 +60,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { logrus.WithField("job_uuid", j.UUID).Info("Starting ExecuteJob for Web scrape") // Require Gemini key for LLM processing in Web flow - if w.configuration.GeminiApiKey == "" { + if !w.configuration.GeminiApiKey.IsValid() { msg := errors.New("Gemini API key is required for Web job") return types.JobResult{Error: msg.Error()}, msg } @@ -139,7 +139,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { func (ws *WebScraper) GetStructuredCapabilities() teetypes.WorkerCapabilities { capabilities := make(teetypes.WorkerCapabilities) - if ws.configuration.ApifyApiKey != "" && ws.configuration.GeminiApiKey != "" { + if ws.configuration.ApifyApiKey != "" && ws.configuration.GeminiApiKey.IsValid() { capabilities[teetypes.WebJob] = teetypes.WebCaps } From 3ff79bd2d878033fcd542ad6b877ff7349aef272 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Sat, 13 Sep 2025 00:33:06 +0200 Subject: [PATCH 25/43] fix: update tee types to 1.1.14 --- go.mod | 3 +-- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 9f6506d1..5b8370d2 100644 --- a/go.mod +++ b/go.mod @@ -11,8 +11,7 @@ require ( github.com/joho/godotenv v1.5.1 github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 - // FIXME: replace when new version is released - github.com/masa-finance/tee-types v1.1.14-0.20250909203048-4de7bc7ab191 + github.com/masa-finance/tee-types v1.1.14 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 5b11c75f..20ce5f92 100644 --- a/go.sum +++ b/go.sum @@ -30,8 +30,8 @@ github.com/labstack/echo/v4 v4.13.4 h1:oTZZW+T3s9gAu5L8vmzihV7/lkXGZuITzTQkTEhcX github.com/labstack/echo/v4 v4.13.4/go.mod h1:g63b33BZ5vZzcIUF8AtRH40DrTlXnx4UMC8rBdndmjQ= github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0= github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= -github.com/masa-finance/tee-types v1.1.14-0.20250909203048-4de7bc7ab191 h1:zzd94bBcRhfUUqhzcgRDOUFwgJRlVR5rTUqAaocjfw4= -github.com/masa-finance/tee-types v1.1.14-0.20250909203048-4de7bc7ab191/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.14 h1:g6tqU1JQx8SSPZNVmkhGAViDIcbrzGsGk77hbrx59GY= +github.com/masa-finance/tee-types v1.1.14/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= From 88cce6f3e2afbb0b2917578b867fe4fe88f6621f Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 15 Sep 2025 22:27:34 +0200 Subject: [PATCH 26/43] chore: casts llmapikey sooner --- internal/capabilities/detector.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index d5c310e3..27a4cd14 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -38,12 +38,12 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface accounts := jc.GetStringSlice("twitter_accounts", nil) apiKeys := jc.GetStringSlice("twitter_api_keys", nil) apifyApiKey := jc.GetString("apify_api_key", "") - geminiApiKey := jc.GetString("gemini_api_key", "") + geminiApiKey := config.LlmApiKey(jc.GetString("gemini_api_key", "")) hasAccounts := len(accounts) > 0 hasApiKeys := len(apiKeys) > 0 hasApifyKey := hasValidApifyKey(apifyApiKey) - hasLLMKey := config.LlmApiKey(geminiApiKey).IsValid() + hasLLMKey := geminiApiKey.IsValid() // Add Twitter-specific capabilities based on available authentication if hasAccounts { From 3b6d02aa5e98f729e1634ca6433aac0e50f6f382 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 15 Sep 2025 22:42:17 +0200 Subject: [PATCH 27/43] chore: better if/else --- internal/jobs/tiktok.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/internal/jobs/tiktok.go b/internal/jobs/tiktok.go index 2eff0820..fff685c3 100644 --- a/internal/jobs/tiktok.go +++ b/internal/jobs/tiktok.go @@ -244,11 +244,9 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *teeargs.TikTo } if vttText == "" { - errMsg := "" + errMsg := "No transcripts found in API response" if tiktokArgs.HasLanguagePreference() { errMsg = fmt.Sprintf("Transcript for requested language %s not found in API response", languageCode) - } else { - errMsg = fmt.Sprintf("No transcripts found in API response") } logrus.WithFields(logrus.Fields{ "job_uuid": j.UUID, From e43c635819ddc70a4b79da0c14116582a6086da2 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 15 Sep 2025 22:46:06 +0200 Subject: [PATCH 28/43] fix: rename variable --- internal/jobs/web.go | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/internal/jobs/web.go b/internal/jobs/web.go index f903dd4e..39fa44bf 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -15,6 +15,7 @@ import ( "github.com/masa-finance/tee-worker/pkg/client" teeargs "github.com/masa-finance/tee-types/args" + "github.com/masa-finance/tee-types/pkg/util" teetypes "github.com/masa-finance/tee-types/types" ) @@ -82,7 +83,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: "error while scraping Web"}, fmt.Errorf("error creating Web Apify client: %w", err) } - resp, datasetId, cursor, err := webClient.Scrape(j.WorkerID, *webArgs, client.EmptyCursor) + webResp, datasetId, cursor, err := webClient.Scrape(j.WorkerID, *webArgs, client.EmptyCursor) if err != nil { return types.JobResult{Error: fmt.Sprintf("error while scraping Web: %s", err.Error())}, fmt.Errorf("error scraping Web: %w", err) } @@ -108,17 +109,14 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: fmt.Sprintf("error while processing LLM: %s", llmErr.Error())}, fmt.Errorf("error processing LLM: %w", llmErr) } - max := len(resp) - if len(llmResp) < max { - max = len(llmResp) - } + max := util.Min(len(webResp), len(llmResp)) for i := 0; i < max; i++ { - if resp[i] != nil { - resp[i].LLMResponse = llmResp[i].LLMResponse + if webResp[i] != nil { + webResp[i].LLMResponse = llmResp[i].LLMResponse } } - data, err := json.Marshal(resp) + data, err := json.Marshal(webResp) if err != nil { return types.JobResult{Error: fmt.Sprintf("error marshalling Web response")}, fmt.Errorf("error marshalling Web response: %w", err) } From 6c9d5fe137c5a1414f4ad9dd592c4595900835ef Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 15 Sep 2025 22:50:22 +0200 Subject: [PATCH 29/43] chore: move skip --- internal/jobs/web_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 925bb01d..876e4a27 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -173,6 +173,10 @@ var _ = Describe("WebScraper", func() { apifyKey = os.Getenv("APIFY_API_KEY") geminiKey = os.Getenv("GEMINI_API_KEY") + if apifyKey == "" || geminiKey == "" { + Skip("APIFY_API_KEY and GEMINI_API_KEY required for integration web integration tests") + } + // Reset to use real client for integration tests jobs.NewWebApifyClient = func(apiKey string, s *stats.StatsCollector) (jobs.WebApifyClient, error) { return webapify.NewClient(apiKey, s) @@ -183,10 +187,6 @@ var _ = Describe("WebScraper", func() { }) It("should execute a real web scraping job when keys is set", func() { - if apifyKey == "" || geminiKey == "" { - Skip("APIFY_API_KEY and GEMINI_API_KEY required for integration web integration tests") - } - cfg := config.JobConfiguration{ "apify_api_key": apifyKey, "gemini_api_key": geminiKey, From 65c5c18505863837a19ae042f6d4aebcb215e327 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 15 Sep 2025 23:01:01 +0200 Subject: [PATCH 30/43] chore: fix test to test multiple pages --- internal/jobs/web_test.go | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 876e4a27..4f9e669b 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -3,6 +3,7 @@ package jobs_test import ( "encoding/json" "errors" + "fmt" "os" . "github.com/onsi/ginkgo/v2" @@ -186,7 +187,7 @@ var _ = Describe("WebScraper", func() { } }) - It("should execute a real web scraping job when keys is set", func() { + FIt("should execute a real web scraping job when keys is set", func() { cfg := config.JobConfiguration{ "apify_api_key": apifyKey, "gemini_api_key": geminiKey, @@ -199,9 +200,9 @@ var _ = Describe("WebScraper", func() { Type: teetypes.WebJob, Arguments: map[string]any{ "type": teetypes.WebScraper, - "url": "https://example.com/", - "max_depth": 0, - "max_pages": 1, + "url": "https://docs.learnbittensor.org", + "max_depth": 1, + "max_pages": 3, }, } @@ -212,11 +213,18 @@ var _ = Describe("WebScraper", func() { var resp []*teetypes.WebScraperResult err = json.Unmarshal(result.Data, &resp) + Expect(err).NotTo(HaveOccurred()) - Expect(resp).NotTo(BeEmpty()) - Expect(resp[0]).NotTo(BeNil()) - Expect(resp[0].URL).To(Equal("https://example.com/")) - Expect(resp[0].LLMResponse).NotTo(BeEmpty()) + Expect(resp).To(HaveLen(3)) + + for i := 0; i < 3; i++ { + Expect(resp[i]).NotTo(BeNil()) + Expect(resp[i].URL).To(ContainSubstring("https://docs.learnbittensor.org/")) + Expect(resp[i].LLMResponse).NotTo(BeEmpty()) + Expect(resp[i].Markdown).NotTo(BeEmpty()) + Expect(resp[i].Text).NotTo(BeEmpty()) + fmt.Println(resp[i].LLMResponse) + } }) It("should expose capabilities only when both APIFY and GEMINI keys are present", func() { From c9878914b912ebcd37a23d715f85439ea3eacaba Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 15 Sep 2025 23:29:25 +0200 Subject: [PATCH 31/43] fix: support max pages in llm args and client --- go.mod | 3 ++- go.sum | 4 ++++ internal/jobs/web.go | 3 ++- internal/jobs/web_test.go | 11 +++++++---- internal/jobs/webapify/client.go | 2 -- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index 5b8370d2..c2ab048a 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,8 @@ require ( github.com/joho/godotenv v1.5.1 github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 - github.com/masa-finance/tee-types v1.1.14 + // FIXME: update once we have a new release + github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 20ce5f92..eecf6933 100644 --- a/go.sum +++ b/go.sum @@ -30,6 +30,10 @@ github.com/labstack/echo/v4 v4.13.4 h1:oTZZW+T3s9gAu5L8vmzihV7/lkXGZuITzTQkTEhcX github.com/labstack/echo/v4 v4.13.4/go.mod h1:g63b33BZ5vZzcIUF8AtRH40DrTlXnx4UMC8rBdndmjQ= github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0= github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= +github.com/masa-finance/tee-types v1.1.14-0.20250915212410-68d7a9a7a802 h1:mV6C1lZDGz5MXxJXwyWjlVLoS9Q1JFs0Jd3EmdFniOU= +github.com/masa-finance/tee-types v1.1.14-0.20250915212410-68d7a9a7a802/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6 h1:Wue5Rkl26SkU9t6RnmijHJr2UJxvaTjqiJLMM1HwBQQ= +github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14 h1:g6tqU1JQx8SSPZNVmkhGAViDIcbrzGsGk77hbrx59GY= github.com/masa-finance/tee-types v1.1.14/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 39fa44bf..98d77a48 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -100,9 +100,10 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { llmArgs := teeargs.LLMProcessorArguments{ DatasetId: datasetId, - Prompt: "summarize the content of this webpage, focusing on keywords and topics: ${markdown}", + Prompt: "summarize the content of this webpage in plain text, focusing on keywords and topics: ${markdown}", MaxTokens: teeargs.LLMDefaultMaxTokens, Temperature: teeargs.LLMDefaultTemperature, + MaxPages: webArgs.MaxPages, } llmResp, _, llmErr := llmClient.Process(j.WorkerID, llmArgs, client.EmptyCursor) if llmErr != nil { diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 4f9e669b..9a0c3135 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -195,14 +195,17 @@ var _ = Describe("WebScraper", func() { integrationStatsCollector := stats.StartCollector(128, cfg) integrationScraper := jobs.NewWebScraper(cfg, integrationStatsCollector) + maxDepth := 1 + maxPages := 3 + job := types.Job{ UUID: "integration-test-uuid", Type: teetypes.WebJob, Arguments: map[string]any{ "type": teetypes.WebScraper, "url": "https://docs.learnbittensor.org", - "max_depth": 1, - "max_pages": 3, + "max_depth": maxDepth, + "max_pages": maxPages, }, } @@ -217,12 +220,12 @@ var _ = Describe("WebScraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(resp).To(HaveLen(3)) - for i := 0; i < 3; i++ { + for i := 0; i < maxPages; i++ { Expect(resp[i]).NotTo(BeNil()) Expect(resp[i].URL).To(ContainSubstring("https://docs.learnbittensor.org/")) Expect(resp[i].LLMResponse).NotTo(BeEmpty()) Expect(resp[i].Markdown).NotTo(BeEmpty()) - Expect(resp[i].Text).NotTo(BeEmpty()) + Expect(resp[i].Text).To(ContainSubstring("Bittensor")) fmt.Println(resp[i].LLMResponse) } }) diff --git a/internal/jobs/webapify/client.go b/internal/jobs/webapify/client.go index fef37202..601f23e9 100644 --- a/internal/jobs/webapify/client.go +++ b/internal/jobs/webapify/client.go @@ -51,8 +51,6 @@ func (c *ApifyClient) Scrape(workerID string, args teeargs.WebArguments, cursor input := args.ToWebScraperRequest() - // TODO: limit could be greater than max pages if max depth is greater than 0? - // TODO: need to test this more thoroughly with various request types limit := uint(args.MaxPages) dataset, nextCursor, err := c.client.RunActorAndGetResponse(ActorID, input, cursor, limit) if err != nil { From 291382daf44d04b3aff1b3bfb0d575d774450471 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 15 Sep 2025 23:32:48 +0200 Subject: [PATCH 32/43] fix: support max pages in llm worker --- internal/jobs/llmapify/client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go index 961e8a03..d533cdfd 100644 --- a/internal/jobs/llmapify/client.go +++ b/internal/jobs/llmapify/client.go @@ -65,7 +65,7 @@ func (c *ApifyClient) Process(workerID string, args teeargs.LLMProcessorArgument input := args.ToLLMProcessorRequest() input.LLMProviderApiKey = string(c.llmConfig.GeminiApiKey) - limit := uint(1) // TODO, verify you can only ever operate on one dataset at a time + limit := uint(args.MaxPages) dataset, nextCursor, err := c.client.RunActorAndGetResponse(ActorID, input, cursor, limit) if err != nil { if c.statsCollector != nil { From ab0c93ed91553a4dbba7cbdec1c40746fe991343 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 15 Sep 2025 23:34:56 +0200 Subject: [PATCH 33/43] chore: massage prompt --- internal/jobs/web.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 98d77a48..9be6fbb1 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -100,7 +100,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { llmArgs := teeargs.LLMProcessorArguments{ DatasetId: datasetId, - Prompt: "summarize the content of this webpage in plain text, focusing on keywords and topics: ${markdown}", + Prompt: "summarize the content of this webpage, focusing on keywords and topics: ${markdown}", MaxTokens: teeargs.LLMDefaultMaxTokens, Temperature: teeargs.LLMDefaultTemperature, MaxPages: webArgs.MaxPages, From 8eb96e570d7950e03a18228a22a0536cd18d3b83 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 15 Sep 2025 23:38:14 +0200 Subject: [PATCH 34/43] fix: remove focused test --- internal/jobs/web_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 9a0c3135..1e04f2a6 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -187,7 +187,7 @@ var _ = Describe("WebScraper", func() { } }) - FIt("should execute a real web scraping job when keys is set", func() { + It("should execute a real web scraping job when keys is set", func() { cfg := config.JobConfiguration{ "apify_api_key": apifyKey, "gemini_api_key": geminiKey, From 7632f4d04a2ab58316b78cf88aaa95b9efb3c513 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 16 Sep 2025 00:18:08 +0200 Subject: [PATCH 35/43] fix: upgrade tee types --- go.mod | 2 +- go.sum | 2 ++ internal/jobs/llmapify/client.go | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index c2ab048a..1ce1e62f 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 // FIXME: update once we have a new release - github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6 + github.com/masa-finance/tee-types v1.1.14-0.20250915221707-1ea688bcd6df github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index eecf6933..9bc075df 100644 --- a/go.sum +++ b/go.sum @@ -34,6 +34,8 @@ github.com/masa-finance/tee-types v1.1.14-0.20250915212410-68d7a9a7a802 h1:mV6C1 github.com/masa-finance/tee-types v1.1.14-0.20250915212410-68d7a9a7a802/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6 h1:Wue5Rkl26SkU9t6RnmijHJr2UJxvaTjqiJLMM1HwBQQ= github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.14-0.20250915221707-1ea688bcd6df h1:3N0bfy4juK5IiLyW0DnYntPjBkZkNKqlm6H2MqmaDoU= +github.com/masa-finance/tee-types v1.1.14-0.20250915221707-1ea688bcd6df/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14 h1:g6tqU1JQx8SSPZNVmkhGAViDIcbrzGsGk77hbrx59GY= github.com/masa-finance/tee-types v1.1.14/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go index d533cdfd..e6d5b977 100644 --- a/internal/jobs/llmapify/client.go +++ b/internal/jobs/llmapify/client.go @@ -65,7 +65,7 @@ func (c *ApifyClient) Process(workerID string, args teeargs.LLMProcessorArgument input := args.ToLLMProcessorRequest() input.LLMProviderApiKey = string(c.llmConfig.GeminiApiKey) - limit := uint(args.MaxPages) + limit := uint(args.Items) dataset, nextCursor, err := c.client.RunActorAndGetResponse(ActorID, input, cursor, limit) if err != nil { if c.statsCollector != nil { From 755fb627bda8743a2e327f2c3c162fb9403cb6b3 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 16 Sep 2025 00:18:28 +0200 Subject: [PATCH 36/43] fix: web pages to llm items --- internal/jobs/web.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 9be6fbb1..590bcf97 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -103,7 +103,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { Prompt: "summarize the content of this webpage, focusing on keywords and topics: ${markdown}", MaxTokens: teeargs.LLMDefaultMaxTokens, Temperature: teeargs.LLMDefaultTemperature, - MaxPages: webArgs.MaxPages, + Items: webArgs.MaxPages, } llmResp, _, llmErr := llmClient.Process(j.WorkerID, llmArgs, client.EmptyCursor) if llmErr != nil { From bfe72aef088935472e4315333def65dcee2608d4 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 16 Sep 2025 00:19:05 +0200 Subject: [PATCH 37/43] fix: remove print ln --- internal/jobs/web_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 1e04f2a6..27dd1804 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -3,7 +3,6 @@ package jobs_test import ( "encoding/json" "errors" - "fmt" "os" . "github.com/onsi/ginkgo/v2" @@ -226,7 +225,6 @@ var _ = Describe("WebScraper", func() { Expect(resp[i].LLMResponse).NotTo(BeEmpty()) Expect(resp[i].Markdown).NotTo(BeEmpty()) Expect(resp[i].Text).To(ContainSubstring("Bittensor")) - fmt.Println(resp[i].LLMResponse) } }) From 7539e7ce1f96f80c1b01daab4cda28d7b4459591 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 16 Sep 2025 00:19:24 +0200 Subject: [PATCH 38/43] fix: clear no sub string --- internal/jobs/web_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index 27dd1804..b24d3b34 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -221,7 +221,7 @@ var _ = Describe("WebScraper", func() { for i := 0; i < maxPages; i++ { Expect(resp[i]).NotTo(BeNil()) - Expect(resp[i].URL).To(ContainSubstring("https://docs.learnbittensor.org/")) + Expect(resp[i].URL).To(ContainSubstring("https://docs.learnbittensor.org")) Expect(resp[i].LLMResponse).NotTo(BeEmpty()) Expect(resp[i].Markdown).NotTo(BeEmpty()) Expect(resp[i].Text).To(ContainSubstring("Bittensor")) From ef6957ef052ea46beb1dbb06bed5bce9eb1cb30d Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 16 Sep 2025 00:23:19 +0200 Subject: [PATCH 39/43] chore: map to web response --- internal/jobs/web.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 590bcf97..53f7b647 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -103,7 +103,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { Prompt: "summarize the content of this webpage, focusing on keywords and topics: ${markdown}", MaxTokens: teeargs.LLMDefaultMaxTokens, Temperature: teeargs.LLMDefaultTemperature, - Items: webArgs.MaxPages, + Items: len(webResp), } llmResp, _, llmErr := llmClient.Process(j.WorkerID, llmArgs, client.EmptyCursor) if llmErr != nil { @@ -123,7 +123,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { } if w.statsCollector != nil { - w.statsCollector.Add(j.WorkerID, stats.WebProcessedPages, uint(len(llmResp))) + w.statsCollector.Add(j.WorkerID, stats.WebProcessedPages, uint(max)) } return types.JobResult{ From 66501e2d8fc26a6eaa6b4ce81da5de8c3168c668 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 16 Sep 2025 00:24:43 +0200 Subject: [PATCH 40/43] fix: make api test more accepting --- internal/api/api_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/api/api_test.go b/internal/api/api_test.go index a2e2caab..3af0d603 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -77,8 +77,7 @@ var _ = Describe("API", func() { Type: teetypes.TiktokJob, Arguments: map[string]interface{}{ "type": "transcription", - "video_url": "https://www.tiktok.com/@.jake.ai/video/7516694182245813509", - "language": "eng-US", + "video_url": "https://www.tiktok.com/@theblockrunner.com/video/7227579907361066282", }, } // Step 2: Get a Job signature From 8eb686c03c610ba549c7b1333b1d79f8b7bf6c42 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 16 Sep 2025 20:05:12 +0200 Subject: [PATCH 41/43] chore: update tee types --- go.mod | 2 +- go.sum | 2 ++ internal/jobs/llmapify/client_test.go | 2 +- internal/jobs/web.go | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 1ce1e62f..00346601 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 // FIXME: update once we have a new release - github.com/masa-finance/tee-types v1.1.14-0.20250915221707-1ea688bcd6df + github.com/masa-finance/tee-types v1.1.14-0.20250916175139-4466953b9926 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 9bc075df..dfd9e53f 100644 --- a/go.sum +++ b/go.sum @@ -36,6 +36,8 @@ github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6 h1:Wue5R github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14-0.20250915221707-1ea688bcd6df h1:3N0bfy4juK5IiLyW0DnYntPjBkZkNKqlm6H2MqmaDoU= github.com/masa-finance/tee-types v1.1.14-0.20250915221707-1ea688bcd6df/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.14-0.20250916175139-4466953b9926 h1:3GPv8/DY1ILhk0ClvugD2JgC6U30oiGW/uAQ6SQNUe0= +github.com/masa-finance/tee-types v1.1.14-0.20250916175139-4466953b9926/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/tee-types v1.1.14 h1:g6tqU1JQx8SSPZNVmkhGAViDIcbrzGsGk77hbrx59GY= github.com/masa-finance/tee-types v1.1.14/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index 1b2072aa..7743c2b8 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -181,7 +181,7 @@ var _ = Describe("LLMApifyClient", func() { DatasetId: "test-dataset-id", Prompt: "test-prompt", MaxTokens: 500, - Temperature: "0.5", + Temperature: 0.5, } mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 53f7b647..75d0d0dc 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -103,7 +103,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { Prompt: "summarize the content of this webpage, focusing on keywords and topics: ${markdown}", MaxTokens: teeargs.LLMDefaultMaxTokens, Temperature: teeargs.LLMDefaultTemperature, - Items: len(webResp), + Items: uint(len(webResp)), } llmResp, _, llmErr := llmClient.Process(j.WorkerID, llmArgs, client.EmptyCursor) if llmErr != nil { From 3916a8e686cb2ebcc58e92b981d8a398eacfebb4 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 16 Sep 2025 20:10:09 +0200 Subject: [PATCH 42/43] fix: worker test --- internal/jobs/llmapify/client_test.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index 7743c2b8..ed81ae60 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "os" + "strconv" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -78,11 +79,11 @@ var _ = Describe("LLMApifyClient", func() { Expect(ok).To(BeTrue()) Expect(request.InputDatasetId).To(Equal("test-dataset-id")) Expect(request.Prompt).To(Equal("test-prompt")) - Expect(request.LLMProviderApiKey).To(Equal("test-llm-key")) // should be set from constructor - Expect(request.Model).To(Equal(teeargs.LLMDefaultModel)) // default model - Expect(request.MultipleColumns).To(Equal(teeargs.LLMDefaultMultipleColumns)) // default value - Expect(request.MaxTokens).To(Equal(teeargs.LLMDefaultMaxTokens)) // default value - Expect(request.Temperature).To(Equal(teeargs.LLMDefaultTemperature)) // default value + Expect(request.LLMProviderApiKey).To(Equal("test-llm-key")) // should be set from constructor + Expect(request.Model).To(Equal(teeargs.LLMDefaultModel)) // default model + Expect(request.MultipleColumns).To(Equal(teeargs.LLMDefaultMultipleColumns)) // default value + Expect(request.MaxTokens).To(Equal(teeargs.LLMDefaultMaxTokens)) // default value + Expect(request.Temperature).To(Equal(strconv.FormatFloat(teeargs.LLMDefaultTemperature, 'f', -1, 64))) // default value return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil } @@ -187,7 +188,7 @@ var _ = Describe("LLMApifyClient", func() { mockClient.RunActorAndGetResponseFunc = func(actorID string, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { request, ok := input.(teetypes.LLMProcessorRequest) Expect(ok).To(BeTrue()) - Expect(request.MaxTokens).To(Equal(500)) + Expect(request.MaxTokens).To(Equal(uint(500))) Expect(request.Temperature).To(Equal("0.5")) Expect(request.LLMProviderApiKey).To(Equal("test-llm-key")) // should be set from constructor From ddd3f2f0c0b4f8a082bbdbc2f487c129ffc0e19d Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 17 Sep 2025 19:11:26 +0200 Subject: [PATCH 43/43] chore: bump tee types --- go.mod | 3 +-- go.sum | 12 ++---------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index 00346601..8aef56f6 100644 --- a/go.mod +++ b/go.mod @@ -11,8 +11,7 @@ require ( github.com/joho/godotenv v1.5.1 github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 - // FIXME: update once we have a new release - github.com/masa-finance/tee-types v1.1.14-0.20250916175139-4466953b9926 + github.com/masa-finance/tee-types v1.1.15 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index dfd9e53f..bdb89acb 100644 --- a/go.sum +++ b/go.sum @@ -30,16 +30,8 @@ github.com/labstack/echo/v4 v4.13.4 h1:oTZZW+T3s9gAu5L8vmzihV7/lkXGZuITzTQkTEhcX github.com/labstack/echo/v4 v4.13.4/go.mod h1:g63b33BZ5vZzcIUF8AtRH40DrTlXnx4UMC8rBdndmjQ= github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0= github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= -github.com/masa-finance/tee-types v1.1.14-0.20250915212410-68d7a9a7a802 h1:mV6C1lZDGz5MXxJXwyWjlVLoS9Q1JFs0Jd3EmdFniOU= -github.com/masa-finance/tee-types v1.1.14-0.20250915212410-68d7a9a7a802/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= -github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6 h1:Wue5Rkl26SkU9t6RnmijHJr2UJxvaTjqiJLMM1HwBQQ= -github.com/masa-finance/tee-types v1.1.14-0.20250915212635-8ad23d0c9eb6/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= -github.com/masa-finance/tee-types v1.1.14-0.20250915221707-1ea688bcd6df h1:3N0bfy4juK5IiLyW0DnYntPjBkZkNKqlm6H2MqmaDoU= -github.com/masa-finance/tee-types v1.1.14-0.20250915221707-1ea688bcd6df/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= -github.com/masa-finance/tee-types v1.1.14-0.20250916175139-4466953b9926 h1:3GPv8/DY1ILhk0ClvugD2JgC6U30oiGW/uAQ6SQNUe0= -github.com/masa-finance/tee-types v1.1.14-0.20250916175139-4466953b9926/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= -github.com/masa-finance/tee-types v1.1.14 h1:g6tqU1JQx8SSPZNVmkhGAViDIcbrzGsGk77hbrx59GY= -github.com/masa-finance/tee-types v1.1.14/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.15 h1:DfTNAYsG5g3XPxzJ2kw1bbT536mOeux3ZxaAq8XnNLg= +github.com/masa-finance/tee-types v1.1.15/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=