From 543f98d6b5897fbd5902370241fdddb1326f8772 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 28 Oct 2025 02:28:10 +0100 Subject: [PATCH 1/5] chore: add residential proxy to tiktok query --- internal/jobs/tiktokapify/client.go | 44 ++++++++++------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/internal/jobs/tiktokapify/client.go b/internal/jobs/tiktokapify/client.go index 46cc02d7..123758d2 100644 --- a/internal/jobs/tiktokapify/client.go +++ b/internal/jobs/tiktokapify/client.go @@ -11,12 +11,17 @@ import ( "github.com/masa-finance/tee-worker/v2/pkg/client" ) +type Proxy struct { + UseApifyProxy bool `json:"useApifyProxy"` + ApifyProxyGroups []string `json:"apifyProxyGroups"` +} + type TikTokSearchByQueryRequest struct { - SearchTerms []string `json:"search"` - StartUrls []string `json:"startUrls"` - MaxItems uint `json:"maxItems"` - EndPage uint `json:"endPage"` - Proxy map[string]any `json:"proxy"` + SearchTerms []string `json:"search"` + StartUrls []string `json:"startUrls"` + MaxItems uint `json:"maxItems"` + EndPage uint `json:"endPage"` + Proxy Proxy `json:"proxy"` } type TikTokSearchByTrendingRequest struct { @@ -61,21 +66,11 @@ func (c *TikTokApifyClient) SearchByQuery(input query.Arguments, cursor client.C StartUrls: startUrls, MaxItems: input.MaxItems, EndPage: input.EndPage, - Proxy: map[string]any{"useApifyProxy": true}, - } - - // Convert struct to map[string]any for Apify client - requestBytes, err := json.Marshal(request) - if err != nil { - return nil, "", fmt.Errorf("failed to marshal request: %w", err) - } - - var apifyInput map[string]any - if err := json.Unmarshal(requestBytes, &apifyInput); err != nil { - return nil, "", fmt.Errorf("failed to unmarshal to map: %w", err) + Proxy: Proxy{UseApifyProxy: true, ApifyProxyGroups: []string{"RESIDENTIAL"}}, } - dataset, next, err := c.apify.RunActorAndGetResponse(apify.ActorIds.TikTokSearchScraper, apifyInput, cursor, limit) + // Pass the typed request directly to the Apify client + dataset, next, err := c.apify.RunActorAndGetResponse(apify.ActorIds.TikTokSearchScraper, request, cursor, limit) if err != nil { return nil, "", fmt.Errorf("apify run (search): %w", err) } @@ -101,17 +96,8 @@ func (c *TikTokApifyClient) SearchByTrending(input trending.Arguments, cursor cl Period: input.Period, } - requestBytes, err := json.Marshal(request) - if err != nil { - return nil, "", fmt.Errorf("failed to marshal request: %w", err) - } - - var apifyInput map[string]any - if err := json.Unmarshal(requestBytes, &apifyInput); err != nil { - return nil, "", fmt.Errorf("failed to unmarshal to map: %w", err) - } - - dataset, next, err := c.apify.RunActorAndGetResponse(apify.ActorIds.TikTokTrendingScraper, apifyInput, cursor, limit) + // Pass the typed request directly to the Apify client + dataset, next, err := c.apify.RunActorAndGetResponse(apify.ActorIds.TikTokTrendingScraper, request, cursor, limit) if err != nil { return nil, "", fmt.Errorf("apify run (trending): %w", err) } From 5f6418a55019f9bb9c380f318b994d63a2e880e2 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 28 Oct 2025 02:46:23 +0100 Subject: [PATCH 2/5] feat: add residential proxies to tiktok query and web --- api/args/web/scraper/scraper.go | 1 + api/types/apify.go | 6 ++++++ api/types/web.go | 1 + internal/jobs/tiktok_test.go | 1 - internal/jobs/tiktokapify/client.go | 17 ++++++----------- 5 files changed, 14 insertions(+), 12 deletions(-) create mode 100644 api/types/apify.go diff --git a/api/args/web/scraper/scraper.go b/api/args/web/scraper/scraper.go index 95ba1655..73cee61f 100644 --- a/api/args/web/scraper/scraper.go +++ b/api/args/web/scraper/scraper.go @@ -94,6 +94,7 @@ func (w Arguments) ToScraperRequest() types.WebScraperRequest { MaxCrawlPages: w.MaxPages, RespectRobotsTxtFile: DefaultRespectRobotsTxtFile, SaveMarkdown: DefaultSaveMarkdown, + Proxy: types.ApifyProxy{UseApifyProxy: true, ApifyProxyGroups: []string{"RESIDENTIAL"}}, } } diff --git a/api/types/apify.go b/api/types/apify.go new file mode 100644 index 00000000..bf2fa7b8 --- /dev/null +++ b/api/types/apify.go @@ -0,0 +1,6 @@ +package types + +type ApifyProxy struct { + UseApifyProxy bool `json:"useApifyProxy"` + ApifyProxyGroups []string `json:"apifyProxyGroups"` +} diff --git a/api/types/web.go b/api/types/web.go index dda1cea0..843b0bdd 100644 --- a/api/types/web.go +++ b/api/types/web.go @@ -23,6 +23,7 @@ type WebScraperRequest struct { MaxCrawlPages int `json:"maxCrawlPages"` RespectRobotsTxtFile bool `json:"respectRobotsTxtFile"` SaveMarkdown bool `json:"saveMarkdown"` + Proxy ApifyProxy `json:"proxyConfiguration"` } // WebCrawlInfo contains information about the crawling process diff --git a/internal/jobs/tiktok_test.go b/internal/jobs/tiktok_test.go index 65321907..c990c1c6 100644 --- a/internal/jobs/tiktok_test.go +++ b/internal/jobs/tiktok_test.go @@ -179,7 +179,6 @@ var _ = Describe("TikTok", func() { "search": []string{"crypto", "ai"}, "max_items": 5, "end_page": 1, - "proxy": map[string]any{"use_apify_proxy": true}, }, WorkerID: "tiktok-test-worker-search-query", Timeout: 60 * time.Second, diff --git a/internal/jobs/tiktokapify/client.go b/internal/jobs/tiktokapify/client.go index 123758d2..ae65d8c3 100644 --- a/internal/jobs/tiktokapify/client.go +++ b/internal/jobs/tiktokapify/client.go @@ -11,17 +11,12 @@ import ( "github.com/masa-finance/tee-worker/v2/pkg/client" ) -type Proxy struct { - UseApifyProxy bool `json:"useApifyProxy"` - ApifyProxyGroups []string `json:"apifyProxyGroups"` -} - type TikTokSearchByQueryRequest struct { - SearchTerms []string `json:"search"` - StartUrls []string `json:"startUrls"` - MaxItems uint `json:"maxItems"` - EndPage uint `json:"endPage"` - Proxy Proxy `json:"proxy"` + SearchTerms []string `json:"search"` + StartUrls []string `json:"startUrls"` + MaxItems uint `json:"maxItems"` + EndPage uint `json:"endPage"` + Proxy types.ApifyProxy `json:"proxy"` } type TikTokSearchByTrendingRequest struct { @@ -66,7 +61,7 @@ func (c *TikTokApifyClient) SearchByQuery(input query.Arguments, cursor client.C StartUrls: startUrls, MaxItems: input.MaxItems, EndPage: input.EndPage, - Proxy: Proxy{UseApifyProxy: true, ApifyProxyGroups: []string{"RESIDENTIAL"}}, + Proxy: types.ApifyProxy{UseApifyProxy: true, ApifyProxyGroups: []string{"RESIDENTIAL"}}, } // Pass the typed request directly to the Apify client From a8d1ab08c9c431a7bdbdf9369361d25d458d6514 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 29 Oct 2025 21:47:46 +0100 Subject: [PATCH 3/5] chore: support twitter metadata --- internal/jobs/twitter.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/internal/jobs/twitter.go b/internal/jobs/twitter.go index 603982b3..11c7bedd 100644 --- a/internal/jobs/twitter.go +++ b/internal/jobs/twitter.go @@ -38,6 +38,7 @@ func (ts *TwitterScraper) convertTwitterScraperTweetToTweetResult(tweet twitters TweetID: tweet.ID, ConversationID: tweet.ConversationID, UserID: tweet.UserID, + AuthorID: tweet.UserID, // For scraper tweets, UserID is the author Text: tweet.Text, CreatedAt: createdAt, Timestamp: tweet.Timestamp, @@ -78,6 +79,15 @@ func (ts *TwitterScraper) convertTwitterScraperTweetToTweetResult(tweet twitters RetweetedStatusID: tweet.RetweetedStatusID, Views: tweet.Views, SensitiveContent: tweet.SensitiveContent, + PublicMetrics: types.PublicMetrics{ + LikeCount: tweet.Likes, + ReplyCount: tweet.Replies, + RetweetCount: tweet.Retweets, + QuoteCount: 0, // Not available from scraper + BookmarkCount: 0, // Not available from scraper + ImpressionCount: tweet.Views, // Views maps to impressions + }, + PossiblySensitive: tweet.SensitiveContent, // Map sensitive content flag } } From 310f5044c0e56062cfcb257eb9fd31672ce74698 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 29 Oct 2025 22:12:37 +0100 Subject: [PATCH 4/5] chore: added testing --- internal/jobs/twitter.go | 12 +++++------- internal/jobs/twitter_test.go | 27 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/internal/jobs/twitter.go b/internal/jobs/twitter.go index 11c7bedd..fa4dd751 100644 --- a/internal/jobs/twitter.go +++ b/internal/jobs/twitter.go @@ -38,7 +38,6 @@ func (ts *TwitterScraper) convertTwitterScraperTweetToTweetResult(tweet twitters TweetID: tweet.ID, ConversationID: tweet.ConversationID, UserID: tweet.UserID, - AuthorID: tweet.UserID, // For scraper tweets, UserID is the author Text: tweet.Text, CreatedAt: createdAt, Timestamp: tweet.Timestamp, @@ -80,14 +79,13 @@ func (ts *TwitterScraper) convertTwitterScraperTweetToTweetResult(tweet twitters Views: tweet.Views, SensitiveContent: tweet.SensitiveContent, PublicMetrics: types.PublicMetrics{ - LikeCount: tweet.Likes, - ReplyCount: tweet.Replies, - RetweetCount: tweet.Retweets, - QuoteCount: 0, // Not available from scraper - BookmarkCount: 0, // Not available from scraper + LikeCount: tweet.Likes, + ReplyCount: tweet.Replies, + RetweetCount: tweet.Retweets, + QuoteCount: 0, // Not available from scraper + BookmarkCount: 0, // Not available from scraper ImpressionCount: tweet.Views, // Views maps to impressions }, - PossiblySensitive: tweet.SensitiveContent, // Map sensitive content flag } } diff --git a/internal/jobs/twitter_test.go b/internal/jobs/twitter_test.go index cdb78869..010ee11f 100644 --- a/internal/jobs/twitter_test.go +++ b/internal/jobs/twitter_test.go @@ -698,6 +698,33 @@ var _ = Describe("Twitter Scraper", func() { Expect(statsCollector.Stats.Stats[j.WorkerID][stats.TwitterTweets]).To(BeNumerically("==", uint(len(results)))) }) + It("should scrape tweets with query", func() { + j := types.Job{ + Type: types.TwitterJob, + Arguments: map[string]interface{}{ + "type": types.CapSearchByQuery, + "query": "from:gopher_ai", + "max_results": 10, + }, + Timeout: 10 * time.Second, + } + res, err := twitterScraper.ExecuteJob(j) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Error).To(BeEmpty()) + + var results []*types.TweetResult + err = res.Unmarshal(&results) + Expect(err).NotTo(HaveOccurred()) + Expect(results).ToNot(BeEmpty()) + + // Wait briefly for asynchronous stats processing to complete + time.Sleep(100 * time.Millisecond) + + Expect(results[0].Text).ToNot(BeEmpty()) + Expect(statsCollector.Stats.Stats[j.WorkerID][stats.TwitterScrapes]).To(BeNumerically("==", 1)) + Expect(statsCollector.Stats.Stats[j.WorkerID][stats.TwitterTweets]).To(BeNumerically("==", uint(len(results)))) + }) + It("should scrape tweets with a search by full archive", func() { Skip("Needs full archive key (elevated) in TWITTER_API_KEYS to run") From eab22fa805b50a59d2b2ac5b6bc07349652613ad Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 3 Nov 2025 17:53:34 +0100 Subject: [PATCH 5/5] chore: add resi proxy to reddit --- internal/jobs/redditapify/client.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/jobs/redditapify/client.go b/internal/jobs/redditapify/client.go index 545537dc..af88ed25 100644 --- a/internal/jobs/redditapify/client.go +++ b/internal/jobs/redditapify/client.go @@ -44,6 +44,7 @@ func (args *CommonArgs) ToActorRequest() RedditActorRequest { MaxComments: args.MaxComments, MaxCommunitiesCount: args.MaxCommunities, MaxUserCount: args.MaxUsers, + Proxy: types.ApifyProxy{UseApifyProxy: true, ApifyProxyGroups: []string{"RESIDENTIAL"}}, } } @@ -67,6 +68,7 @@ type RedditActorRequest struct { SearchUsers bool `json:"searchUsers"` SkipUserPosts bool `json:"skipUserPosts"` SkipComments bool `json:"skipComments"` + Proxy types.ApifyProxy `json:"proxy"` } // RedditApifyClient wraps the generic Apify client for Reddit-specific operations