diff --git a/README.md b/README.md index 1eccb680..db859da6 100644 --- a/README.md +++ b/README.md @@ -209,14 +209,16 @@ Transcribes TikTok videos to text. There are four different types of Reddit searches: -- `scrapeurls`: Gets the content of one or more Reddit URLs +- `scrapeurls`: Gets the content of one or more Reddit URLs. These must be post or comment URLs (no communities or users). - `searchposts`: Searches posts and comments - `searchusers`: Searches user profiles - `searchcommunities`: Searches communities **Parameters** (all are optional except where noted) -- `urls` (array of object with `url` and `query` keys, required for `scrapeurls`): Each element contains a Reddit URL to scrape together with the method (which by default will be `"GET"`). +**Note:** Only one of `urls` and `queries` can be provided, depending on the query type. + +- `urls` (array of string, required for `scrapeurls`): Each element contains a Reddit URL to scrape. Only Reddit post and comment URLs are allowed (e.g. `https://reddit.com/r/{subreddit}/comments/...`) - `queries` (array of string, required for all job types except `scrapeurls`): Each element is a string to search for. - `sort` (string) What to order by. Possible values are `"relevance"`, `"hot"`, `"top"`, `"new"`, `"rising"` and `"comments"`. - `include_nsfw` (boolean): Whether to include content tagged NSFW. Default is `false`. 
@@ -240,13 +242,7 @@ There are four different types of Reddit searches: "arguments": { "type": "scrapeurls", "urls": [ - { - "url": "https://reddit.com/r/ArtificialIntelligence", - "method": "GET" - }, - { - "url": "https://reddit.com/u/TheTelegraph" - } + "https://reddit.com/r/ArtificialIntelligence/comments/1n1dwzv/what_math_should_i_focus_on_for_ai_and_why/" ], "sort": "new", "include_nsfw": true, diff --git a/api/types/reddit/reddit.go b/api/types/reddit/reddit.go index b8c4fcb8..568e6e80 100644 --- a/api/types/reddit/reddit.go +++ b/api/types/reddit/reddit.go @@ -4,47 +4,8 @@ import ( "encoding/json" "fmt" "time" - - "github.com/masa-finance/tee-types/pkg/util" -) - -type QueryType string - -const ( - ScrapeUrls QueryType = "scrapeurls" - SearchPosts QueryType = "searchposts" - SearchUsers QueryType = "searchusers" - SearchCommunities QueryType = "searchcommunities" -) - -var AllQueryTypes = util.NewSet(ScrapeUrls, SearchPosts, SearchUsers, SearchCommunities) - -type SortType string - -const ( - SortRelevance SortType = "relevance" - SortHot SortType = "hot" - SortTop SortType = "top" - SortNew SortType = "new" - SortRising SortType = "rising" - SortComments SortType = "comments" ) -var AllSortTypes = util.NewSet( - SortRelevance, - SortHot, - SortTop, - SortNew, - SortRising, - SortComments, -) - -// StartURL represents a single start URL for the Apify Reddit scraper. 
-type StartURL struct { - URL string `json:"url"` - Method string `json:"method"` -} - type ResponseType string const ( diff --git a/go.mod b/go.mod index a899daea..9cd9facf 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/joho/godotenv v1.5.1 github.com/labstack/echo-contrib v0.17.4 github.com/labstack/echo/v4 v4.13.4 - github.com/masa-finance/tee-types v1.1.10 + github.com/masa-finance/tee-types v1.1.12 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.38.0 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 5c3c5310..137693b8 100644 --- a/go.sum +++ b/go.sum @@ -57,8 +57,8 @@ github.com/labstack/echo/v4 v4.13.4 h1:oTZZW+T3s9gAu5L8vmzihV7/lkXGZuITzTQkTEhcX github.com/labstack/echo/v4 v4.13.4/go.mod h1:g63b33BZ5vZzcIUF8AtRH40DrTlXnx4UMC8rBdndmjQ= github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0= github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= -github.com/masa-finance/tee-types v1.1.10 h1:mn/wF84Yg6tXH+JigRwluWaBwHT8SeNMzGVDgclC+08= -github.com/masa-finance/tee-types v1.1.10/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= +github.com/masa-finance/tee-types v1.1.12 h1:SXBYJu76EM5JCujiso6ytBqEoYCvhDe1vvNFIXTmu6w= +github.com/masa-finance/tee-types v1.1.12/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= diff --git a/internal/jobs/reddit.go b/internal/jobs/reddit.go index 285c19f7..8dd58b1c 100644 --- a/internal/jobs/reddit.go +++ b/internal/jobs/reddit.go @@ -76,7 +76,15 @@ func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) { switch redditArgs.QueryType { case teetypes.RedditScrapeUrls: - resp, cursor, err := 
redditClient.ScrapeUrls(j.WorkerID, redditArgs.URLs, redditArgs.After, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) + urls := make([]teetypes.RedditStartURL, 0, len(redditArgs.URLs)) + for _, u := range redditArgs.URLs { + urls = append(urls, teetypes.RedditStartURL{ + URL: u, + Method: "GET", + }) + } + + resp, cursor, err := redditClient.ScrapeUrls(j.WorkerID, urls, redditArgs.After, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) return processRedditResponse(j, resp, cursor, err) case teetypes.RedditSearchUsers: diff --git a/internal/jobs/reddit_test.go b/internal/jobs/reddit_test.go index 3b9349e1..3fd639e8 100644 --- a/internal/jobs/reddit_test.go +++ b/internal/jobs/reddit_test.go @@ -95,14 +95,17 @@ var _ = Describe("RedditScraper", func() { }) It("should call ScrapeUrls for the correct QueryType", func() { + testUrls := []string{ + "https://www.reddit.com/r/HHGTTG/comments/1jynlrz/the_entire_series_after_restaurant_at_the_end_of/", + } job.Arguments = map[string]any{ "type": teetypes.RedditScrapeUrls, - "urls": []teetypes.RedditStartURL{{URL: "https://www.reddit.com/u/zaphod/", Method: "GET"}}, + "urls": testUrls, } mockClient.ScrapeUrlsFunc = func(urls []teetypes.RedditStartURL, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { Expect(urls).To(HaveLen(1)) - Expect(urls[0].URL).To(Equal("https://www.reddit.com/u/zaphod/")) + Expect(urls[0].URL).To(Equal(testUrls[0])) return []*reddit.Response{{TypeSwitch: &reddit.TypeSwitch{Type: reddit.UserResponse}, User: &reddit.User{ID: "user1", DataType: string(reddit.UserResponse)}}}, "next", nil } diff --git a/internal/jobs/tiktok_transcription.go b/internal/jobs/tiktok_transcription.go index ba893a13..aebbde00 100644 --- a/internal/jobs/tiktok_transcription.go +++ b/internal/jobs/tiktok_transcription.go @@ -111,7 +111,7 @@ func (ttt *TikTokTranscriber) ExecuteJob(j types.Job) 
(types.JobResult, error) { } // Type assert to TikTok arguments - tiktokArgs, ok := teeargs.AsTikTokArguments(jobArgs) + tiktokArgs, ok := jobArgs.(*teeargs.TikTokTranscriptionArguments) if !ok { return types.JobResult{Error: "invalid argument type for TikTok job"}, fmt.Errorf("invalid argument type") } diff --git a/internal/jobs/twitter.go b/internal/jobs/twitter.go index cfb59eab..8ffe6e71 100644 --- a/internal/jobs/twitter.go +++ b/internal/jobs/twitter.go @@ -1351,20 +1351,17 @@ func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) { } // Type assert to Twitter arguments - twitterArgs, ok := teeargs.AsTwitterArguments(jobArgs) + args, ok := jobArgs.(*teeargs.TwitterSearchArguments) if !ok { logrus.Errorf("Expected Twitter arguments for job ID %s, type %s", j.UUID, j.Type) return types.JobResult{Error: "invalid argument type for Twitter job"}, fmt.Errorf("invalid argument type") } // Log the capability for debugging - logrus.Debugf("Executing Twitter job ID %s with capability: %s", j.UUID, twitterArgs.GetCapability()) + logrus.Debugf("Executing Twitter job ID %s with capability: %s", j.UUID, args.GetCapability()) strategy := getScrapeStrategy(j.Type) - // Convert to concrete type for direct usage - args := twitterArgs.(*teeargs.TwitterSearchArguments) - jobResult, err := strategy.Execute(j, ts, args) if err != nil { logrus.Errorf("Error executing job ID %s, type %s: %v", j.UUID, j.Type, err) @@ -1378,37 +1375,37 @@ func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) { } switch { - case twitterArgs.IsSingleTweetOperation(): + case args.IsSingleTweetOperation(): var result *teetypes.TweetResult if err := jobResult.Unmarshal(&result); err != nil { logrus.Errorf("Error while unmarshalling single tweet result for job ID %s, type %s: %v", j.UUID, j.Type, err) return types.JobResult{Error: "error unmarshalling single tweet result for final validation"}, err } - case twitterArgs.IsMultipleTweetOperation(): + case 
args.IsMultipleTweetOperation(): var results []*teetypes.TweetResult if err := jobResult.Unmarshal(&results); err != nil { logrus.Errorf("Error while unmarshalling multiple tweet result for job ID %s, type %s: %v", j.UUID, j.Type, err) return types.JobResult{Error: "error unmarshalling multiple tweet result for final validation"}, err } - case twitterArgs.IsSingleProfileOperation(): + case args.IsSingleProfileOperation(): var result *twitterscraper.Profile if err := jobResult.Unmarshal(&result); err != nil { logrus.Errorf("Error while unmarshalling single profile result for job ID %s, type %s: %v", j.UUID, j.Type, err) return types.JobResult{Error: "error unmarshalling single profile result for final validation"}, err } - case twitterArgs.IsMultipleProfileOperation(): + case args.IsMultipleProfileOperation(): var results []*twitterscraper.Profile if err := jobResult.Unmarshal(&results); err != nil { logrus.Errorf("Error while unmarshalling multiple profile result for job ID %s, type %s: %v", j.UUID, j.Type, err) return types.JobResult{Error: "error unmarshalling multiple profile result for final validation"}, err } - case twitterArgs.IsSingleSpaceOperation(): + case args.IsSingleSpaceOperation(): var result *twitterscraper.Space if err := jobResult.Unmarshal(&result); err != nil { logrus.Errorf("Error while unmarshalling single space result for job ID %s, type %s: %v", j.UUID, j.Type, err) return types.JobResult{Error: "error unmarshalling single space result for final validation"}, err } - case twitterArgs.IsTrendsOperation(): + case args.IsTrendsOperation(): var results []string if err := jobResult.Unmarshal(&results); err != nil { logrus.Errorf("Error while unmarshalling trends result for job ID %s, type %s: %v", j.UUID, j.Type, err) diff --git a/internal/jobs/webscraper.go b/internal/jobs/webscraper.go index 7e4d652c..a90fe55d 100644 --- a/internal/jobs/webscraper.go +++ b/internal/jobs/webscraper.go @@ -55,7 +55,7 @@ func (ws *WebScraper) ExecuteJob(j 
types.Job) (types.JobResult, error) { } // Type assert to Web arguments - args, ok := teeargs.AsWebArguments(jobArgs) + args, ok := jobArgs.(*teeargs.WebSearchArguments) if !ok { logrus.Errorf("Expected Web arguments for job ID %s, type %s", j.UUID, j.Type) return types.JobResult{Error: "invalid argument type for Web job"}, nil