Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 5 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,14 +209,16 @@ Transcribes TikTok videos to text.

There are four different types of Reddit searches:

- `scrapeurls`: Gets the content of one or more Reddit URLs
- `scrapeurls`: Gets the content of one or more Reddit URLs. These must be post or comment URLs (no communities or users).
- `searchposts`: Searches posts and comments
- `searchusers`: Searches user profiles
- `searchcommunities`: Searches communities

**Parameters** (all are optional except where noted)

- `urls` (array of object with `url` and `query` keys, required for `scrapeurls`): Each element contains a Reddit URL to scrape together with the method (which by default will be `"GET"`).
**Note:** Only one of `urls` and `queries` can be provided, depending on the query type.

- `urls` (array of string, required for `scrapeurls`): Each element contains a Reddit URL to scrape. Only Reddit post and comment URLs are allowed (e.g. `https://reddit.com/r/<community>/comments/...`).
- `queries` (array of string, required for all job types except `scrapeurls`): Each element is a string to search for.
- `sort` (string): What to order by. Possible values are `"relevance"`, `"hot"`, `"top"`, `"new"`, `"rising"` and `"comments"`.
- `include_nsfw` (boolean): Whether to include content tagged NSFW. Default is `false`.
Expand All @@ -240,13 +242,7 @@ There are four different types of Reddit searches:
"arguments": {
"type": "scrapeurls",
"urls": [
{
"url": "https://reddit.com/r/ArtificialIntelligence",
"method": "GET"
},
{
"url": "https://reddit.com/u/TheTelegraph"
}
"https://reddit.com/r/ArtificialIntelligence/comments/1n1dwzv/what_math_should_i_focus_on_for_ai_and_why/"
],
"sort": "new",
"include_nsfw": true,
Expand Down
39 changes: 0 additions & 39 deletions api/types/reddit/reddit.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,47 +4,8 @@ import (
"encoding/json"
"fmt"
"time"

"github.com/masa-finance/tee-types/pkg/util"
)

type QueryType string

const (
ScrapeUrls QueryType = "scrapeurls"
SearchPosts QueryType = "searchposts"
SearchUsers QueryType = "searchusers"
SearchCommunities QueryType = "searchcommunities"
)

var AllQueryTypes = util.NewSet(ScrapeUrls, SearchPosts, SearchUsers, SearchCommunities)

type SortType string

const (
SortRelevance SortType = "relevance"
SortHot SortType = "hot"
SortTop SortType = "top"
SortNew SortType = "new"
SortRising SortType = "rising"
SortComments SortType = "comments"
)

var AllSortTypes = util.NewSet(
SortRelevance,
SortHot,
SortTop,
SortNew,
SortRising,
SortComments,
)

// StartURL represents a single start URL for the Apify Reddit scraper.
type StartURL struct {
URL string `json:"url"`
Method string `json:"method"`
}

type ResponseType string

const (
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ require (
github.com/joho/godotenv v1.5.1
github.com/labstack/echo-contrib v0.17.4
github.com/labstack/echo/v4 v4.13.4
github.com/masa-finance/tee-types v1.1.10
github.com/masa-finance/tee-types v1.1.12
github.com/onsi/ginkgo/v2 v2.23.4
github.com/onsi/gomega v1.38.0
github.com/sirupsen/logrus v1.9.3
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ github.com/labstack/echo/v4 v4.13.4 h1:oTZZW+T3s9gAu5L8vmzihV7/lkXGZuITzTQkTEhcX
github.com/labstack/echo/v4 v4.13.4/go.mod h1:g63b33BZ5vZzcIUF8AtRH40DrTlXnx4UMC8rBdndmjQ=
github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0=
github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
github.com/masa-finance/tee-types v1.1.10 h1:mn/wF84Yg6tXH+JigRwluWaBwHT8SeNMzGVDgclC+08=
github.com/masa-finance/tee-types v1.1.10/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc=
github.com/masa-finance/tee-types v1.1.12 h1:SXBYJu76EM5JCujiso6ytBqEoYCvhDe1vvNFIXTmu6w=
github.com/masa-finance/tee-types v1.1.12/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc=
github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk=
github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8=
github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
Expand Down
10 changes: 9 additions & 1 deletion internal/jobs/reddit.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,15 @@ func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) {

switch redditArgs.QueryType {
case teetypes.RedditScrapeUrls:
resp, cursor, err := redditClient.ScrapeUrls(j.WorkerID, redditArgs.URLs, redditArgs.After, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults)
urls := make([]teetypes.RedditStartURL, 0, len(redditArgs.URLs))
for _, u := range redditArgs.URLs {
urls = append(urls, teetypes.RedditStartURL{
URL: u,
Method: "GET",
})
}

resp, cursor, err := redditClient.ScrapeUrls(j.WorkerID, urls, redditArgs.After, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults)
return processRedditResponse(j, resp, cursor, err)

case teetypes.RedditSearchUsers:
Expand Down
7 changes: 5 additions & 2 deletions internal/jobs/reddit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,17 @@ var _ = Describe("RedditScraper", func() {
})

It("should call ScrapeUrls for the correct QueryType", func() {
testUrls := []string{
"https://www.reddit.com/r/HHGTTG/comments/1jynlrz/the_entire_series_after_restaurant_at_the_end_of/",
}
job.Arguments = map[string]any{
"type": teetypes.RedditScrapeUrls,
"urls": []teetypes.RedditStartURL{{URL: "https://www.reddit.com/u/zaphod/", Method: "GET"}},
"urls": testUrls,
}

mockClient.ScrapeUrlsFunc = func(urls []teetypes.RedditStartURL, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) {
Expect(urls).To(HaveLen(1))
Expect(urls[0].URL).To(Equal("https://www.reddit.com/u/zaphod/"))
Expect(urls[0].URL).To(Equal(testUrls[0]))
return []*reddit.Response{{TypeSwitch: &reddit.TypeSwitch{Type: reddit.UserResponse}, User: &reddit.User{ID: "user1", DataType: string(reddit.UserResponse)}}}, "next", nil
}

Expand Down
2 changes: 1 addition & 1 deletion internal/jobs/tiktok_transcription.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ func (ttt *TikTokTranscriber) ExecuteJob(j types.Job) (types.JobResult, error) {
}

// Type assert to TikTok arguments
tiktokArgs, ok := teeargs.AsTikTokArguments(jobArgs)
tiktokArgs, ok := jobArgs.(*teeargs.TikTokTranscriptionArguments)
if !ok {
return types.JobResult{Error: "invalid argument type for TikTok job"}, fmt.Errorf("invalid argument type")
}
Expand Down
19 changes: 8 additions & 11 deletions internal/jobs/twitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -1351,20 +1351,17 @@ func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) {
}

// Type assert to Twitter arguments
twitterArgs, ok := teeargs.AsTwitterArguments(jobArgs)
args, ok := jobArgs.(*teeargs.TwitterSearchArguments)
if !ok {
logrus.Errorf("Expected Twitter arguments for job ID %s, type %s", j.UUID, j.Type)
return types.JobResult{Error: "invalid argument type for Twitter job"}, fmt.Errorf("invalid argument type")
}

// Log the capability for debugging
logrus.Debugf("Executing Twitter job ID %s with capability: %s", j.UUID, twitterArgs.GetCapability())
logrus.Debugf("Executing Twitter job ID %s with capability: %s", j.UUID, args.GetCapability())

strategy := getScrapeStrategy(j.Type)

// Convert to concrete type for direct usage
args := twitterArgs.(*teeargs.TwitterSearchArguments)

jobResult, err := strategy.Execute(j, ts, args)
if err != nil {
logrus.Errorf("Error executing job ID %s, type %s: %v", j.UUID, j.Type, err)
Expand All @@ -1378,37 +1375,37 @@ func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) {
}

switch {
case twitterArgs.IsSingleTweetOperation():
case args.IsSingleTweetOperation():
var result *teetypes.TweetResult
if err := jobResult.Unmarshal(&result); err != nil {
logrus.Errorf("Error while unmarshalling single tweet result for job ID %s, type %s: %v", j.UUID, j.Type, err)
return types.JobResult{Error: "error unmarshalling single tweet result for final validation"}, err
}
case twitterArgs.IsMultipleTweetOperation():
case args.IsMultipleTweetOperation():
var results []*teetypes.TweetResult
if err := jobResult.Unmarshal(&results); err != nil {
logrus.Errorf("Error while unmarshalling multiple tweet result for job ID %s, type %s: %v", j.UUID, j.Type, err)
return types.JobResult{Error: "error unmarshalling multiple tweet result for final validation"}, err
}
case twitterArgs.IsSingleProfileOperation():
case args.IsSingleProfileOperation():
var result *twitterscraper.Profile
if err := jobResult.Unmarshal(&result); err != nil {
logrus.Errorf("Error while unmarshalling single profile result for job ID %s, type %s: %v", j.UUID, j.Type, err)
return types.JobResult{Error: "error unmarshalling single profile result for final validation"}, err
}
case twitterArgs.IsMultipleProfileOperation():
case args.IsMultipleProfileOperation():
var results []*twitterscraper.Profile
if err := jobResult.Unmarshal(&results); err != nil {
logrus.Errorf("Error while unmarshalling multiple profile result for job ID %s, type %s: %v", j.UUID, j.Type, err)
return types.JobResult{Error: "error unmarshalling multiple profile result for final validation"}, err
}
case twitterArgs.IsSingleSpaceOperation():
case args.IsSingleSpaceOperation():
var result *twitterscraper.Space
if err := jobResult.Unmarshal(&result); err != nil {
logrus.Errorf("Error while unmarshalling single space result for job ID %s, type %s: %v", j.UUID, j.Type, err)
return types.JobResult{Error: "error unmarshalling single space result for final validation"}, err
}
case twitterArgs.IsTrendsOperation():
case args.IsTrendsOperation():
var results []string
if err := jobResult.Unmarshal(&results); err != nil {
logrus.Errorf("Error while unmarshalling trends result for job ID %s, type %s: %v", j.UUID, j.Type, err)
Expand Down
2 changes: 1 addition & 1 deletion internal/jobs/webscraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func (ws *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) {
}

// Type assert to Web arguments
args, ok := teeargs.AsWebArguments(jobArgs)
args, ok := jobArgs.(*teeargs.WebSearchArguments)
if !ok {
logrus.Errorf("Expected Web arguments for job ID %s, type %s", j.UUID, j.Type)
return types.JobResult{Error: "invalid argument type for Web job"}, nil
Expand Down
Loading