From 8e2207f54b12cc6a78d0171c9d2db66409fc389e Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Tue, 13 May 2025 14:33:46 -0400
Subject: [PATCH 001/136] first commit

---
 .idea/.gitignore       |   8 +++
 .idea/modules.xml      |   8 +++
 .idea/tee-types.iml    |   9 +++
 .idea/vcs.xml          |   4 ++
 README.md              |  46 ++++++++++++++
 api/types/encrypted.go |  12 ++++
 api/types/job.go       | 101 +++++++++++++++++++++++++++++++
 api/types/key.go       |  13 ++++
 crypto/keyring.go      |  60 ++++++++++++++++++
 go.mod                 |   5 ++
 go.sum                 |   2 +
 job/telemetry.go       |  14 +++++
 job/twitter.go         |  51 ++++++++++++++++
 job/types.go           | 134 +++++++++++++++++++++++++++++++++++++++++
 job/web.go             |  16 +++++
 jobs/telemetry.go      |  14 +++++
 jobs/twitter.go        | 121 +++++++++++++++++++++++++++++++++++++
 jobs/webscraper.go     |  30 +++++++++
 types/crypto.go        |  13 ++++
 types/job.go           |  38 ++++++++++++
 20 files changed, 699 insertions(+)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/tee-types.iml
 create mode 100644 .idea/vcs.xml
 create mode 100644 README.md
 create mode 100644 api/types/encrypted.go
 create mode 100644 api/types/job.go
 create mode 100644 api/types/key.go
 create mode 100644 crypto/keyring.go
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 job/telemetry.go
 create mode 100644 job/twitter.go
 create mode 100644 job/types.go
 create mode 100644 jobs/telemetry.go
 create mode 100644 jobs/twitter.go
 create mode 100644 jobs/webscraper.go
 create mode 100644 types/crypto.go
 create mode 100644 types/job.go

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..66531a7
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/tee-types.iml b/.idea/tee-types.iml
new file mode 100644
index 0000000..5e764c4
--- /dev/null
+++ b/.idea/tee-types.iml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..d843f34
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..eb0982d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,46 @@
+# tee-types
+
+A shared type definitions package for Masa Finance TEE projects.
+
+## Minimal Sharing Approach
+
+This package follows a minimalist approach, sharing only the essential types needed for the interface between tee-worker and tee-indexer. This reduces coupling between the services while ensuring consistent communication.
+
+Each service should implement its own internal types that extend or build upon these shared types as needed.
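+
+For instance, a worker that needs scheduling metadata could embed the shared `Job` type in a private struct of its own. The sketch below is hypothetical (the wrapper and its fields are not part of this package):
+
+```go
+// internalJob wraps the shared Job with worker-local bookkeeping.
+type internalJob struct {
+    types.Job            // shared wire format, unchanged
+    ReceivedAt time.Time // local metadata, never sent back to the indexer
+}
+```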
+
+## Structure
+
+*WIP*
+
+## Usage
+
+To use this package in your project, add it as a dependency:
+
+```bash
+go get github.com/masa-finance/tee-types
+```
+
+Then import the required packages:
+
+```go
+import "github.com/masa-finance/tee-types/types"
+```
+
+## Types Included
+
+### Core Types (`types/job.go`)
+
+- `Job`: Represents a task to be executed by a worker
+- `JobResult`: Represents the result of executing a job
+- `JobArguments`: Map type for job arguments
+- `JobRequest`: Represents a request to execute a job
+- Job type constants for common job types (Web, Twitter)
+
+### Cryptographic Types (`types/crypto.go`)
+
+- `EncryptedRequest`: For secure job requests
+- `Key`: Basic key representation
+
+## Contributing
+
+When adding new features to tee-worker or tee-indexer, consider whether the types should be added to this shared package first.
\ No newline at end of file
diff --git a/api/types/encrypted.go b/api/types/encrypted.go
new file mode 100644
index 0000000..28dfe71
--- /dev/null
+++ b/api/types/encrypted.go
@@ -0,0 +1,12 @@
+package types
+
+// EncryptedRequest represents an encrypted request and result
+type EncryptedRequest struct {
+    EncryptedResult  string `json:"encrypted_result"`
+    EncryptedRequest string `json:"encrypted_request"`
+}
+
+// JobError represents an error response for a job
+type JobError struct {
+    Error string `json:"error"`
+}
diff --git a/api/types/job.go b/api/types/job.go
new file mode 100644
index 0000000..d98ece9
--- /dev/null
+++ b/api/types/job.go
@@ -0,0 +1,101 @@
+// Package types defines common type definitions used across tee services
+package types
+
+import (
+    "crypto/sha256"
+    "encoding/json"
+    "fmt"
+
+    "golang.org/x/exp/rand"
+)
+
+// JobArguments represents arguments passed to a job
+type JobArguments map[string]interface{}
+
+// Unmarshal unmarshals job arguments into the supplied interface
+func (ja JobArguments) Unmarshal(i interface{}) error {
+    dat, err := json.Marshal(ja)
+    if err != nil {
+        return err
+    }
+    return json.Unmarshal(dat, i)
+}
+
+// Job represents a task to be executed by a worker
+type Job struct {
+    Type      string       `json:"type"`
+    Arguments JobArguments `json:"arguments"`
+    UUID      string       `json:"-"`
+    Nonce     string       `json:"quote"`
+    WorkerID  string       `json:"worker_id"`
+}
+
+var letterRunes = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+")
+
+func randStringRunes(n int) string {
+    b := make([]rune, n)
+    for i := range b {
+        b[i] = letterRunes[rand.Intn(len(letterRunes))]
+    }
+    return string(b)
+}
+
+// GenerateJobSignature generates a signature for the job.
+// Note: This method will need to be adjusted when used in actual implementations
+// to use the appropriate sealing mechanism.
+func (job *Job) GenerateJobSignature() (string, error) {
+    dat, err := json.Marshal(job)
+    if err != nil {
+        return "", err
+    }
+
+    checksum := sha256.New()
+    checksum.Write(dat)
+
+    job.Nonce = fmt.Sprintf("%s-%s", string(checksum.Sum(nil)), randStringRunes(99))
+
+    return job.Nonce, nil
+}
+
+// JobResponse represents a response to a job submission
+type JobResponse struct {
+    UID string `json:"uid"`
+}
+
+// JobResult represents the result of executing a job
+type JobResult struct {
+    Error      string `json:"error"`
+    Data       []byte `json:"data"`
+    Job        Job    `json:"job"`
+    NextCursor string `json:"next_cursor"`
+}
+
+// Success returns true if the job was successful.
+func (jr JobResult) Success() bool {
+    return jr.Error == ""
+}
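+
+// A typical caller checks Success before decoding Data with Unmarshal below.
+// Illustrative sketch only ("MyResult" stands in for whatever concrete type
+// the job actually produced; it is not defined in this package):
+//
+//    var out MyResult
+//    if res.Success() {
+//        if err := res.Unmarshal(&out); err != nil {
+//            // payload did not match the expected shape
+//        }
+//    }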
+
+// Unmarshal unmarshals the job result data.
+func (jr JobResult) Unmarshal(i interface{}) error {
+    return json.Unmarshal(jr.Data, i)
+}
+
+// JobRequest represents a request to execute a job
+type JobRequest struct {
+    EncryptedJob string `json:"encrypted_job"`
+}
+
+// JobConfiguration represents configuration for a job
+type JobConfiguration map[string]interface{}
+
+// Unmarshal unmarshals the job configuration into the supplied interface.
+func (jc JobConfiguration) Unmarshal(v interface{}) error {
+    data, err := json.Marshal(jc)
+    if err != nil {
+        return fmt.Errorf("error marshalling job configuration: %w", err)
+    }
+    if err := json.Unmarshal(data, v); err != nil {
+        return fmt.Errorf("error unmarshalling job configuration: %w", err)
+    }
+
+    return nil
+}
diff --git a/api/types/key.go b/api/types/key.go
new file mode 100644
index 0000000..3a9616e
--- /dev/null
+++ b/api/types/key.go
@@ -0,0 +1,13 @@
+package types
+
+// Key represents an encryption key
+type Key struct {
+    Key string `json:"key"`
+    ID  string `json:"id"`
+}
+
+// KeyResponse represents a response when requesting a key
+type KeyResponse struct {
+    Key string `json:"key"`
+    ID  string `json:"id"`
+}
diff --git a/crypto/keyring.go b/crypto/keyring.go
new file mode 100644
index 0000000..a698a6c
--- /dev/null
+++ b/crypto/keyring.go
@@ -0,0 +1,60 @@
+// Package crypto contains cryptographic interfaces and helpers for secure operations
+package crypto
+
+// KeyType defines the type of a key in the keyring
+type KeyType string
+
+const (
+    // SealingKeyType is used for sealing operations
+    SealingKeyType KeyType = "sealing"
+
+    // EncryptionKeyType is used for encryption operations
+    EncryptionKeyType KeyType = "encryption"
+
+    // SigningKeyType is used for signing operations
+    SigningKeyType KeyType = "signing"
+)
+
+// Key represents a cryptographic key in the keyring
+type Key struct {
+    ID   string  `json:"id"`
+    Type KeyType `json:"type"`
+    Data []byte  `json:"data"`
+}
+
+// KeyRing defines the interface for key management.
+// This follows the modern approach of using a KeyRing instead of single keys.
+type KeyRing interface {
+    // AddKey adds a key to the keyring
+    AddKey(key Key) error
+
+    // GetKey retrieves a key from the keyring by ID
+    GetKey(id string) (Key, error)
+
+    // GetKeysByType retrieves all keys of a specific type
+    GetKeysByType(keyType KeyType) ([]Key, error)
+
+    // RemoveKey removes a key from the keyring by ID
+    RemoveKey(id string) error
+
+    // Seal seals data using the keyring
+    Seal(data []byte) ([]byte, error)
+
+    // Unseal unseals data using the keyring
+    Unseal(data []byte) ([]byte, error)
+
+    // SealWithKey seals data using a specific key from the keyring
+    SealWithKey(keyID string, data []byte) ([]byte, error)
+
+    // UnsealWithKey unseals data using a specific key from the keyring
+    UnsealWithKey(keyID string, data []byte) ([]byte, error)
+}
+
+// TEEOptions defines options for TEE operations
+type TEEOptions struct {
+    // IsTestEnvironment indicates if this is a test environment
+    IsTestEnvironment bool
+
+    // KeysDirectory specifies the directory for storing keys
+    KeysDirectory string
+}
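+
+// Implementations are expected to make Seal and Unseal a round trip. A caller
+// might use them as follows (illustrative sketch only; "kr" is any KeyRing
+// implementation, which this package does not provide):
+//
+//    sealed, err := kr.Seal(payload)
+//    if err != nil {
+//        return err
+//    }
+//    plain, err := kr.Unseal(sealed)
+//    // on success, plain equals the original payload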
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..4cb1723
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,5 @@
+module github.com/masa-finance/tee-types
+
+go 1.24.0
+
+require golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..2fbef45
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 h1:y5zboxd6LQAqYIhHnB48p0ByQ/GnQx2BE33L8BOHQkI=
+golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6/go.mod h1:U6Lno4MTRCDY+Ba7aCcauB9T60gsv5s4ralQzP72ZoQ=
diff --git a/job/telemetry.go b/job/telemetry.go
new file mode 100644
index 0000000..f2fc5e7
--- /dev/null
+++ b/job/telemetry.go
@@ -0,0 +1,14 @@
+package job
+
+// TelemetryJobType represents the job type for telemetry operations
+const TelemetryJobType = "telemetry"
+
+// TelemetryConfiguration defines configuration for telemetry jobs
+type TelemetryConfiguration struct {
+    StatsInterval int `json:"stats_interval"`
+}
+
+// TelemetryResult represents the result of a telemetry operation
+type TelemetryResult struct {
+    Stats map[string]uint `json:"stats"`
+}
diff --git a/job/twitter.go b/job/twitter.go
new file mode 100644
index 0000000..ca0d5be
--- /dev/null
+++ b/job/twitter.go
@@ -0,0 +1,51 @@
+package job
+
+// TwitterJobTypes defines the various types of Twitter jobs
+const (
+    // TwitterScraperType represents standard Twitter scraping jobs
+    TwitterScraperType = "twitter"
+
+    // TwitterCredentialScraperType represents Twitter scraping jobs using credentials
+    TwitterCredentialScraperType = "twitter-credential"
+
+    // TwitterApiScraperType represents Twitter scraping jobs using API keys
+    TwitterApiScraperType = "twitter-api"
+)
+
+// TwitterKeyAuthType defines the type of authentication for Twitter API keys
+type TwitterKeyAuthType string
+
+// TwitterKeyAuthType constants
+const (
+    CredentialAuthType TwitterKeyAuthType = "credential"
+    ApiKeyAuthType     TwitterKeyAuthType = "apikey"
+    UnknownAuthType    TwitterKeyAuthType = "unknown"
+)
+
+// TwitterApiKeyType defines the type of Twitter API key
+type TwitterApiKeyType string
+
+// TwitterApiKeyType constants
+const (
+    TwitterApiKeyTypeBase       TwitterApiKeyType = "base"
+    TwitterApiKeyTypeElevated   TwitterApiKeyType = "elevated"
+    TwitterApiKeyTypeCredential TwitterApiKeyType = "credential"
+    TwitterApiKeyTypeUnknown    TwitterApiKeyType = "unknown"
+)
+
+// TwitterScraperConfiguration defines configuration for Twitter scraping
+type TwitterScraperConfiguration struct {
+    Accounts              []string `json:"twitter_accounts"`
+    ApiKeys               []string `json:"twitter_api_keys"`
+    DataDir               string   `json:"data_dir"`
+    SkipLoginVerification bool     `json:"skip_login_verification,omitempty"` // If true, skips Twitter's verify_credentials check
+}
+
+// TwitterScraperArgs defines arguments for Twitter scraping jobs
+type TwitterScraperArgs struct {
+    SearchType string `json:"type"`
+    Query      string `json:"query"`
+    Count      int    `json:"count"`
+    MaxResults int    `json:"max_results"`
+    NextCursor string `json:"next_cursor"`
+}
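+
+// A TwitterScraperArgs value is normally decoded from a Job's arguments map.
+// A matching JSON payload might look like the following (hypothetical values;
+// the search type string is illustrative, not a defined constant):
+//
+//    {
+//      "type": "searchbyquery",
+//      "query": "from:getmasafi",
+//      "count": 10,
+//      "max_results": 100
+//    }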
diff --git a/job/types.go b/job/types.go
new file mode 100644
index 0000000..09c4b60
--- /dev/null
+++ b/job/types.go
@@ -0,0 +1,134 @@
+// Package job contains the core job types shared across TEE services
+package job
+
+import (
+    "crypto/sha256"
+    "encoding/json"
+    "fmt"
+
+    "golang.org/x/exp/rand"
+)
+
+// Version information
+const (
+    VersionMajor = 0
+    VersionMinor = 1
+    VersionPatch = 0
+)
+
+// GetVersion returns the semantic version string
+func GetVersion() string {
+    return fmt.Sprintf("%d.%d.%d", VersionMajor, VersionMinor, VersionPatch)
+}
+
+// Arguments represents arguments passed to a job
+type Arguments map[string]interface{}
+
+// Unmarshal unmarshals job arguments into the supplied interface
+func (ja Arguments) Unmarshal(i interface{}) error {
+    dat, err := json.Marshal(ja)
+    if err != nil {
+        return err
+    }
+    return json.Unmarshal(dat, i)
+}
+
+// Job represents a task to be executed by a worker
+type Job struct {
+    Type      string    `json:"type"`
+    Arguments Arguments `json:"arguments"`
+    UUID      string    `json:"-"`
+    Nonce     string    `json:"quote"`
+    WorkerID  string    `json:"worker_id"`
+}
+
+var letterRunes = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+")
+
+func randStringRunes(n int) string {
+    b := make([]rune, n)
+    for i := range b {
+        b[i] = letterRunes[rand.Intn(len(letterRunes))]
+    }
+    return string(b)
+}
+
+// GenerateJobSignature generates a signature for the job.
+// Note: This method is a placeholder. Each service will need to implement
+// its own version with the appropriate sealing mechanism.
+func (job *Job) GenerateJobSignature() (string, error) {
+    dat, err := json.Marshal(job)
+    if err != nil {
+        return "", err
+    }
+
+    checksum := sha256.New()
+    checksum.Write(dat)
+
+    job.Nonce = fmt.Sprintf("%s-%s", string(checksum.Sum(nil)), randStringRunes(99))
+
+    return job.Nonce, nil
+}
+
+// Response represents a response to a job submission
+type Response struct {
+    UID string `json:"uid"`
+}
+
+// Result represents the result of executing a job
+type Result struct {
+    Error      string `json:"error"`
+    Data       []byte `json:"data"`
+    Job        Job    `json:"job"`
+    NextCursor string `json:"next_cursor"`
+}
+
+// Success returns true if the job was successful.
+func (jr Result) Success() bool {
+    return jr.Error == ""
+}
+
+// Unmarshal unmarshals the job result data.
+func (jr Result) Unmarshal(i interface{}) error {
+    return json.Unmarshal(jr.Data, i)
+}
+
+// Request represents a request to execute a job
+type Request struct {
+    EncryptedJob string `json:"encrypted_job"`
+}
+
+// Configuration represents configuration for a job
+type Configuration map[string]interface{}
+
+// Unmarshal unmarshals the job configuration into the supplied interface.
+func (jc Configuration) Unmarshal(v interface{}) error {
+    data, err := json.Marshal(jc)
+    if err != nil {
+        return fmt.Errorf("error marshalling job configuration: %w", err)
+    }
+    if err := json.Unmarshal(data, v); err != nil {
+        return fmt.Errorf("error unmarshalling job configuration: %w", err)
+    }
+
+    return nil
+}
+
+// Parameters defines the base interface for job parameters
+type Parameters interface {
+    // GetIdentifier returns a unique identifier for the job parameters
+    GetIdentifier() string
+}
+
+// Status defines the base interface for job status
+type Status interface {
+    // GetError returns the job error
+    GetError() string
+
+    // SetError sets the job error
+    SetError(err string)
+
+    // GetStatus returns the job status
+    GetStatus() string
+
+    // SetStatus sets the job status
+    SetStatus(status string)
+}
diff --git a/job/web.go b/job/web.go
new file mode 100644
index 0000000..6fd1445
--- /dev/null
+++ b/job/web.go
@@ -0,0 +1,16 @@
+package job
+
+// WebScraperType represents the job type for web scraping
+const WebScraperType = "webscraper"
+
+// WebScraperConfiguration defines configuration for web scraping
+type WebScraperConfiguration struct {
+    Blacklist []string `json:"blacklist"`
+}
+
+// WebScraperArgs defines arguments for web scraping jobs
+type WebScraperArgs struct {
+    URL      string `json:"url"`
+    Selector string `json:"selector"`
+    MaxDepth int    `json:"max_depth"`
+}
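+
+// WebScraperArgs is usually filled from a Job's Arguments via its Unmarshal
+// helper. Illustrative sketch only (assumes j is a Job of type WebScraperType):
+//
+//    var args WebScraperArgs
+//    if err := j.Arguments.Unmarshal(&args); err != nil {
+//        return err
+//    }
+//    // args.URL, args.Selector and args.MaxDepth are now populated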
diff --git a/jobs/telemetry.go b/jobs/telemetry.go
new file mode 100644
index 0000000..55c249c
--- /dev/null
+++ b/jobs/telemetry.go
@@ -0,0 +1,14 @@
+package jobs
+
+// TelemetryJobType represents the job type for telemetry operations
+const TelemetryJobType = "telemetry"
+
+// TelemetryConfiguration defines configuration for telemetry jobs
+type TelemetryConfiguration struct {
+    StatsInterval int `json:"stats_interval"`
+}
+
+// TelemetryResult represents the result of a telemetry operation
+type TelemetryResult struct {
+    Stats map[string]uint `json:"stats"`
+}
diff --git a/jobs/twitter.go b/jobs/twitter.go
new file mode 100644
index 0000000..102f999
--- /dev/null
+++ b/jobs/twitter.go
@@ -0,0 +1,121 @@
+package jobs
+
+import (
+    "time"
+)
+
+// TwitterJobTypes defines the various types of Twitter jobs
+const (
+    // TwitterScraperType represents standard Twitter scraping jobs
+    TwitterScraperType = "twitter"
+
+    // TwitterCredentialScraperType represents Twitter scraping jobs using credentials
+    TwitterCredentialScraperType = "twitter-credential"
+
+    // TwitterApiScraperType represents Twitter scraping jobs using API keys
+    TwitterApiScraperType = "twitter-api"
+)
+
+// TwitterKeyAuthType defines the type of authentication for Twitter API keys
+type TwitterKeyAuthType string
+
+// TwitterKeyAuthType constants
+const (
+    CredentialAuthType TwitterKeyAuthType = "credential"
+    ApiKeyAuthType     TwitterKeyAuthType = "apikey"
+    UnknownAuthType    TwitterKeyAuthType = "unknown"
+)
+
+// TwitterApiKeyType defines the type of Twitter API key
+type TwitterApiKeyType string
+
+// TwitterApiKeyType constants
+const (
+    TwitterApiKeyTypeBase       TwitterApiKeyType = "base"
+    TwitterApiKeyTypeElevated   TwitterApiKeyType = "elevated"
+    TwitterApiKeyTypeCredential TwitterApiKeyType = "credential"
+    TwitterApiKeyTypeUnknown    TwitterApiKeyType = "unknown"
+)
+
+// TwitterScraperConfiguration defines configuration for Twitter scraping
+type TwitterScraperConfiguration struct {
+    Accounts              []string `json:"twitter_accounts"`
+    ApiKeys               []string `json:"twitter_api_keys"`
+    DataDir               string   `json:"data_dir"`
+    SkipLoginVerification bool     `json:"skip_login_verification,omitempty"` // If true, skips Twitter's verify_credentials check
+}
+
+// TwitterScraperArgs defines arguments for Twitter scraping jobs
+type TwitterScraperArgs struct {
+    SearchType string `json:"type"`
+    Query      string `json:"query"`
+    Count      int    `json:"count"`
+    MaxResults int    `json:"max_results"`
+    NextCursor string `json:"next_cursor"`
+}
+
+// TweetResult represents a Tweet returned from Twitter
+type TweetResult struct {
+    ID             int64     `json:"id"`
+    TweetID        string    `json:"tweet_id"`
+    ConversationID string    `json:"conversation_id"`
+    UserID         string    `json:"user_id"`
+    Text           string    `json:"text"`
+    CreatedAt      time.Time `json:"created_at"`
+    Timestamp      int64     `json:"timestamp"`
+
+    IsQuoted     bool     `json:"is_quoted"`
+    IsPin        bool     `json:"is_pin"`
+    IsReply      bool     `json:"is_reply"`
+    IsRetweet    bool     `json:"is_retweet"`
+    IsSelfThread bool     `json:"is_self_thread"`
+    Likes        int      `json:"likes"`
+    Hashtags     []string `json:"hashtags"`
+    HTML         string   `json:"html"`
+    Replies      int      `json:"replies"`
+    Retweets     int      `json:"retweets"`
+    URLs         []string `json:"urls"`
+    Username     string   `json:"username"`
+
+    Photos []Photo `json:"photos"`
+    Videos []Video `json:"videos"`
+
+    RetweetedStatusID string `json:"retweeted_status_id"`
+    Views             int    `json:"views"`
+    SensitiveContent  bool   `json:"sensitive_content"`
+
+    // Fields from TwitterX API
+    AuthorID          string        `json:"author_id"`
+    PublicMetrics     PublicMetrics `json:"public_metrics"`
+    PossiblySensitive bool          `json:"possibly_sensitive"`
+    Lang              string        `json:"lang"`
+    NewestID          string        `json:"newest_id"`
+    OldestID          string        `json:"oldest_id"`
+    ResultCount       int           `json:"result_count"`
+
+    Error error `json:"-"`
+}
+
+// PublicMetrics represents public metrics for a Tweet
+type PublicMetrics struct {
+    RetweetCount    int `json:"retweet_count"`
+    ReplyCount      int `json:"reply_count"`
+    LikeCount       int `json:"like_count"`
+    QuoteCount      int `json:"quote_count"`
+    BookmarkCount   int `json:"bookmark_count"`
+    ImpressionCount int `json:"impression_count"`
+}
+
+// Photo represents an image attached to a Tweet
+type Photo struct {
+    ID  string `json:"id"`
+    URL string `json:"url"`
+}
+
+// Video represents a video attached to a Tweet
+type Video struct {
+    ID      string `json:"id"`
+    Preview string `json:"preview"`
+    URL     string `json:"url"`
+    HLSURL  string `json:"hls_url"`
+}
diff --git a/jobs/webscraper.go b/jobs/webscraper.go
new file mode 100644
index 0000000..4131dae
--- /dev/null
+++ b/jobs/webscraper.go
@@ -0,0 +1,30 @@
+package jobs
+
+// WebScraperType represents the job type for web scraping
+const WebScraperType = "webscraper"
+
+// WebScraperConfiguration defines configuration for web scraping
+type WebScraperConfiguration struct {
+    Blacklist []string `json:"blacklist"`
+}
+
+// WebScraperArgs defines arguments for web scraping jobs
+type WebScraperArgs struct {
+    URL      string `json:"url"`
+    Selector string `json:"selector"`
+    MaxDepth int    `json:"max_depth"`
+}
+
+// Section represents a selected section of a web page
+type Section struct {
+    Text     string `json:"text"`
+    HTML     string `json:"html"`
+    Selector string `json:"selector"`
+}
+
+// CollectedData represents data collected from web scraping
+type CollectedData struct {
+    URL      string    `json:"url"`
+    Title    string    `json:"title"`
+    Sections []Section `json:"sections"`
+}
diff --git a/types/crypto.go b/types/crypto.go
new file mode 100644
index 0000000..e60d872
--- /dev/null
+++ b/types/crypto.go
@@ -0,0 +1,13 @@
+package types
+
+// EncryptedRequest represents an encrypted request
+type EncryptedRequest struct {
+    EncryptedResult  string `json:"encrypted_result"`
+    EncryptedRequest string `json:"encrypted_request"`
+}
+
+// Key represents a cryptographic key
+type Key struct {
+    ID   string `json:"id"`
+    Data []byte `json:"data,omitempty"`
+}
diff --git a/types/job.go b/types/job.go
new file mode 100644
index 0000000..934986e
--- /dev/null
+++ b/types/job.go
@@ -0,0 +1,38 @@
+// Package types contains the minimal shared type definitions for tee-worker and tee-indexer
+package types
+
+// No imports needed
+
+// JobArguments represents arguments passed to a job
+type JobArguments map[string]interface{}
+
+// Job represents a task to be executed by a worker
+type Job struct {
+    Type      string       `json:"type"`
+    Arguments JobArguments `json:"arguments"`
+    UUID      string       `json:"-"`
+    Nonce     string       `json:"quote"`
+    WorkerID  string       `json:"worker_id"`
+}
+
+// JobResult represents the result of executing a job
+type JobResult struct {
+    Error      string `json:"error"`
+    Data       []byte `json:"data"`
+    Job        Job    `json:"job"`
+    NextCursor string `json:"next_cursor"`
+}
+
+// JobRequest represents a request to execute a job
+type JobRequest struct {
+    EncryptedJob string `json:"encrypted_job"`
+}
+
+// Common job type constants
+const (
+    // WebScraperType represents the job type for web scraping
+    WebScraperType = "webscraper"
+
+    // TwitterScraperType represents standard Twitter scraping jobs
+    TwitterScraperType = "twitter"
+)
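+
+// A minimal job constructed against these shared types might look like this
+// (illustrative sketch; UUID and Nonce handling belong to the services):
+//
+//    job := Job{
+//        Type:      WebScraperType,
+//        Arguments: JobArguments{"url": "https://example.com"},
+//    }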

From 965e75348e836ab204f29e893d05da36f2aac7d6 Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Wed, 14 May 2025 00:55:55 -0400
Subject: [PATCH 002/136] add twitter search args

---
 .idea/vcs.xml    |  4 +++-
 README.md        | 39 +++++++++++++++++++++++++++++++++++++--
 types/job.go     | 13 ++++++++++++-
 types/twitter.go | 12 ++++++++++++
 4 files changed, 64 insertions(+), 4 deletions(-)
 create mode 100644 types/twitter.go

diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index d843f34..94a25f7 100644
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -1,4 +1,6 @@
-
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index eb0982d..0cce332 100644
--- a/README.md
+++ b/README.md
@@ -31,10 +31,45 @@ import "github.com/masa-finance/tee-types/types"
 
 ### Core Types (`types/job.go`)
 
 - `Job`: Represents a task to be executed by a worker
+  ```go
+  type Job struct {
+      Type      string       `json:"type"`
+      Arguments JobArguments `json:"arguments"`
+      UUID      string       `json:"-"`
+      Nonce     string       `json:"quote"`
+      WorkerID  string       `json:"worker_id"`
+  }
+  ```
+
+- `JobArguments`: Map type for job arguments with unmarshal utility
+  ```go
+  type JobArguments map[string]interface{}
+  ```
+
 - `JobResult`: Represents the result of executing a job
 - `JobRequest`: Represents a request to execute a job
-- Job type constants for common job types (Web, Twitter)
+- Common job type constants (Web, Twitter)
+
+### Twitter Types (`types/twitter.go`)
+
+- `TwitterSearchParams`: Parameters for Twitter searches
+  ```go
+  type TwitterSearchParams struct {
+      ScraperType string `json:"type"`           // Type of search
+      TwitterSearchArguments `json:"arguments"`  // Search arguments
+  }
+  ```
+
+- `TwitterSearchArguments`: Arguments for Twitter searches
+  ```go
+  type TwitterSearchArguments struct {
+      Query      string `json:"query"`       // Username or search query
+      QueryType  string `json:"type"`        // Optional, type of search
+      StartTime  string `json:"start_time"`  // Optional ISO timestamp
+      EndTime    string `json:"end_time"`    // Optional ISO timestamp
+      MaxResults int    `json:"max_results"` // Optional, max number of results
+  }
+  ```
 
 ### Cryptographic Types (`types/crypto.go`)
 
diff --git a/types/job.go b/types/job.go
index 934986e..0f54fb0 100644
--- a/types/job.go
+++ b/types/job.go
@@ -1,11 +1,22 @@
 // Package types contains the minimal shared type definitions for tee-worker and tee-indexer
 package types
 
-// No imports needed
+import (
+    "encoding/json"
+)
 
 // JobArguments represents arguments passed to a job
 type JobArguments map[string]interface{}
 
+// Unmarshal unmarshals job arguments into the supplied interface
+func (ja JobArguments) Unmarshal(i interface{}) error {
+    dat, err := json.Marshal(ja)
+    if err != nil {
+        return err
+    }
+    return json.Unmarshal(dat, i)
+}
+
 // Job represents a task to be executed by a worker
 type Job struct {
     Type      string       `json:"type"`
diff --git a/types/twitter.go b/types/twitter.go
new file mode 100644
index 0000000..94c16d6
--- /dev/null
+++ b/types/twitter.go
@@ -0,0 +1,12 @@
+// Package types provides shared types between tee-worker and tee-indexer
+package types
+
+// TwitterSearchArguments defines arguments for Twitter searches
+type TwitterSearchArguments struct {
+    Query      string `json:"query"`       // Username or search query
+    QueryType  string `json:"type"`        // Optional, type of search
+    StartTime  string `json:"start_time"`  // Optional ISO timestamp
+    EndTime    string `json:"end_time"`    // Optional ISO timestamp
+    MaxResults int    `json:"max_results"` // Optional, max number of results
+    NextCursor string `json:"next_cursor"` // Optional, cursor for pagination
+}
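+
+// A search request destined for this struct might be encoded as the following
+// JSON (hypothetical values; the "type" string is illustrative):
+//
+//    {
+//      "query": "masa",
+//      "type": "searchbyquery",
+//      "start_time": "2025-05-01T00:00:00Z",
+//      "end_time": "2025-05-14T00:00:00Z",
+//      "max_results": 50
+//    }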

From c2b4364b7505ba67c85b2eecca392bcbf418a29a Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Wed, 14 May 2025 01:02:22 -0400
Subject: [PATCH 003/136] add twitter search fields

---
 types/twitter.go | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/types/twitter.go b/types/twitter.go
index 94c16d6..8eb84f8 100644
--- a/types/twitter.go
+++ b/types/twitter.go
@@ -3,10 +3,11 @@ package types
 
 // TwitterSearchArguments defines arguments for Twitter searches
 type TwitterSearchArguments struct {
-    Query      string `json:"query"`       // Username or search query
-    QueryType  string `json:"type"`        // Optional, type of search
+    QueryType  string `json:"type"`        // Optional, type of search
+    Query      string `json:"query"`       // Username or search query
+    Count      int    `json:"count"`
     StartTime  string `json:"start_time"`  // Optional ISO timestamp
     EndTime    string `json:"end_time"`    // Optional ISO timestamp
     MaxResults int    `json:"max_results"` // Optional, max number of results
-    NextCursor string `json:"next_cursor"` // Optional, cursor for pagination
+    NextCursor string `json:"next_cursor"`
 }

From 04223dd2962fe5a42f05501c3c2d94c8abe27aec Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Wed, 14 May 2025 01:09:02 -0400
Subject: [PATCH 004/136] add web args

---
 job/web.go   | 16 ----------------
 types/web.go |  7 +++++++
 2 files changed, 7 insertions(+), 16 deletions(-)
 delete mode 100644 job/web.go
 create mode 100644 types/web.go

diff --git a/job/web.go b/job/web.go
deleted file mode 100644
index 6fd1445..0000000
--- a/job/web.go
+++ /dev/null
@@ -1,16 +0,0 @@
-package job
-
-// WebScraperType represents the job type for web scraping
-const WebScraperType = "webscraper"
-
-// WebScraperConfiguration defines configuration for web scraping
-type WebScraperConfiguration struct {
-    Blacklist []string `json:"blacklist"`
-}
-
-// WebScraperArgs defines arguments for web scraping jobs
-type WebScraperArgs struct {
-    URL      string `json:"url"`
-    Selector string `json:"selector"`
-    MaxDepth int    `json:"max_depth"`
-}
diff --git a/types/web.go b/types/web.go
new file mode 100644
index 0000000..cc4a1e7
--- /dev/null
+++ b/types/web.go
@@ -0,0 +1,7 @@
+package types
+
+type WebSearchArguments struct {
+    URL      string `json:"url"`
+    Selector string `json:"selector"`
+    MaxDepth int    `json:"max_depth"`
+}

From 51a9969f19cc1cdd4d1b2a95433e7d0a1d619eed Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Wed, 14 May 2025 01:12:17 -0400
Subject: [PATCH 005/136] add depth on web

---
 jobs/webscraper.go | 1 +
 types/web.go       | 1 +
 2 files changed, 2 insertions(+)

diff --git a/jobs/webscraper.go b/jobs/webscraper.go
index 4131dae..7c5029d 100644
--- a/jobs/webscraper.go
+++ b/jobs/webscraper.go
@@ -12,6 +12,7 @@ type WebScraperConfiguration struct {
 type WebScraperArgs struct {
     URL      string `json:"url"`
     Selector string `json:"selector"`
+    Depth    int    `json:"depth"`
     MaxDepth int    `json:"max_depth"`
 }
 
diff --git a/types/web.go b/types/web.go
index cc4a1e7..8015302 100644
--- a/types/web.go
+++ b/types/web.go
@@ -3,5 +3,6 @@
 type WebSearchArguments struct {
     URL      string `json:"url"`
     Selector string `json:"selector"`
+    Depth    int    `json:"depth"`
     MaxDepth int    `json:"max_depth"`
 }

From e6b98f2fef3359d2cfec4596f5eb625ee80ba2de Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Wed, 14 May 2025 01:23:43 -0400
Subject: [PATCH 006/136] add twitter result structs

---
 README.md        | 56 +-------------------------------
 types/twitter.go | 71 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index 0cce332..df7476c 100644
--- a/README.md
+++ b/README.md
@@ -24,58 +24,4 @@ Then import the required packages:
 
 ```go
 import "github.com/masa-finance/tee-types/types"
-```
-
-## Types Included
-
-### Core Types (`types/job.go`)
-
-- `Job`: Represents a task to be executed by a worker
-  ```go
-  type Job struct {
-      Type      string       `json:"type"`
-      Arguments JobArguments `json:"arguments"`
-      UUID      string       `json:"-"`
-      Nonce     string       `json:"quote"`
-      WorkerID  string       `json:"worker_id"`
-  }
-  ```
-
-- `JobArguments`: Map type for job arguments with unmarshal utility
-  ```go
-  type JobArguments map[string]interface{}
-  ```
-
-- `JobResult`: Represents the result of executing a job
-- `JobRequest`: Represents a request to execute a job
-- Common job type constants (Web, Twitter)
-
-### Twitter Types (`types/twitter.go`)
-
-- `TwitterSearchParams`: Parameters for Twitter searches
-  ```go
-  type TwitterSearchParams struct {
-      ScraperType string `json:"type"`           // Type of search
-      TwitterSearchArguments `json:"arguments"`  // Search arguments
-  }
-  ```
-
-- `TwitterSearchArguments`: Arguments for Twitter searches
-  ```go
-  type TwitterSearchArguments struct {
-      Query      string `json:"query"`       // Username or search query
-      QueryType  string `json:"type"`        // Optional, type of search
-      StartTime  string `json:"start_time"`  // Optional ISO timestamp
-      EndTime    string `json:"end_time"`    // Optional ISO timestamp
-      MaxResults int    `json:"max_results"` // Optional, max number of results
-  }
-  ```
-
-### Cryptographic Types (`types/crypto.go`)
-
-- `EncryptedRequest`: For secure job requests
-- `Key`: Basic key representation
-
-## Contributing
-
-When adding new features to tee-worker or tee-indexer, consider whether the types should be added to this shared package first.
\ No newline at end of file
+```
\ No newline at end of file
diff --git a/types/twitter.go b/types/twitter.go
index 8eb84f8..4569b49 100644
--- a/types/twitter.go
+++ b/types/twitter.go
@@ -1,6 +1,8 @@
 // Package types provides shared types between tee-worker and tee-indexer
 package types
 
+import "time"
+
 // TwitterSearchArguments defines arguments for Twitter searches
 type TwitterSearchArguments struct {
     QueryType string `json:"type"` // Optional, type of search
@@ -11,3 +13,72 @@ type TwitterSearchArguments struct {
     MaxResults int    `json:"max_results"` // Optional, max number of results
     NextCursor string `json:"next_cursor"`
 }
+
+type TweetResult struct {
+    ID             int64 `json:"id"`
+    TweetID        string
+    ConversationID string
+    UserID         string
+    Text           string
+    CreatedAt      time.Time
+    Timestamp      int64
+
+    ThreadCursor struct {
+        FocalTweetID string
+        ThreadID     string
+        Cursor       string
+        CursorType   string
+    }
+    IsQuoted     bool
+    IsPin        bool
+    IsReply      bool
+    IsRetweet    bool
+    IsSelfThread bool
+    Likes        int
+    Hashtags     []string
+    HTML         string
+    Replies      int
+    Retweets     int
+    URLs         []string
+    Username     string
+
+    Photos []Photo
+
+    // Video type.
+    Videos []Video
+
+    RetweetedStatusID string
+    Views             int
+    SensitiveContent  bool
+
+    // from twitterx
+    AuthorID          string
+    PublicMetrics     PublicMetrics
+    PossiblySensitive bool
+    Lang              string
+    NewestID          string
+    OldestID          string
+    ResultCount       int
+
+    Error error
+}
+
+type PublicMetrics struct {
+    RetweetCount    int
+    ReplyCount      int
+    LikeCount       int
+    QuoteCount      int
+    BookmarkCount   int
+    ImpressionCount int
+}
+type Photo struct {
+    ID  string
+    URL string
+}
+
+type Video struct {
+    ID      string
+    Preview string
+    URL     string
+    HLSURL  string
+}

From ba52febcc5660e9c7d1a10c0b703f25b1421e0f3 Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Wed, 14 May 2025 01:28:37 -0400
Subject: [PATCH 007/136] add args and types

---
 api/types/encrypted.go |  12 ----
 api/types/job.go       | 101 ------------------------------
 api/types/key.go       |  13 ----
 args/twitter.go        |  13 ++++
 args/web.go            |   8 +++
 crypto/keyring.go      |  60 ------------------
 job/telemetry.go       |  14 -----
 job/twitter.go         |  51 ----------------
 job/types.go           | 134 -----------------------------------------
 jobs/telemetry.go      |  14 -----
 jobs/twitter.go        | 121 -------------------------------------
 jobs/webscraper.go     |  31 ----------
 types/crypto.go        |  13 ----
 types/job.go           |  49 ---------------
 types/twitter.go       |  11 ----
 15 files changed, 21 insertions(+), 624 deletions(-)
 delete mode 100644 api/types/encrypted.go
 delete mode 100644 api/types/job.go
 delete mode 100644 api/types/key.go
 create mode 100644 args/twitter.go
 create mode 100644 args/web.go
 delete mode 100644 crypto/keyring.go
 delete mode 100644 job/telemetry.go
 delete mode 100644 job/twitter.go
 delete mode 100644 job/types.go
 delete mode 100644 jobs/telemetry.go
 delete mode 100644 jobs/twitter.go
 delete mode 100644 jobs/webscraper.go
 delete mode 100644 types/crypto.go
 delete mode 100644 types/job.go

diff --git a/api/types/encrypted.go b/api/types/encrypted.go
deleted file mode 100644
index 28dfe71..0000000
--- a/api/types/encrypted.go
+++ /dev/null
@@ -1,12 +0,0 @@
-package types
-
-// EncryptedRequest represents an encrypted request and result
-type EncryptedRequest struct {
-    EncryptedResult  string `json:"encrypted_result"`
-    EncryptedRequest string `json:"encrypted_request"`
-}
-
-// JobError represents an error response for a job
-type JobError struct {
-    Error string `json:"error"`
-}
diff --git a/api/types/job.go b/api/types/job.go
deleted file mode 100644
index d98ece9..0000000
--- a/api/types/job.go
+++ /dev/null
@@ -1,101 +0,0 @@
-// Package types defines common type definitions used across tee services
-package types
-
-import (
-    "crypto/sha256"
-    "encoding/json"
-    "fmt"
-
-    "golang.org/x/exp/rand"
-)
-
-// JobArguments represents arguments passed to a job
-type JobArguments map[string]interface{}
-
-// Unmarshal unmarshals job arguments into the supplied interface
-func (ja JobArguments) Unmarshal(i interface{}) error {
-    dat, err := json.Marshal(ja)
-    if err != nil {
-        return err
-    }
-    return json.Unmarshal(dat, i)
-}
-
-// Job represents a task to be executed by a worker
-type Job struct {
-    Type      string       `json:"type"`
-    Arguments JobArguments `json:"arguments"`
-    UUID      string       `json:"-"`
-    Nonce     string       `json:"quote"`
-    WorkerID  string       `json:"worker_id"`
-}
-
-var letterRunes = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+")
-
-func randStringRunes(n int) string {
-    b := make([]rune, n)
-    for i := range b {
-        b[i] = letterRunes[rand.Intn(len(letterRunes))]
-    }
-    return string(b)
-}
-
-// GenerateJobSignature generates a signature for the job.
-// Note: This method will need to be adjusted when used in actual implementations
-// to use the appropriate sealing mechanism.
-func (job *Job) GenerateJobSignature() (string, error) {
-    dat, err := json.Marshal(job)
-    if err != nil {
-        return "", err
-    }
-
-    checksum := sha256.New()
-    checksum.Write(dat)
-
-    job.Nonce = fmt.Sprintf("%s-%s", string(checksum.Sum(nil)), randStringRunes(99))
-
-    return job.Nonce, nil
-}
-
-// JobResponse represents a response to a job submission
-type JobResponse struct {
-    UID string `json:"uid"`
-}
-
-// JobResult represents the result of executing a job
-type JobResult struct {
-    Error      string `json:"error"`
-    Data       []byte `json:"data"`
-    Job        Job    `json:"job"`
-    NextCursor string `json:"next_cursor"`
-}
-
-// Success returns true if the job was successful.
-func (jr JobResult) Success() bool {
-    return jr.Error == ""
-}
-
-// Unmarshal unmarshals the job result data.
-func (jr JobResult) Unmarshal(i interface{}) error {
-    return json.Unmarshal(jr.Data, i)
-}
-
-// JobRequest represents a request to execute a job
-type JobRequest struct {
-    EncryptedJob string `json:"encrypted_job"`
-}
-
-// JobConfiguration represents configuration for a job
-type JobConfiguration map[string]interface{}
-
-// Unmarshal unmarshals the job configuration into the supplied interface.
-func (jc JobConfiguration) Unmarshal(v interface{}) error {
-    data, err := json.Marshal(jc)
-    if err != nil {
-        return fmt.Errorf("error marshalling job configuration: %w", err)
-    }
-    if err := json.Unmarshal(data, v); err != nil {
-        return fmt.Errorf("error unmarshalling job configuration: %w", err)
-    }
-
-    return nil
-}
diff --git a/api/types/key.go b/api/types/key.go
deleted file mode 100644
index 3a9616e..0000000
--- a/api/types/key.go
+++ /dev/null
@@ -1,13 +0,0 @@
-package types
-
-// Key represents an encryption key
-type Key struct {
-    Key string `json:"key"`
-    ID  string `json:"id"`
-}
-
-// KeyResponse represents a response when requesting a key
-type KeyResponse struct {
-    Key string `json:"key"`
-    ID  string `json:"id"`
-}
diff --git a/args/twitter.go b/args/twitter.go
new file mode 100644
index 0000000..09ceb3c
--- /dev/null
+++ b/args/twitter.go
@@ -0,0 +1,13 @@
+// Package types provides shared types between tee-worker and tee-indexer
+package types
+
+// TwitterSearchArguments defines args for Twitter searches
+type TwitterSearchArguments struct {
+    QueryType  string `json:"type"`        // Optional, type of search
+    Query      string `json:"query"`       // Username or search query
+    Count      int    `json:"count"`
+    StartTime  string `json:"start_time"`  // Optional ISO timestamp
+    EndTime    string `json:"end_time"`    // Optional ISO timestamp
+    MaxResults int    `json:"max_results"` // Optional, max number of results
+    NextCursor string `json:"next_cursor"`
+}
diff --git a/args/web.go b/args/web.go
new file mode 100644
index 0000000..8015302
--- /dev/null
+++ b/args/web.go
@@ -0,0 +1,8 @@
+package types
+
+type WebSearchArguments struct {
+    URL      string `json:"url"`
+    Selector string `json:"selector"`
+    Depth    int    `json:"depth"`
+    MaxDepth int    `json:"max_depth"`
+}
diff --git a/crypto/keyring.go b/crypto/keyring.go
deleted file mode 100644
index a698a6c..0000000
--- a/crypto/keyring.go
+++ /dev/null
@@ -1,60 +0,0 @@
-// Package crypto contains cryptographic interfaces and helpers for secure operations
-package crypto
-
-// KeyType defines the type of a key in the keyring
-type KeyType string
-
-const (
-    // SealingKeyType is used for sealing operations
-    SealingKeyType KeyType = "sealing"
-
-    // EncryptionKeyType is used for encryption operations
-    EncryptionKeyType KeyType = "encryption"
-
-    // SigningKeyType is used for signing operations
-    SigningKeyType KeyType = "signing"
-)
-
-// Key represents a cryptographic key in the keyring
-type Key struct {
-    ID   string  `json:"id"`
-    Type KeyType `json:"type"`
-    Data []byte  `json:"data"`
-}
-
-// KeyRing defines the interface for key management.
-// This follows the modern approach of using a KeyRing instead of single keys.
-type KeyRing interface {
-    // AddKey adds a key to the keyring
-    AddKey(key Key) error
-
-    // GetKey retrieves a key from the keyring by ID
-    GetKey(id string) (Key, error)
-
-    // GetKeysByType retrieves all keys of a specific type
-    GetKeysByType(keyType KeyType) ([]Key, error)
-
-    // RemoveKey removes a key from the keyring by ID
-    RemoveKey(id string) error
-
-    // Seal seals data using the keyring
-    Seal(data []byte) ([]byte, error)
-
-    // Unseal unseals data using the keyring
-    Unseal(data []byte) ([]byte, error)
-
-    // SealWithKey seals data using a specific key from the keyring
-    SealWithKey(keyID string, data []byte) ([]byte, error)
-
-    // UnsealWithKey unseals data using a specific key from the keyring
-    UnsealWithKey(keyID string, data []byte) ([]byte, error)
-}
-
-// TEEOptions defines options for TEE operations
-type TEEOptions struct {
-    // IsTestEnvironment indicates if this is a test environment
-    IsTestEnvironment bool
-
-    // KeysDirectory specifies the directory for storing keys
-    KeysDirectory string
-}
diff --git a/job/telemetry.go b/job/telemetry.go
deleted file mode 100644
index f2fc5e7..0000000
--- a/job/telemetry.go
+++ /dev/null
@@ -1,14 +0,0 @@
-package job
-
-// TelemetryJobType represents the job type for telemetry operations
-const TelemetryJobType = "telemetry"
-
-// TelemetryConfiguration defines configuration for telemetry jobs
-type TelemetryConfiguration struct {
-    StatsInterval int `json:"stats_interval"`
-}
-
-// TelemetryResult represents the result of a telemetry operation
-type TelemetryResult struct {
-    Stats map[string]uint `json:"stats"`
-}
diff --git a/job/twitter.go b/job/twitter.go
deleted file mode 100644
index ca0d5be..0000000
--- a/job/twitter.go
+++ /dev/null
@@ -1,51 +0,0 @@
-package job
-
-// TwitterJobTypes defines the various types of Twitter jobs
-const (
-    // TwitterScraperType represents standard Twitter scraping jobs
-    TwitterScraperType = "twitter"
-
-    // TwitterCredentialScraperType represents Twitter scraping jobs using credentials
-    TwitterCredentialScraperType = "twitter-credential"
-
-    // TwitterApiScraperType represents Twitter scraping jobs using API keys
-    TwitterApiScraperType = "twitter-api"
-)
-
-// TwitterKeyAuthType defines the type of authentication for Twitter API keys
-type TwitterKeyAuthType string
-
-// TwitterKeyAuthType constants
-const (
-    CredentialAuthType TwitterKeyAuthType = "credential"
-    ApiKeyAuthType     TwitterKeyAuthType = "apikey"
-    UnknownAuthType    TwitterKeyAuthType = "unknown"
-)
-
-// TwitterApiKeyType defines the type of Twitter API key
-type TwitterApiKeyType string
-
-// TwitterApiKeyType constants
-const (
-    TwitterApiKeyTypeBase       TwitterApiKeyType = "base"
-    TwitterApiKeyTypeElevated   TwitterApiKeyType = "elevated"
-    TwitterApiKeyTypeCredential TwitterApiKeyType = "credential"
-    TwitterApiKeyTypeUnknown    TwitterApiKeyType = "unknown"
-)
-
-// TwitterScraperConfiguration defines configuration for Twitter scraping
-type TwitterScraperConfiguration struct {
-    Accounts              []string `json:"twitter_accounts"`
-    ApiKeys               []string `json:"twitter_api_keys"`
-    DataDir               string   `json:"data_dir"`
-    SkipLoginVerification bool     `json:"skip_login_verification,omitempty"` // If true, skips Twitter's verify_credentials check
-}
-
-// TwitterScraperArgs defines arguments for Twitter scraping jobs
-type TwitterScraperArgs struct {
-    SearchType string `json:"type"`
-    Query      string `json:"query"`
-    Count      int    `json:"count"`
-    MaxResults int    `json:"max_results"`
-    NextCursor string `json:"next_cursor"`
-}
diff --git a/job/types.go b/job/types.go
deleted file mode 100644
index 09c4b60..0000000
--- a/job/types.go
+++ /dev/null
@@ -1,134 +0,0 @@
-// Package job contains the core job types shared across TEE services
-package job
-
-import (
-    "crypto/sha256"
-    "encoding/json"
-    "fmt"
-
-    "golang.org/x/exp/rand"
-)
-
-// Version information
-const (
-    VersionMajor = 0
-    VersionMinor = 1
-    VersionPatch = 0
-)
-
-// GetVersion returns the semantic version string
-func GetVersion() string {
-    return fmt.Sprintf("%d.%d.%d", VersionMajor, VersionMinor, VersionPatch)
-}
-
-// Arguments represents arguments passed to a job
-type Arguments map[string]interface{}
-
-// Unmarshal unmarshals job arguments into the supplied interface
-func (ja Arguments) Unmarshal(i interface{}) error {
-    dat, err := json.Marshal(ja)
-    if err != nil {
-        return err
-    }
-    return json.Unmarshal(dat, i)
-}
-
-// Job represents a task to be executed by a worker
-type Job struct {
-    Type      string    `json:"type"`
-    Arguments Arguments `json:"arguments"`
-    UUID      string    `json:"-"`
-    Nonce     string    `json:"quote"`
-    WorkerID  string    `json:"worker_id"`
-}
-
-var letterRunes = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+")
-
-func randStringRunes(n int) string {
-    b := make([]rune, n)
-    for i := range b {
-        b[i] = letterRunes[rand.Intn(len(letterRunes))]
-    }
-    return string(b)
-}
-
-// GenerateJobSignature generates a signature for the job.
-// Note: This method is a placeholder. Each service will need to implement
-// its own version with the appropriate sealing mechanism.
-func (job *Job) GenerateJobSignature() (string, error) {
-    dat, err := json.Marshal(job)
-    if err != nil {
-        return "", err
-    }
-
-    checksum := sha256.New()
-    checksum.Write(dat)
-
-    job.Nonce = fmt.Sprintf("%s-%s", string(checksum.Sum(nil)), randStringRunes(99))
-
-    return job.Nonce, nil
-}
-
-// Response represents a response to a job submission
-type Response struct {
-    UID string `json:"uid"`
-}
-
-// Result represents the result of executing a job
-type Result struct {
-    Error      string `json:"error"`
-    Data       []byte `json:"data"`
-    Job        Job    `json:"job"`
-    NextCursor string `json:"next_cursor"`
-}
-
-// Success returns true if the job was successful.
-func (jr Result) Success() bool {
-    return jr.Error == ""
-}
-
-// Unmarshal unmarshals the job result data.
-func (jr Result) Unmarshal(i interface{}) error {
-    return json.Unmarshal(jr.Data, i)
-}
-
-// Request represents a request to execute a job
-type Request struct {
-    EncryptedJob string `json:"encrypted_job"`
-}
-
-// Configuration represents configuration for a job
-type Configuration map[string]interface{}
-
-// Unmarshal unmarshals the job configuration into the supplied interface.
-func (jc Configuration) Unmarshal(v interface{}) error {
-    data, err := json.Marshal(jc)
-    if err != nil {
-        return fmt.Errorf("error marshalling job configuration: %w", err)
-    }
-    if err := json.Unmarshal(data, v); err != nil {
-        return fmt.Errorf("error unmarshalling job configuration: %w", err)
-    }
-
-    return nil
-}
-
-// Parameters defines the base interface for job parameters
-type Parameters interface {
-    // GetIdentifier returns a unique identifier for the job parameters
-    GetIdentifier() string
-}
-
-// Status defines the base interface for job status
-type Status interface {
-    // GetError returns the job error
-    GetError() string
-
-    // SetError sets the job error
-    SetError(err string)
-
-    // GetStatus returns the job status
-    GetStatus() string
-
-    // SetStatus sets the job status
-    SetStatus(status string)
-}
diff --git a/jobs/telemetry.go b/jobs/telemetry.go
deleted file mode 100644
index 55c249c..0000000
--- a/jobs/telemetry.go
+++ /dev/null
@@ -1,14 +0,0 @@
-package jobs
-
-// TelemetryJobType represents the job type for telemetry operations
-const TelemetryJobType = "telemetry"
-
-// TelemetryConfiguration defines configuration for telemetry jobs
-type TelemetryConfiguration struct {
-    StatsInterval int `json:"stats_interval"`
-}
-
-// TelemetryResult represents the result of a telemetry operation
-type TelemetryResult struct {
-    Stats map[string]uint `json:"stats"`
-}
diff --git a/jobs/twitter.go b/jobs/twitter.go
deleted file mode 100644
index 102f999..0000000
--- a/jobs/twitter.go
+++ /dev/null
@@ -1,121 +0,0 @@
-package jobs
-
-import (
-    "time"
-)
-
-// TwitterJobTypes defines the various types of Twitter jobs
-const (
-    // TwitterScraperType represents standard Twitter scraping jobs
-    TwitterScraperType = "twitter"
-
-    // TwitterCredentialScraperType represents Twitter scraping jobs using credentials
-    TwitterCredentialScraperType = "twitter-credential"
-
-    // TwitterApiScraperType represents Twitter scraping jobs using API keys
-    TwitterApiScraperType = "twitter-api"
-)
-
-// TwitterKeyAuthType defines the type of authentication for Twitter API keys
-type TwitterKeyAuthType string
-
-// TwitterKeyAuthType constants
-const (
-    CredentialAuthType TwitterKeyAuthType = "credential"
-    ApiKeyAuthType     TwitterKeyAuthType = "apikey"
-    UnknownAuthType    TwitterKeyAuthType = "unknown"
-)
-
-// TwitterApiKeyType defines the type of Twitter API key
-type TwitterApiKeyType string
-
-// TwitterApiKeyType constants
-const (
-    TwitterApiKeyTypeBase       TwitterApiKeyType = "base"
-    TwitterApiKeyTypeElevated   TwitterApiKeyType = "elevated"
-    TwitterApiKeyTypeCredential TwitterApiKeyType = "credential"
-    TwitterApiKeyTypeUnknown    TwitterApiKeyType = "unknown"
-)
-
-// TwitterScraperConfiguration defines configuration for Twitter scraping
-type TwitterScraperConfiguration struct {
-    Accounts              []string `json:"twitter_accounts"`
-    ApiKeys               []string `json:"twitter_api_keys"`
-    DataDir               string   `json:"data_dir"`
-    SkipLoginVerification bool     `json:"skip_login_verification,omitempty"` // If true, skips Twitter's verify_credentials check
-}
-
-// TwitterScraperArgs defines arguments for Twitter scraping jobs
-type TwitterScraperArgs struct {
-    SearchType string `json:"type"`
-    Query      string `json:"query"`
-    Count      int    `json:"count"`
-    MaxResults int    `json:"max_results"`
-    NextCursor string `json:"next_cursor"`
-}
-
-// TweetResult represents a Tweet returned from Twitter
-type TweetResult struct {
-    ID             int64     `json:"id"`
-    TweetID        string    `json:"tweet_id"`
-    ConversationID string    `json:"conversation_id"`
-    UserID         string    `json:"user_id"`
-    Text           string    `json:"text"`
-    CreatedAt      time.Time `json:"created_at"`
-    Timestamp      int64     `json:"timestamp"`
-
-    IsQuoted     bool     `json:"is_quoted"`
-    IsPin        bool     `json:"is_pin"`
-    IsReply      bool     `json:"is_reply"`
-    IsRetweet    bool     `json:"is_retweet"`
-    IsSelfThread bool     `json:"is_self_thread"`
-    Likes        int      `json:"likes"`
-    Hashtags     []string `json:"hashtags"`
-    HTML         string   `json:"html"`
-    Replies      int      `json:"replies"`
-    Retweets     int      `json:"retweets"`
-    URLs         []string `json:"urls"`
-    Username     string   `json:"username"`
-
-    Photos []Photo `json:"photos"`
-    Videos []Video `json:"videos"`
-
-    RetweetedStatusID string `json:"retweeted_status_id"`
-    Views             int    `json:"views"`
-    SensitiveContent  bool   `json:"sensitive_content"`
-
-    // Fields from TwitterX API
-    AuthorID          string        `json:"author_id"`
-    PublicMetrics     PublicMetrics `json:"public_metrics"`
-    PossiblySensitive bool          `json:"possibly_sensitive"`
-    Lang              string        `json:"lang"`
-    NewestID          string        `json:"newest_id"`
-    OldestID          string        `json:"oldest_id"`
-    ResultCount       int           `json:"result_count"`
-
-    Error error `json:"-"`
-}
-
-// PublicMetrics represents public metrics for a Tweet
-type PublicMetrics struct {
-    RetweetCount    int `json:"retweet_count"`
-    ReplyCount      int `json:"reply_count"`
-    LikeCount       int `json:"like_count"`
-    QuoteCount      int `json:"quote_count"`
-    BookmarkCount   int `json:"bookmark_count"`
-    ImpressionCount int `json:"impression_count"`
-}
-
-// Photo represents an image attached to a Tweet
-type Photo struct {
-    ID  string `json:"id"`
-    URL string `json:"url"`
-}
-
-// Video represents a video attached to a Tweet
-type Video struct {
-    ID      string `json:"id"`
-    Preview string `json:"preview"`
-    URL     string `json:"url"`
-    HLSURL  string `json:"hls_url"`
-}
diff --git a/jobs/webscraper.go b/jobs/webscraper.go
deleted file mode 100644
index 7c5029d..0000000
--- a/jobs/webscraper.go
+++ /dev/null
@@ -1,31 +0,0 @@
-package jobs
-
-// WebScraperType represents the job type for web scraping
-const WebScraperType = "webscraper"
-
-// WebScraperConfiguration defines configuration for web scraping
-type WebScraperConfiguration struct {
-    Blacklist []string `json:"blacklist"`
-}
-
-// WebScraperArgs defines arguments for web scraping jobs
-type WebScraperArgs struct {
-    URL      string `json:"url"`
-    Selector string `json:"selector"`
-    Depth    int    `json:"depth"`
-    MaxDepth int    `json:"max_depth"`
-}
-
-// Section represents a selected section of a web page
-type Section struct {
-    Text     string `json:"text"`
-    HTML     string `json:"html"`
-    Selector string `json:"selector"`
-}
-
-// CollectedData represents data collected from web scraping
-type CollectedData struct {
-    URL      string    `json:"url"`
-    Title    string    `json:"title"`
-    Sections []Section `json:"sections"`
-}
diff --git a/types/crypto.go b/types/crypto.go
deleted file mode 100644
index e60d872..0000000
--- a/types/crypto.go
+++ /dev/null
@@ -1,13 +0,0 @@
-package types
-
-// EncryptedRequest represents an encrypted request
-type EncryptedRequest struct {
-    EncryptedResult  string `json:"encrypted_result"`
-    EncryptedRequest string `json:"encrypted_request"`
-}
-
-// Key represents a cryptographic key
-type Key struct {
-    ID   string `json:"id"`
-    Data []byte `json:"data,omitempty"`
-}
diff --git a/types/job.go b/types/job.go
deleted file mode 100644
index 0f54fb0..0000000
--- a/types/job.go
+++ /dev/null
@@ -1,49 +0,0 @@
-// Package types contains the minimal shared type definitions for tee-worker and tee-indexer
-package types
-
-import (
-    "encoding/json"
-)
-
-// JobArguments represents arguments passed to a job
-type JobArguments map[string]interface{}
-
-// Unmarshal unmarshals job arguments into the supplied interface
-func (ja JobArguments) Unmarshal(i interface{}) error {
-    dat, err := json.Marshal(ja)
-    if err != nil {
-        return err
-    }
-    return json.Unmarshal(dat, i)
-}
-
-// Job represents a task to be executed by a worker
-type Job struct {
-    Type      string       `json:"type"`
-    Arguments JobArguments `json:"arguments"`
-    UUID      string       `json:"-"`
-    Nonce     string       `json:"quote"`
-    WorkerID  string       `json:"worker_id"`
-}
-
-// JobResult represents the result of executing a job
-type JobResult struct {
-    Error      string `json:"error"`
-    Data       []byte `json:"data"`
-    Job        Job    `json:"job"`
-    NextCursor string `json:"next_cursor"`
-}
-
-// JobRequest represents a request to execute a job
-type JobRequest struct {
-    EncryptedJob string `json:"encrypted_job"`
-}
-
-// Common job type constants
-const (
-    // WebScraperType represents the job type for web scraping
-    WebScraperType = "webscraper"
-
-    // TwitterScraperType represents standard Twitter scraping jobs
-    TwitterScraperType = "twitter"
-)
diff --git a/types/twitter.go b/types/twitter.go
index 4569b49..9e8c563 100644
--- a/types/twitter.go
+++ b/types/twitter.go
@@ -3,17 +3,6 @@ package types
 
 import "time"
 
-// TwitterSearchArguments defines arguments for Twitter searches
-type TwitterSearchArguments struct {
-    QueryType  string `json:"type"`        // Optional, type of search
-    Query      string `json:"query"`       // Username or search query
-    Count      int    `json:"count"`
-    StartTime  string `json:"start_time"`  // Optional ISO timestamp
-    EndTime    string `json:"end_time"`    // Optional ISO timestamp
-    MaxResults int    `json:"max_results"` // Optional, max number of results
-    NextCursor string `json:"next_cursor"`
-}
-
 type TweetResult struct {
     ID             int64 `json:"id"`
     TweetID        string

From 74154c910a5ca4ef716288376746f674027a9e46 Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Wed, 14 May 2025 01:29:42 -0400
Subject: [PATCH 008/136] add args and types

---
 args/twitter.go | 3 +--
 args/web.go     | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/args/twitter.go b/args/twitter.go
index 09ceb3c..7fc4a9b 100644
--- a/args/twitter.go
+++ b/args/twitter.go
@@ -1,5 +1,4 @@
-// Package types provides shared types between tee-worker and tee-indexer
-package types
+package args
 
 // TwitterSearchArguments defines args for Twitter searches
 type TwitterSearchArguments struct {
diff --git a/args/web.go b/args/web.go
index 8015302..66e2093 100644
--- a/args/web.go
+++ b/args/web.go
@@ -1,4 +1,4 @@
-package types
+package args
 
 type WebSearchArguments struct {
     URL      string `json:"url"`

From 2f2c09d27133c4ef54252857901152cfa8ebdc41 Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Wed, 14 May 2025 01:51:53 -0400
Subject: [PATCH 009/136] revert to use 1.23.0 go

---
 go.mod | 4 +---
 go.sum | 2 --
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/go.mod b/go.mod
index 4cb1723..553d075 100644
--- a/go.mod
+++ b/go.mod
@@ -1,5 +1,3 @@
 module github.com/masa-finance/tee-types
 
-go 1.24.0
-
-require golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6
+go 1.23.0
diff --git a/go.sum b/go.sum
index 2fbef45..e69de29 100644
--- a/go.sum
+++ b/go.sum
@@ -1,2 +0,0 @@
-golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 h1:y5zboxd6LQAqYIhHnB48p0ByQ/GnQx2BE33L8BOHQkI=
-golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6/go.mod h1:U6Lno4MTRCDY+Ba7aCcauB9T60gsv5s4ralQzP72ZoQ=

From 5b8af7db591cc7740b5333beef7747f08eae960b Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Fri, 16 May 2025 10:17:03 -0400
Subject: [PATCH 010/136] remove the web types since we already have the web
 search args on the args

---
 types/web.go | 8 --------
 1 file changed, 8 deletions(-)
 delete mode 100644 types/web.go

diff --git a/types/web.go b/types/web.go
deleted file mode 100644
index 8015302..0000000
--- a/types/web.go
+++ /dev/null
@@ -1,8 +0,0 @@
-package types
-
-type WebSearchArguments struct {
-    URL      string `json:"url"`
-    Selector string `json:"selector"`
-    Depth    int    `json:"depth"`
-    MaxDepth int    `json:"max_depth"`
-}

From 2cc36907eb91dbfc3de4117cad39f048759ac05e Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Fri, 6 Jun 2025 12:56:12 -0400
Subject: [PATCH 011/136] add tiktok types

---
 args/tiktok.go  |  7 +++++++
 types/tiktok.go | 11 +++++++++++
 2 files changed, 18 insertions(+)
 create mode 100644 args/tiktok.go
 create mode 100644 types/tiktok.go

diff --git a/args/tiktok.go b/args/tiktok.go
new file mode 100644
index 0000000..9a082d9
--- /dev/null
+++ b/args/tiktok.go
@@ -0,0 +1,7 @@
+package args
+
+// TikTokTranscriptionArguments defines args for TikTok transcriptions
+type TikTokTranscriptionArguments struct {
+    VideoURL string `json:"video_url"`
+    Language string `json:"language,omitempty"` // e.g., "eng-US"
+}
\ No newline at end of file
diff --git a/types/tiktok.go b/types/tiktok.go
new file mode 100644
index 0000000..39ceac4
--- /dev/null
+++ b/types/tiktok.go
@@ -0,0 +1,11 @@
+// Package types provides shared types between tee-worker and tee-indexer
+package types
+
+// TikTokTranscriptionResult defines the structure of the result data for a TikTok transcription
+type TikTokTranscriptionResult struct {
+    TranscriptionText string `json:"transcription_text"`
+    DetectedLanguage  string `json:"detected_language,omitempty"`
+    VideoTitle        string `json:"video_title,omitempty"`
+    OriginalURL       string `json:"original_url"`
+    ThumbnailURL      string `json:"thumbnail_url,omitempty"`
+}
\ No newline at end of file

From f54fb32b355c6ae11efd89f9e7e9336218678104 Mon Sep 17 00:00:00 2001
From: Alvin Reyes
Date: Sun, 22 Jun 2025 11:09:19 -0400
Subject: [PATCH 012/136] add linkedin types

- Add LinkedInSearchArguments for search queries
- Add LinkedInProfileResult for profile data
- Support network filters and pagination
---
 args/linkedin.go  | 10 ++++++++++
 types/linkedin.go | 13 +++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 args/linkedin.go
 create mode 100644 types/linkedin.go

diff --git a/args/linkedin.go b/args/linkedin.go
new file mode 100644
index 0000000..436143d
--- /dev/null
+++ b/args/linkedin.go
@@ -0,0 +1,10 @@
+package args
+
+// LinkedInSearchArguments defines args for LinkedIn searches
+type LinkedInSearchArguments struct {
+    QueryType      string   `json:"type"`                      // "searchbyquery", "getprofile"
+    Query          string   `json:"query"`                     // Keywords for search or username for profile
+    NetworkFilters []string `json:"network_filters,omitempty"` // ["F", "S", "O"] - First, Second, Other
+    MaxResults     int      `json:"max_results"`               // Maximum number of results to return
+    Start          int      `json:"start"`                     // Pagination start offset
+}
`json:"urn"` // LinkedIn's unique resource name + FullName string `json:"full_name"` // Person's full name + Headline string `json:"headline"` // Professional headline/title + Location string `json:"location"` // Geographic location + ProfileURL string `json:"profile_url"` // Full LinkedIn profile URL + Degree string `json:"degree,omitempty"` // Connection degree (1st, 2nd, etc.) +} \ No newline at end of file From d1bd5096ee394b37b84f5ad865ad7ba03313ea96 Mon Sep 17 00:00:00 2001 From: Alvin Reyes Date: Sun, 22 Jun 2025 15:28:10 -0400 Subject: [PATCH 013/136] add linkedin --- args/linkedin.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/linkedin.go b/args/linkedin.go index 436143d..fc138d4 100644 --- a/args/linkedin.go +++ b/args/linkedin.go @@ -4,7 +4,7 @@ package args type LinkedInSearchArguments struct { QueryType string `json:"type"` // "searchbyquery", "getprofile" Query string `json:"query"` // Keywords for search or username for profile - NetworkFilters []string `json:"network_filters,omitempty"` // ["F", "S", "O"] - First, Second, Other + NetworkFilters []string `json:"network_filters,omitempty"` // ["F", "S", "O"] - First, Second, Other (default: all) MaxResults int `json:"max_results"` // Maximum number of results to return Start int `json:"start"` // Pagination start offset } \ No newline at end of file From 9b461816d7f3ba208ea042ba24a69d16d6deecd0 Mon Sep 17 00:00:00 2001 From: Brendan Playford <34052452+teslashibe@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:01:52 -0700 Subject: [PATCH 014/136] feat(args): extend and rename linkedin arguments for profile fetching (#2) - Rename LinkedInSearchArguments to LinkedInArguments for broader scope - Add PublicIdentifier field to support individual profile fetching - Maintain backward compatibility with deprecated type alias - Field uses omitempty tag to preserve existing search functionality - Enables dual-purpose struct for both search and profile operations --- .cursor/rules/collaboration-rules.mdc | 30 ++++++++++++++++ .cursor/rules/tee-types_update_plan.mdc | 48 +++++++++++++++++++++++++ args/linkedin.go | 21 ++++++----- types/linkedin.go | 41 ++++++++++++++++++++- 4 files changed, 131 insertions(+), 9 deletions(-) create mode 100644 .cursor/rules/collaboration-rules.mdc create mode 100644 .cursor/rules/tee-types_update_plan.mdc diff --git a/.cursor/rules/collaboration-rules.mdc b/.cursor/rules/collaboration-rules.mdc new file mode 100644 index 0000000..fbc46e1 --- /dev/null +++ b/.cursor/rules/collaboration-rules.mdc @@ -0,0 +1,30 @@ +--- +description: +globs: +alwaysApply: true +--- +# Collaboration Rules + +## Planning and Confirmation Rule + +**Before implementing any code changes, features, or modifications:** + +1. **Create a Plan**: Always develop a clear, detailed plan that outlines: + - What changes will be made + - Which files will be modified or created + - The approach and methodology + - Expected outcomes and impacts + +2. **Confirm with User**: Present the plan to the user and wait for explicit confirmation before: + - Making any file modifications + - Creating new files + - Running commands that modify the codebase + - Implementing any suggested changes + +3. **Get Approval**: Only proceed with implementation after receiving clear approval from the user. + +4. **No Assumptions**: Never assume the user wants changes implemented immediately, even if they seem obvious or beneficial. 
+ +**Exception**: Read-only operations (viewing files, searching, analyzing) do not require prior confirmation. + +This rule ensures we maintain collaborative control over the codebase and prevents unwanted changes. diff --git a/.cursor/rules/tee-types_update_plan.mdc b/.cursor/rules/tee-types_update_plan.mdc new file mode 100644 index 0000000..741eea6 --- /dev/null +++ b/.cursor/rules/tee-types_update_plan.mdc @@ -0,0 +1,48 @@ +--- +description: +globs: +alwaysApply: false +--- +# tee-types: LinkedIn Data Structures Extension + +## Overview +This plan details the required changes for the `github.com/masa-finance/tee-types` repository. These changes are a prerequisite for integrating the new LinkedIn profile fetching functionality into the `tee-worker`. The goal is to extend the existing data structures to support both profile search and full profile fetching jobs. + +## ⚠️ CRITICAL REQUIREMENTS +- **BACKWARD COMPATIBILITY**: The changes must not break existing `tee-worker` functionality that relies on `searchbyquery`. +- **CONSISTENCY**: The new data structures should align with the output of the `linkedin-scraper` SDK (`v1.0.0`). +- **CLARITY**: Use clear and descriptive naming for new structs and fields. + +## Implementation Steps + +### Phase 1: Argument Structure Update + +#### Step 1.1: Extend and Rename Job Arguments +**Objective**: Create a unified argument struct that supports both search and profile fetching. +**Files**: `args/linkedin.go` +**Action**: +- Rename the existing `LinkedInSearchArguments` struct to `LinkedInArguments`. This provides a more generic name for future extensions. +- Add a new field `PublicIdentifier string `json:"public_identifier,omitempty"` to the renamed `LinkedInArguments` struct. This will be used to specify the target profile for fetching. +**Verification**: The new `LinkedInArguments` struct contains fields for both search (`Query`, `MaxResults`, etc.) and profile fetching (`PublicIdentifier`). +**Commit**: `feat(args): extend and rename linkedin arguments for profile fetching` + +### Phase 2: Result Structure Extension + +#### Step 2.1: Define Comprehensive Profile Result +**Objective**: Create a new struct to hold the rich data from a full profile fetch. +**Files**: `types/linkedin.go` +**Action**: +- Create a new struct `LinkedInFullProfileResult`. +- This struct should include fields for all the data provided by the scraper's `GetProfile` method, such as: + - `PublicIdentifier`, `URN`, `FullName`, `Headline`, `Location`, `Summary` + - Slices for `[]Experience`, `[]Education`, `[]Skill` + - `ProfilePictureURL` +- Define helper structs for `Experience`, `Education`, and `Skill` with relevant fields (e.g., `Title`, `CompanyName` for experience; `SchoolName`, `DegreeName` for education). +**Verification**: The `LinkedInFullProfileResult` and its nested structs are defined and compile correctly. The structure matches the expected output from the `linkedin-scraper`. +**Commit**: `feat(types): add LinkedInFullProfileResult for detailed profiles` + +## Success Criteria +- ✅ `args/linkedin.go` contains the updated `LinkedInArguments` struct. +- ✅ `types/linkedin.go` contains the new `LinkedInFullProfileResult` and its associated substructures. +- ✅ The changes are non-breaking for code that uses the old `LinkedInSearchArguments` (after a name update). +- ✅ The new structures are ready to be consumed by the `tee-worker`. 
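For reference, the dual-purpose argument struct described in Phase 1 can be exercised as below — a minimal sketch assuming the `args` package layout this patch introduces; the `dispatchLinkedIn` helper and its output are illustrative, not part of the repository:

```go
package main

import (
	"fmt"

	"github.com/masa-finance/tee-types/args"
)

// dispatchLinkedIn sketches how a consumer branches on QueryType:
// Query drives searches, while PublicIdentifier selects the target
// profile for a full fetch.
func dispatchLinkedIn(a *args.LinkedInArguments) error {
	switch a.QueryType {
	case "searchbyquery":
		fmt.Printf("searching %q (max %d, start %d)\n", a.Query, a.MaxResults, a.Start)
	case "getprofile":
		fmt.Printf("fetching full profile %q\n", a.PublicIdentifier)
	default:
		return fmt.Errorf("unsupported type %q", a.QueryType)
	}
	return nil
}

func main() {
	_ = dispatchLinkedIn(&args.LinkedInArguments{
		QueryType:        "getprofile",
		PublicIdentifier: "john-doe-123",
	})
}
```

The deprecated `LinkedInSearchArguments` alias keeps older call sites compiling unchanged: a Go type alias is interchangeable with `LinkedInArguments` at every use site, so the rename is non-breaking.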
diff --git a/args/linkedin.go b/args/linkedin.go index fc138d4..bd94dbd 100644 --- a/args/linkedin.go +++ b/args/linkedin.go @@ -1,10 +1,15 @@ package args -// LinkedInSearchArguments defines args for LinkedIn searches -type LinkedInSearchArguments struct { - QueryType string `json:"type"` // "searchbyquery", "getprofile" - Query string `json:"query"` // Keywords for search or username for profile - NetworkFilters []string `json:"network_filters,omitempty"` // ["F", "S", "O"] - First, Second, Other (default: all) - MaxResults int `json:"max_results"` // Maximum number of results to return - Start int `json:"start"` // Pagination start offset -} \ No newline at end of file +// LinkedInArguments defines args for LinkedIn operations +type LinkedInArguments struct { + QueryType string `json:"type"` // "searchbyquery", "getprofile" + Query string `json:"query"` // Keywords for search or username for profile + PublicIdentifier string `json:"public_identifier,omitempty"` + NetworkFilters []string `json:"network_filters,omitempty"` // ["F", "S", "O"] - First, Second, Other (default: all) + MaxResults int `json:"max_results"` // Maximum number of results to return + Start int `json:"start"` // Pagination start offset +} + +// LinkedInSearchArguments is an alias for LinkedInArguments for backward compatibility. +// Deprecated: use LinkedInArguments instead. +type LinkedInSearchArguments = LinkedInArguments diff --git a/types/linkedin.go b/types/linkedin.go index e4d8e39..b5c050a 100644 --- a/types/linkedin.go +++ b/types/linkedin.go @@ -10,4 +10,43 @@ type LinkedInProfileResult struct { Location string `json:"location"` // Geographic location ProfileURL string `json:"profile_url"` // Full LinkedIn profile URL Degree string `json:"degree,omitempty"` // Connection degree (1st, 2nd, etc.) 
-} \ No newline at end of file +} + +// Experience defines the structure for a single entry in a user's work experience +type Experience struct { + Title string `json:"title"` + CompanyName string `json:"company_name"` + Location string `json:"location,omitempty"` + StartDate string `json:"start_date,omitempty"` + EndDate string `json:"end_date,omitempty"` + Description string `json:"description,omitempty"` +} + +// Education defines the structure for a single entry in a user's education history +type Education struct { + SchoolName string `json:"school_name"` + DegreeName string `json:"degree_name,omitempty"` + FieldOfStudy string `json:"field_of_study,omitempty"` + StartDate string `json:"start_date,omitempty"` + EndDate string `json:"end_date,omitempty"` + Description string `json:"description,omitempty"` +} + +// Skill defines the structure for a single skill entry +type Skill struct { + Name string `json:"name"` +} + +// LinkedInFullProfileResult defines the structure for a detailed LinkedIn profile +type LinkedInFullProfileResult struct { + PublicIdentifier string `json:"public_identifier"` + URN string `json:"urn"` + FullName string `json:"full_name"` + Headline string `json:"headline"` + Location string `json:"location"` + Summary string `json:"summary,omitempty"` + ProfilePictureURL string `json:"profile_picture_url,omitempty"` + Experiences []Experience `json:"experiences,omitempty"` + Education []Education `json:"education,omitempty"` + Skills []Skill `json:"skills,omitempty"` +} From 8573dadee4cf7a206d9743c921d90862e6086155 Mon Sep 17 00:00:00 2001 From: teslashibe <34052452+teslashibe@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:04:15 -0700 Subject: [PATCH 015/136] docs: update README with LinkedIn profile fetching documentation - Add comprehensive overview of package capabilities - Document LinkedIn search and profile fetching usage examples - Update structure section with detailed file descriptions - Add backward compatibility information - Include installation instructions with v1.0.0 tag - Add release history section --- README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index df7476c..f979f35 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,10 @@ A shared type definitions package for Masa Finance TEE projects. +## Overview + +This package provides essential type definitions for communication between tee-worker and tee-indexer services. It supports various social media platforms including LinkedIn, Twitter, and TikTok, enabling both search and detailed profile fetching operations. + ## Minimal Sharing Approach This package follows a minimalist approach, sharing only the essential types needed for the interface between tee-worker and tee-indexer. This approach reduces coupling between the services while ensuring consistent communication. 
@@ -10,18 +14,66 @@ Each service should implement their own internal types that extend or build upon ## Structure -*WIP* +### Arguments (`args/`) +- `linkedin.go` - LinkedIn operation arguments supporting both search and profile fetching +- `twitter.go` - Twitter-specific arguments +- `tiktok.go` - TikTok-specific arguments +- `web.go` - General web scraping arguments + +### Types (`types/`) +- `linkedin.go` - LinkedIn result types including search results and detailed profile data +- `twitter.go` - Twitter result structures +- `tiktok.go` - TikTok result structures + +## LinkedIn Support + +### Search Operations +Use `LinkedInArguments` (or the deprecated `LinkedInSearchArguments`) for profile searches: + +```go +args := &args.LinkedInArguments{ + QueryType: "searchbyquery", + Query: "software engineer", + MaxResults: 10, + Start: 0, +} +``` + +### Profile Fetching +Use `LinkedInArguments` with `PublicIdentifier` for detailed profile retrieval: + +```go +args := &args.LinkedInArguments{ + QueryType: "getprofile", + PublicIdentifier: "john-doe-123", +} +``` + +### Result Types +- `LinkedInProfileResult` - Basic profile information from search results +- `LinkedInFullProfileResult` - Comprehensive profile data including experience, education, and skills ## Usage To use this package in your project, add it as a dependency: ```bash -go get github.com/masa-finance/tee-types +go get github.com/masa-finance/tee-types@v1.0.0 ``` Then import the required packages: ```go -import "github.com/masa-finance/tee-types/types" -``` \ No newline at end of file +import ( + "github.com/masa-finance/tee-types/args" + "github.com/masa-finance/tee-types/types" +) +``` + +## Backward Compatibility + +The package maintains full backward compatibility. Existing code using `LinkedInSearchArguments` will continue to work, though migration to `LinkedInArguments` is recommended for future compatibility. 
+ +## Releases + +- **v1.0.0** - Initial release with LinkedIn profile fetching support \ No newline at end of file From 1fd920ca07a035ef7a68747b115268f3e26290cb Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 24 Jul 2025 01:01:16 +0200 Subject: [PATCH 016/136] feat: jobs type --- types/jobs.go | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 types/jobs.go diff --git a/types/jobs.go b/types/jobs.go new file mode 100644 index 0000000..f10fcf7 --- /dev/null +++ b/types/jobs.go @@ -0,0 +1,40 @@ +package types + +type Capability string + +// JobType represents the type of job that can be executed +type JobType string + +// Job type constants - centralized from tee-indexer and tee-worker +const ( + // Web scraping job type + WebJob JobType = "web-scraper" + + // Telemetry job type for worker monitoring and stats + TelemetryJob JobType = "telemetry" + + // TikTok transcription job type + TiktokJob JobType = "tiktok-transcription" + + // Twitter job types + TwitterJob JobType = "twitter-scraper" // General Twitter scraping (uses best available auth) + TwitterCredentialJob JobType = "twitter-credential-scraper" // Twitter scraping with credentials + TwitterApiJob JobType = "twitter-api-scraper" // Twitter scraping with API keys + + // Unknown/invalid job type + UnknownJob JobType = "" +) + +// String returns the string representation of the JobType +func (j JobType) String() string { + return string(j) +} + +// JobCapability represents the capabilities of a specific job type +type JobCapability struct { + JobType string `json:"job_type"` + Capabilities []Capability `json:"capabilities"` +} + +// WorkerCapabilities represents all capabilities available on a worker +type WorkerCapabilities []JobCapability From 836de78e6e635f241586fecc2a78ff89e4599f43 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 24 Jul 2025 01:46:52 +0200 Subject: [PATCH 017/136] chore: rename jobs --- types/jobs.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/types/jobs.go b/types/jobs.go index f10fcf7..c749752 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -8,21 +8,19 @@ type JobType string // Job type constants - centralized from tee-indexer and tee-worker const ( // Web scraping job type - WebJob JobType = "web-scraper" + WebJob JobType = "web" // Telemetry job type for worker monitoring and stats TelemetryJob JobType = "telemetry" // TikTok transcription job type - TiktokJob JobType = "tiktok-transcription" + TiktokJob JobType = "tiktok" // Twitter job types - TwitterJob JobType = "twitter-scraper" // General Twitter scraping (uses best available auth) - TwitterCredentialJob JobType = "twitter-credential-scraper" // Twitter scraping with credentials - TwitterApiJob JobType = "twitter-api-scraper" // Twitter scraping with API keys + TwitterJob JobType = "twitter" // General Twitter scraping (uses best available auth) + TwitterCredentialJob JobType = "twitter-credential" // Twitter scraping with credentials + TwitterApiJob JobType = "twitter-api" // Twitter scraping with API keys - // Unknown/invalid job type - UnknownJob JobType = "" ) // String returns the string representation of the JobType From ab14dc77b8c839fb12a5177e7f2e52af0e504f57 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 24 Jul 2025 01:59:17 +0200 Subject: [PATCH 018/136] feat: bring in sub types --- types/jobs.go | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/types/jobs.go b/types/jobs.go index 
c749752..80bac75 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -23,6 +23,74 @@ const ( ) +// Capability constants - typed to prevent typos and enable discoverability +const ( + // Web scraping capabilities + CapWebScraper Capability = "web-scraper" + + // Telemetry capabilities + CapTelemetry Capability = "telemetry" + + // TikTok capabilities + CapTiktokTranscription Capability = "tiktok-transcription" + + // Twitter capabilities + CapSearchByQuery Capability = "searchbyquery" + CapSearchByFullArchive Capability = "searchbyfullarchive" + CapSearchByProfile Capability = "searchbyprofile" + CapGetById Capability = "getbyid" + CapGetReplies Capability = "getreplies" + CapGetRetweeters Capability = "getretweeters" + CapGetTweets Capability = "gettweets" + CapGetMedia Capability = "getmedia" + CapGetHomeTweets Capability = "gethometweets" + CapGetForYouTweets Capability = "getforyoutweets" + CapGetProfileById Capability = "getprofilebyid" + CapGetTrends Capability = "gettrends" + CapGetFollowing Capability = "getfollowing" + CapGetFollowers Capability = "getfollowers" + CapGetSpace Capability = "getspace" +) + +// Capability group constants for easy reuse +var ( + // AlwaysAvailableWebCaps are web capabilities always available + AlwaysAvailableWebCaps = []Capability{CapWebScraper} + + // AlwaysAvailableTelemetryCaps are telemetry capabilities always available + AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry} + + // AlwaysAvailableTiktokCaps are TikTok capabilities always available + AlwaysAvailableTiktokCaps = []Capability{CapTiktokTranscription} + + // TwitterAllCaps are all Twitter capabilities available with credential-based auth + TwitterAllCaps = []Capability{ + CapSearchByQuery, CapSearchByFullArchive, CapSearchByProfile, + CapGetById, CapGetReplies, CapGetRetweeters, CapGetTweets, CapGetMedia, + CapGetHomeTweets, CapGetForYouTweets, CapGetProfileById, + CapGetTrends, CapGetFollowing, CapGetFollowers, CapGetSpace, + } + + // TwitterAPICaps are basic Twitter capabilities available with API keys + TwitterAPICaps = []Capability{CapSearchByQuery, CapGetById, CapGetProfileById} + + // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration + AlwaysAvailableCapabilities = WorkerCapabilities{ + { + JobType: WebJob.String(), + Capabilities: AlwaysAvailableWebCaps, + }, + { + JobType: TelemetryJob.String(), + Capabilities: AlwaysAvailableTelemetryCaps, + }, + { + JobType: TiktokJob.String(), + Capabilities: AlwaysAvailableTiktokCaps, + }, + } +) + // String returns the string representation of the JobType func (j JobType) String() string { return string(j) From a01055aec5ebb718292031a10b361f3bb1a72759 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 25 Jul 2025 02:09:23 +0200 Subject: [PATCH 019/136] fix: use jobtype instead of string --- types/jobs.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/types/jobs.go b/types/jobs.go index 80bac75..043c64b 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -77,15 +77,15 @@ var ( // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ { - JobType: WebJob.String(), + JobType: WebJob, Capabilities: AlwaysAvailableWebCaps, }, { - JobType: TelemetryJob.String(), + JobType: TelemetryJob, Capabilities: AlwaysAvailableTelemetryCaps, }, { - JobType: TiktokJob.String(), + JobType: TiktokJob, Capabilities: AlwaysAvailableTiktokCaps, }, } @@ -98,7 +98,7 @@ 
func (j JobType) String() string { // JobCapability represents the capabilities of a specific job type type JobCapability struct { - JobType string `json:"job_type"` + JobType JobType `json:"job_type"` Capabilities []Capability `json:"capabilities"` } From c2efbcaacf7435aaa71e8cf62b96ea3b1059df66 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 25 Jul 2025 20:45:34 +0200 Subject: [PATCH 020/136] fix: remove searchbyfullarchive subtype from cred based capabilities --- types/jobs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/types/jobs.go b/types/jobs.go index 043c64b..a59b9b3 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -65,7 +65,7 @@ var ( // TwitterAllCaps are all Twitter capabilities available with credential-based auth TwitterAllCaps = []Capability{ - CapSearchByQuery, CapSearchByFullArchive, CapSearchByProfile, + CapSearchByQuery, CapSearchByProfile, CapGetById, CapGetReplies, CapGetRetweeters, CapGetTweets, CapGetMedia, CapGetHomeTweets, CapGetForYouTweets, CapGetProfileById, CapGetTrends, CapGetFollowing, CapGetFollowers, CapGetSpace, From 9f46715df681027468c3c768b69ab3d3ce5d765b Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 28 Jul 2025 20:54:09 +0200 Subject: [PATCH 021/136] feat: refactored jobs to jobtype as key --- types/jobs.go | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/types/jobs.go b/types/jobs.go index a59b9b3..29eceff 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -26,13 +26,13 @@ const ( // Capability constants - typed to prevent typos and enable discoverability const ( // Web scraping capabilities - CapWebScraper Capability = "web-scraper" + CapWebScraper Capability = "scraper" // Telemetry capabilities CapTelemetry Capability = "telemetry" // TikTok capabilities - CapTiktokTranscription Capability = "tiktok-transcription" + CapTiktokTranscription Capability = "transcription" // Twitter capabilities CapSearchByQuery Capability = "searchbyquery" @@ -63,8 +63,8 @@ var ( // AlwaysAvailableTiktokCaps are TikTok capabilities always available AlwaysAvailableTiktokCaps = []Capability{CapTiktokTranscription} - // TwitterAllCaps are all Twitter capabilities available with credential-based auth - TwitterAllCaps = []Capability{ + // TwitterCredentialCaps are all Twitter capabilities available with credential-based auth + TwitterCredentialCaps = []Capability{ CapSearchByQuery, CapSearchByProfile, CapGetById, CapGetReplies, CapGetRetweeters, CapGetTweets, CapGetMedia, CapGetHomeTweets, CapGetForYouTweets, CapGetProfileById, @@ -76,18 +76,9 @@ var ( // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ - { - JobType: WebJob, - Capabilities: AlwaysAvailableWebCaps, - }, - { - JobType: TelemetryJob, - Capabilities: AlwaysAvailableTelemetryCaps, - }, - { - JobType: TiktokJob, - Capabilities: AlwaysAvailableTiktokCaps, - }, + WebJob: AlwaysAvailableWebCaps, + TelemetryJob: AlwaysAvailableTelemetryCaps, + TiktokJob: AlwaysAvailableTiktokCaps, } ) @@ -96,11 +87,6 @@ func (j JobType) String() string { return string(j) } -// JobCapability represents the capabilities of a specific job type -type JobCapability struct { - JobType JobType `json:"job_type"` - Capabilities []Capability `json:"capabilities"` -} - // WorkerCapabilities represents all capabilities available on a worker -type WorkerCapabilities []JobCapability +// Maps JobType to the list of capabilities available for that job 
type +type WorkerCapabilities map[JobType][]Capability From 51f498814bcf87ed4460e9bbb4e28ef7149b02e8 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 29 Jul 2025 04:55:23 +0200 Subject: [PATCH 022/136] chore: adds profile type and apify caps --- types/jobs.go | 4 +++ types/twitter.go | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/types/jobs.go b/types/jobs.go index 29eceff..b746dbf 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -20,6 +20,7 @@ const ( TwitterJob JobType = "twitter" // General Twitter scraping (uses best available auth) TwitterCredentialJob JobType = "twitter-credential" // Twitter scraping with credentials TwitterApiJob JobType = "twitter-api" // Twitter scraping with API keys + TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify ) @@ -74,6 +75,9 @@ var ( // TwitterAPICaps are basic Twitter capabilities available with API keys TwitterAPICaps = []Capability{CapSearchByQuery, CapGetById, CapGetProfileById} + // TwitterApifyCaps are Twitter capabilities available with Apify + TwitterApifyCaps = []Capability{CapGetFollowers, CapGetFollowing} + // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ WebJob: AlwaysAvailableWebCaps, diff --git a/types/twitter.go b/types/twitter.go index 9e8c563..74b812b 100644 --- a/types/twitter.go +++ b/types/twitter.go @@ -71,3 +71,76 @@ type Video struct { URL string HLSURL string } + +type ProfileResultApify struct { + ID int64 `json:"id"` + IDStr string `json:"id_str"` + Name string `json:"name"` + ScreenName string `json:"screen_name"` + Location string `json:"location"` + Description string `json:"description"` + URL *string `json:"url"` + Entities ProfileEntities `json:"entities"` + Protected bool `json:"protected"` + FollowersCount int `json:"followers_count"` + FastFollowersCount int `json:"fast_followers_count"` + NormalFollowersCount int `json:"normal_followers_count"` + FriendsCount int `json:"friends_count"` + ListedCount int `json:"listed_count"` + CreatedAt string `json:"created_at"` + FavouritesCount int `json:"favourites_count"` + UTCOffset *int `json:"utc_offset"` + TimeZone *string `json:"time_zone"` + GeoEnabled bool `json:"geo_enabled"` + Verified bool `json:"verified"` + StatusesCount int `json:"statuses_count"` + MediaCount int `json:"media_count"` + Lang *string `json:"lang"` + ContributorsEnabled bool `json:"contributors_enabled"` + IsTranslator bool `json:"is_translator"` + IsTranslationEnabled bool `json:"is_translation_enabled"` + ProfileBackgroundColor string `json:"profile_background_color"` + ProfileBackgroundImageURL *string `json:"profile_background_image_url"` + ProfileBackgroundImageURLHTTPS *string `json:"profile_background_image_url_https"` + ProfileBackgroundTile bool `json:"profile_background_tile"` + ProfileImageURL string `json:"profile_image_url"` + ProfileImageURLHTTPS string `json:"profile_image_url_https"` + ProfileLinkColor string `json:"profile_link_color"` + ProfileSidebarBorderColor string `json:"profile_sidebar_border_color"` + ProfileSidebarFillColor string `json:"profile_sidebar_fill_color"` + ProfileTextColor string `json:"profile_text_color"` + ProfileUseBackgroundImage bool `json:"profile_use_background_image"` + HasExtendedProfile bool `json:"has_extended_profile"` + DefaultProfile bool `json:"default_profile"` + DefaultProfileImage bool `json:"default_profile_image"` + PinnedTweetIDs []int64 
`json:"pinned_tweet_ids"` + PinnedTweetIDsStr []string `json:"pinned_tweet_ids_str"` + HasCustomTimelines bool `json:"has_custom_timelines"` + CanMediaTag bool `json:"can_media_tag"` + FollowedBy bool `json:"followed_by"` + Following bool `json:"following"` + LiveFollowing bool `json:"live_following"` + FollowRequestSent bool `json:"follow_request_sent"` + Notifications bool `json:"notifications"` + Muting bool `json:"muting"` + Blocking bool `json:"blocking"` + BlockedBy bool `json:"blocked_by"` + AdvertiserAccountType string `json:"advertiser_account_type"` + AdvertiserAccountServiceLevels []string `json:"advertiser_account_service_levels"` + AnalyticsType string `json:"analytics_type"` + BusinessProfileState string `json:"business_profile_state"` + TranslatorType string `json:"translator_type"` + WithheldInCountries []string `json:"withheld_in_countries"` + RequireSomeConsent bool `json:"require_some_consent"` + Type string `json:"type"` + TargetUsername string `json:"target_username"` + Email *string `json:"email"` +} + +type ProfileEntities struct { + Description DescriptionEntities `json:"description"` +} + +type DescriptionEntities struct { + URLs []any `json:"urls"` +} From 3bf32fc7050b10a0b4c50de4e99cd3a03bda9bc4 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 30 Jul 2025 19:17:53 +0200 Subject: [PATCH 023/136] feat: add unmarshalling for arguments per job type --- args/linkedin.go | 51 +++++++++++++++ args/tiktok.go | 101 +++++++++++++++++++++++++++++- args/twitter.go | 77 +++++++++++++++++++++++ args/unmarshaller.go | 145 +++++++++++++++++++++++++++++++++++++++++++ args/web.go | 79 +++++++++++++++++++++++ 5 files changed, 452 insertions(+), 1 deletion(-) create mode 100644 args/unmarshaller.go diff --git a/args/linkedin.go b/args/linkedin.go index bd94dbd..e32ba03 100644 --- a/args/linkedin.go +++ b/args/linkedin.go @@ -1,5 +1,10 @@ package args +import ( + "encoding/json" + "fmt" +) + // LinkedInArguments defines args for LinkedIn operations type LinkedInArguments struct { QueryType string `json:"type"` // "searchbyquery", "getprofile" @@ -10,6 +15,52 @@ type LinkedInArguments struct { Start int `json:"start"` // Pagination start offset } +// UnmarshalJSON implements custom JSON unmarshaling with validation +func (l *LinkedInArguments) UnmarshalJSON(data []byte) error { + type Alias LinkedInArguments + aux := &struct { + *Alias + }{ + Alias: (*Alias)(l), + } + + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("failed to unmarshal LinkedIn arguments: %w", err) + } + + return l.Validate() +} + +// Validate validates the LinkedIn arguments +func (l *LinkedInArguments) Validate() error { + if l.QueryType == "" { + return fmt.Errorf("type is required") + } + + // Validate query type + validTypes := map[string]bool{ + "searchbyquery": true, + "getprofile": true, + } + if !validTypes[l.QueryType] { + return fmt.Errorf("invalid type: %s, must be one of: searchbyquery, getprofile", l.QueryType) + } + + if l.Query == "" { + return fmt.Errorf("query is required") + } + + if l.MaxResults < 0 { + return fmt.Errorf("max_results must be non-negative, got: %d", l.MaxResults) + } + + if l.Start < 0 { + return fmt.Errorf("start must be non-negative, got: %d", l.Start) + } + + return nil +} + // LinkedInSearchArguments is an alias for LinkedInArguments for backward compatibility. // Deprecated: use LinkedInArguments instead. 
type LinkedInSearchArguments = LinkedInArguments diff --git a/args/tiktok.go b/args/tiktok.go index 9a082d9..29aa59c 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -1,7 +1,106 @@ package args +import ( + "encoding/json" + "fmt" + "net/url" + "strings" + + teetypes "github.com/masa-finance/tee-types/types" +) + // TikTokTranscriptionArguments defines args for TikTok transcriptions type TikTokTranscriptionArguments struct { VideoURL string `json:"video_url"` Language string `json:"language,omitempty"` // e.g., "eng-US" -} \ No newline at end of file +} + +// UnmarshalJSON implements custom JSON unmarshaling with validation +func (t *TikTokTranscriptionArguments) UnmarshalJSON(data []byte) error { + type Alias TikTokTranscriptionArguments + aux := &struct { + *Alias + }{ + Alias: (*Alias)(t), + } + + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) + } + + // Normalize language to lowercase if provided + if t.Language != "" { + t.Language = strings.ToLower(t.Language) + } + + return t.Validate() +} + +// Validate validates the TikTok arguments +func (t *TikTokTranscriptionArguments) Validate() error { + if t.VideoURL == "" { + return fmt.Errorf("video_url is required") + } + + // Validate URL format + parsedURL, err := url.Parse(t.VideoURL) + if err != nil { + return fmt.Errorf("invalid video_url format: %w", err) + } + + // Basic TikTok URL validation + if !t.IsTikTokURL(parsedURL) { + return fmt.Errorf("URL must be a valid TikTok video URL") + } + + // Validate language format if provided + if t.Language != "" { + if err := t.validateLanguageCode(); err != nil { + return err + } + } + + return nil +} + +// GetCapability returns the capability for TikTok operations (always transcription) +func (t *TikTokTranscriptionArguments) GetCapability() teetypes.Capability { + return teetypes.CapTiktokTranscription +} + +// IsTikTokURL validates if the URL is a TikTok URL +func (t *TikTokTranscriptionArguments) IsTikTokURL(parsedURL *url.URL) bool { + host := strings.ToLower(parsedURL.Host) + return host == "tiktok.com" || + host == "www.tiktok.com" || + host == "vm.tiktok.com" || + strings.HasSuffix(host, ".tiktok.com") +} + +// HasLanguagePreference returns true if a language preference is specified +func (t *TikTokTranscriptionArguments) HasLanguagePreference() bool { + return t.Language != "" +} + +// GetLanguageCode returns the language code, defaulting to "en-us" if not specified +func (t *TikTokTranscriptionArguments) GetLanguageCode() string { + if t.Language == "" { + return "en-us" + } + return t.Language +} + +// validateLanguageCode validates the language code format +func (t *TikTokTranscriptionArguments) validateLanguageCode() error { + // Basic validation for language codes like "en-us", "es-es", etc. 
+ parts := strings.Split(t.Language, "-") + if len(parts) != 2 { + return fmt.Errorf("invalid language format '%s', expected format: 'lang-region' (e.g., 'en-us')", t.Language) + } + + if len(parts[0]) != 2 || len(parts[1]) != 2 { + return fmt.Errorf("invalid language format '%s', expected 2-letter language and region codes", t.Language) + } + + return nil +} diff --git a/args/twitter.go b/args/twitter.go index 7fc4a9b..7bb22dc 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -1,5 +1,13 @@ package args +import ( + "encoding/json" + "fmt" + "strings" + + teetypes "github.com/masa-finance/tee-types/types" +) + // TwitterSearchArguments defines args for Twitter searches type TwitterSearchArguments struct { QueryType string `json:"type"` // Optional, type of search @@ -10,3 +18,72 @@ type TwitterSearchArguments struct { MaxResults int `json:"max_results"` // Optional, max number of results NextCursor string `json:"next_cursor"` } + +// UnmarshalJSON implements custom JSON unmarshaling with validation +func (t *TwitterSearchArguments) UnmarshalJSON(data []byte) error { + type Alias TwitterSearchArguments + aux := &struct { + *Alias + }{ + Alias: (*Alias)(t), + } + + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("failed to unmarshal Twitter arguments: %w", err) + } + + // Normalize QueryType to lowercase + if t.QueryType != "" { + t.QueryType = strings.ToLower(t.QueryType) + } + + return t.Validate() +} + +// Validate validates the Twitter arguments (general validation) +func (t *TwitterSearchArguments) Validate() error { + if t.Query == "" { + return fmt.Errorf("query is required") + } + + if t.Count < 0 { + return fmt.Errorf("count must be non-negative, got: %d", t.Count) + } + + if t.MaxResults < 0 { + return fmt.Errorf("max_results must be non-negative, got: %d", t.MaxResults) + } + + return nil +} + +// ValidateForJobType validates Twitter arguments for a specific job type +func (t *TwitterSearchArguments) ValidateForJobType(jobType teetypes.JobType) error { + if err := t.Validate(); err != nil { + return err + } + + // Validate QueryType against job-specific capabilities + return ValidateCapabilityForJobType(jobType, t.QueryType) +} + +// GetCapability returns the QueryType as a typed Capability +func (t *TwitterSearchArguments) GetCapability() teetypes.Capability { + return teetypes.Capability(t.QueryType) +} + +// IsNonTweetOperation returns true if the QueryType represents a non-tweet operation +// This replaces the manual string checking from the TODO comment +// NO STRING CASTING - uses capability constants directly +func (t *TwitterSearchArguments) IsNonTweetOperation() bool { + capability := t.GetCapability() + + return capability == teetypes.CapSearchByProfile || + capability == teetypes.CapGetRetweeters || + capability == teetypes.CapGetProfileById || + capability == teetypes.CapGetById || + capability == teetypes.CapGetSpace || + capability == teetypes.CapGetTrends || + capability == teetypes.CapGetFollowing || + capability == teetypes.CapGetFollowers +} diff --git a/args/unmarshaller.go b/args/unmarshaller.go new file mode 100644 index 0000000..21daed7 --- /dev/null +++ b/args/unmarshaller.go @@ -0,0 +1,145 @@ +package args + +import ( + "encoding/json" + "fmt" + + "github.com/masa-finance/tee-types/types" +) + +// JobArgumentsInterface defines the interface that all job arguments must implement +type JobArgumentsInterface interface { + Validate() error + GetCapability() types.Capability +} + +// TwitterJobArgumentsInterface extends 
JobArgumentsInterface for Twitter-specific methods +type TwitterJobArgumentsInterface interface { + JobArgumentsInterface + ValidateForJobType(jobType types.JobType) error + IsNonTweetOperation() bool +} + +// WebJobArgumentsInterface extends JobArgumentsInterface for Web-specific methods +type WebJobArgumentsInterface interface { + JobArgumentsInterface + IsDeepScrape() bool + HasSelector() bool + GetEffectiveMaxDepth() int +} + +// TikTokJobArgumentsInterface extends JobArgumentsInterface for TikTok-specific methods +type TikTokJobArgumentsInterface interface { + JobArgumentsInterface + HasLanguagePreference() bool + GetLanguageCode() string +} + +// UnmarshalJobArguments unmarshals job arguments from a generic map into the appropriate typed struct +// This works with both tee-indexer and tee-worker JobArguments types +func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgumentsInterface, error) { + switch jobType { + case types.WebJob: + return unmarshalWebArguments(args) + + case types.TiktokJob: + return unmarshalTikTokArguments(args) + + case types.TwitterJob, types.TwitterCredentialJob, types.TwitterApiJob, types.TwitterApifyJob: + return unmarshalTwitterArguments(jobType, args) + + case types.TelemetryJob: + return &TelemetryJobArguments{}, nil + + default: + return nil, fmt.Errorf("unknown job type: %s", jobType) + } +} + +// Helper functions for unmarshaling specific argument types +func unmarshalWebArguments(args map[string]any) (*WebSearchArguments, error) { + webArgs := &WebSearchArguments{} + if err := unmarshalToStruct(args, webArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal web job arguments: %w", err) + } + return webArgs, nil +} + +func unmarshalTikTokArguments(args map[string]any) (*TikTokTranscriptionArguments, error) { + tiktokArgs := &TikTokTranscriptionArguments{} + if err := unmarshalToStruct(args, tiktokArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal TikTok job arguments: %w", err) + } + return tiktokArgs, nil +} + +func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*TwitterSearchArguments, error) { + twitterArgs := &TwitterSearchArguments{} + if err := unmarshalToStruct(args, twitterArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal Twitter job arguments: %w", err) + } + + // Perform job-type-specific validation for Twitter + if err := twitterArgs.ValidateForJobType(jobType); err != nil { + return nil, fmt.Errorf("Twitter job validation failed: %w", err) + } + + return twitterArgs, nil +} + +// unmarshalToStruct converts a map[string]any to a struct using JSON marshal/unmarshal +// This provides the same functionality as the existing JobArguments.Unmarshal methods +func unmarshalToStruct(args map[string]any, target any) error { + // Use JSON marshal/unmarshal for conversion - this triggers our custom UnmarshalJSON methods + data, err := json.Marshal(args) + if err != nil { + return fmt.Errorf("failed to marshal arguments: %w", err) + } + + if err := json.Unmarshal(data, target); err != nil { + return fmt.Errorf("failed to unmarshal arguments: %w", err) + } + + return nil +} + +// TelemetryJobArguments for telemetry jobs (simple case) +type TelemetryJobArguments struct{} + +func (t *TelemetryJobArguments) Validate() error { + return nil +} + +func (t *TelemetryJobArguments) GetCapability() types.Capability { + return types.CapTelemetry +} + +// Type assertion helpers +func AsWebArguments(args JobArgumentsInterface) (WebJobArgumentsInterface, bool) { + webArgs, ok := 
args.(*WebSearchArguments) + if !ok { + return nil, false + } + return webArgs, true +} + +func AsTwitterArguments(args JobArgumentsInterface) (TwitterJobArgumentsInterface, bool) { + twitterArgs, ok := args.(*TwitterSearchArguments) + if !ok { + return nil, false + } + return twitterArgs, true +} + +func AsTikTokArguments(args JobArgumentsInterface) (TikTokJobArgumentsInterface, bool) { + tiktokArgs, ok := args.(*TikTokTranscriptionArguments) + if !ok { + return nil, false + } + return tiktokArgs, true +} + +func AsTelemetryArguments(args JobArgumentsInterface) (*TelemetryJobArguments, bool) { + telemetryArgs, ok := args.(*TelemetryJobArguments) + return telemetryArgs, ok +} diff --git a/args/web.go b/args/web.go index 66e2093..8d8782b 100644 --- a/args/web.go +++ b/args/web.go @@ -1,8 +1,87 @@ package args +import ( + "encoding/json" + "fmt" + "net/url" + + teetypes "github.com/masa-finance/tee-types/types" +) + type WebSearchArguments struct { URL string `json:"url"` Selector string `json:"selector"` Depth int `json:"depth"` MaxDepth int `json:"max_depth"` } + +// UnmarshalJSON implements custom JSON unmarshaling with validation +func (w *WebSearchArguments) UnmarshalJSON(data []byte) error { + type Alias WebSearchArguments + aux := &struct { + *Alias + }{ + Alias: (*Alias)(w), + } + + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("failed to unmarshal Web arguments: %w", err) + } + + return w.Validate() +} + +// Validate validates the Web arguments +func (w *WebSearchArguments) Validate() error { + if w.URL == "" { + return fmt.Errorf("url is required") + } + + // Validate URL format + parsedURL, err := url.Parse(w.URL) + if err != nil { + return fmt.Errorf("invalid URL format: %w", err) + } + + // Ensure URL has a scheme + if parsedURL.Scheme == "" { + return fmt.Errorf("URL must include a scheme (http:// or https://)") + } + + if w.MaxDepth < 0 { + return fmt.Errorf("max_depth must be non-negative, got: %d", w.MaxDepth) + } + + if w.Depth < 0 { + return fmt.Errorf("depth must be non-negative, got: %d", w.Depth) + } + + if w.Depth > w.MaxDepth && w.MaxDepth > 0 { + return fmt.Errorf("depth (%d) cannot exceed max_depth (%d)", w.Depth, w.MaxDepth) + } + + return nil +} + +// GetCapability returns the capability for web operations (always scraper) +func (w *WebSearchArguments) GetCapability() teetypes.Capability { + return teetypes.CapWebScraper +} + +// IsDeepScrape returns true if this is a deep scraping operation +func (w *WebSearchArguments) IsDeepScrape() bool { + return w.MaxDepth > 1 || w.Depth > 0 +} + +// HasSelector returns true if a CSS selector is specified +func (w *WebSearchArguments) HasSelector() bool { + return w.Selector != "" +} + +// GetEffectiveMaxDepth returns the effective maximum depth for scraping +func (w *WebSearchArguments) GetEffectiveMaxDepth() int { + if w.MaxDepth <= 0 { + return 1 // Default to single page + } + return w.MaxDepth +} From 38a06be1773c2b014cf87a9ed4c3035bc6c2d2d9 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 30 Jul 2025 20:33:32 +0200 Subject: [PATCH 024/136] fix: better type validation --- args/linkedin.go | 23 ++++++++++++++ args/tiktok.go | 10 ++++++ args/twitter.go | 2 +- args/unmarshaller.go | 49 ++++++++++++++++++++++++----- args/validation.go | 75 ++++++++++++++++++++++++++++++++++++++++++++ args/web.go | 10 ++++++ types/jobs.go | 9 ++++++ 7 files changed, 169 insertions(+), 9 deletions(-) create mode 100644 args/validation.go diff --git a/args/linkedin.go b/args/linkedin.go index e32ba03..dcdad5f 
100644 --- a/args/linkedin.go +++ b/args/linkedin.go @@ -3,6 +3,9 @@ package args import ( "encoding/json" "fmt" + "strings" + + teetypes "github.com/masa-finance/tee-types/types" ) // LinkedInArguments defines args for LinkedIn operations @@ -28,6 +31,11 @@ func (l *LinkedInArguments) UnmarshalJSON(data []byte) error { return fmt.Errorf("failed to unmarshal LinkedIn arguments: %w", err) } + // Normalize QueryType to lowercase + if l.QueryType != "" { + l.QueryType = strings.ToLower(l.QueryType) + } + return l.Validate() } @@ -61,6 +69,21 @@ func (l *LinkedInArguments) Validate() error { return nil } +// ValidateForJobType validates LinkedIn arguments for a specific job type +func (l *LinkedInArguments) ValidateForJobType(jobType teetypes.JobType) error { + if err := l.Validate(); err != nil { + return err + } + + // Validate QueryType against job-specific capabilities + return ValidateCapabilityForJobType(jobType, teetypes.Capability(l.QueryType)) +} + +// GetCapability returns the QueryType as a typed Capability +func (l *LinkedInArguments) GetCapability() teetypes.Capability { + return teetypes.Capability(l.QueryType) +} + // LinkedInSearchArguments is an alias for LinkedInArguments for backward compatibility. // Deprecated: use LinkedInArguments instead. type LinkedInSearchArguments = LinkedInArguments diff --git a/args/tiktok.go b/args/tiktok.go index 29aa59c..82bd79f 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -90,6 +90,16 @@ func (t *TikTokTranscriptionArguments) GetLanguageCode() string { return t.Language } +// ValidateForJobType validates TikTok arguments for a specific job type +func (t *TikTokTranscriptionArguments) ValidateForJobType(jobType teetypes.JobType) error { + if err := t.Validate(); err != nil { + return err + } + + // Validate capability against job-specific capabilities + return ValidateCapabilityForJobType(jobType, t.GetCapability()) +} + // validateLanguageCode validates the language code format func (t *TikTokTranscriptionArguments) validateLanguageCode() error { // Basic validation for language codes like "en-us", "es-es", etc. 
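With these hooks in place, TikTok arguments are validated at decode time. A minimal sketch of the round trip (the video URL is a made-up example):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/masa-finance/tee-types/args"
)

func main() {
	// UnmarshalJSON calls Validate, so a non-TikTok host or a malformed
	// language code is rejected here rather than deep inside a worker.
	raw := []byte(`{"video_url":"https://www.tiktok.com/@user/video/123","language":"EN-US"}`)

	var a args.TikTokTranscriptionArguments
	if err := json.Unmarshal(raw, &a); err != nil {
		fmt.Println("rejected:", err)
		return
	}

	// Language is normalized to lowercase during decoding.
	fmt.Println(a.GetLanguageCode()) // prints "en-us"
}
```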
diff --git a/args/twitter.go b/args/twitter.go index 7bb22dc..1e4bbec 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -64,7 +64,7 @@ func (t *TwitterSearchArguments) ValidateForJobType(jobType teetypes.JobType) er } // Validate QueryType against job-specific capabilities - return ValidateCapabilityForJobType(jobType, t.QueryType) + return ValidateCapabilityForJobType(jobType, teetypes.Capability(t.QueryType)) } // GetCapability returns the QueryType as a typed Capability diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 21daed7..4bfd4c7 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -23,6 +23,7 @@ type TwitterJobArgumentsInterface interface { // WebJobArgumentsInterface extends JobArgumentsInterface for Web-specific methods type WebJobArgumentsInterface interface { JobArgumentsInterface + ValidateForJobType(jobType types.JobType) error IsDeepScrape() bool HasSelector() bool GetEffectiveMaxDepth() int @@ -31,26 +32,36 @@ type WebJobArgumentsInterface interface { // TikTokJobArgumentsInterface extends JobArgumentsInterface for TikTok-specific methods type TikTokJobArgumentsInterface interface { JobArgumentsInterface + ValidateForJobType(jobType types.JobType) error HasLanguagePreference() bool GetLanguageCode() string } +// LinkedInJobArgumentsInterface extends JobArgumentsInterface for LinkedIn-specific methods +type LinkedInJobArgumentsInterface interface { + JobArgumentsInterface + ValidateForJobType(jobType types.JobType) error +} + // UnmarshalJobArguments unmarshals job arguments from a generic map into the appropriate typed struct // This works with both tee-indexer and tee-worker JobArguments types func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgumentsInterface, error) { switch jobType { case types.WebJob: return unmarshalWebArguments(args) - + case types.TiktokJob: return unmarshalTikTokArguments(args) - + case types.TwitterJob, types.TwitterCredentialJob, types.TwitterApiJob, types.TwitterApifyJob: return unmarshalTwitterArguments(jobType, args) - + + case types.LinkedInJob: + return unmarshalLinkedInArguments(jobType, args) + case types.TelemetryJob: return &TelemetryJobArguments{}, nil - + default: return nil, fmt.Errorf("unknown job type: %s", jobType) } @@ -78,15 +89,29 @@ func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*Twi if err := unmarshalToStruct(args, twitterArgs); err != nil { return nil, fmt.Errorf("failed to unmarshal Twitter job arguments: %w", err) } - + // Perform job-type-specific validation for Twitter if err := twitterArgs.ValidateForJobType(jobType); err != nil { return nil, fmt.Errorf("Twitter job validation failed: %w", err) } - + return twitterArgs, nil } +func unmarshalLinkedInArguments(jobType types.JobType, args map[string]any) (*LinkedInArguments, error) { + linkedInArgs := &LinkedInArguments{} + if err := unmarshalToStruct(args, linkedInArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal LinkedIn job arguments: %w", err) + } + + // Perform job-type-specific validation for LinkedIn + if err := linkedInArgs.ValidateForJobType(jobType); err != nil { + return nil, fmt.Errorf("LinkedIn job validation failed: %w", err) + } + + return linkedInArgs, nil +} + // unmarshalToStruct converts a map[string]any to a struct using JSON marshal/unmarshal // This provides the same functionality as the existing JobArguments.Unmarshal methods func unmarshalToStruct(args map[string]any, target any) error { @@ -95,11 +120,11 @@ func unmarshalToStruct(args 
map[string]any, target any) error { if err != nil { return fmt.Errorf("failed to marshal arguments: %w", err) } - + if err := json.Unmarshal(data, target); err != nil { return fmt.Errorf("failed to unmarshal arguments: %w", err) } - + return nil } @@ -143,3 +168,11 @@ func AsTelemetryArguments(args JobArgumentsInterface) (*TelemetryJobArguments, b telemetryArgs, ok := args.(*TelemetryJobArguments) return telemetryArgs, ok } + +func AsLinkedInArguments(args JobArgumentsInterface) (LinkedInJobArgumentsInterface, bool) { + linkedInArgs, ok := args.(*LinkedInArguments) + if !ok { + return nil, false + } + return linkedInArgs, true +} diff --git a/args/validation.go b/args/validation.go new file mode 100644 index 0000000..6d44d3a --- /dev/null +++ b/args/validation.go @@ -0,0 +1,75 @@ +package args + +import ( + "fmt" + "slices" + + teetypes "github.com/masa-finance/tee-types/types" +) + +// jobCapabilityMap defines which capabilities are valid for each job type +var jobCapabilityMap = map[teetypes.JobType][]teetypes.Capability{ + // Twitter job types and their valid capabilities + teetypes.TwitterJob: append(append(append( + teetypes.TwitterCredentialCaps, + teetypes.TwitterAPICaps...), + teetypes.TwitterApifyCaps...), + teetypes.CapSearchByFullArchive, // Elevated API capability + ), + teetypes.TwitterCredentialJob: teetypes.TwitterCredentialCaps, + teetypes.TwitterApiJob: append( + teetypes.TwitterAPICaps, + teetypes.CapSearchByFullArchive, // Elevated API capability + ), + teetypes.TwitterApifyJob: teetypes.TwitterApifyCaps, + + // Web job capabilities + teetypes.WebJob: teetypes.AlwaysAvailableWebCaps, + + // TikTok job capabilities + teetypes.TiktokJob: teetypes.AlwaysAvailableTiktokCaps, + + // Telemetry job capabilities + teetypes.TelemetryJob: teetypes.AlwaysAvailableTelemetryCaps, + + // LinkedIn job capabilities + teetypes.LinkedInJob: teetypes.LinkedInCaps, +} + +// ValidateCapabilityForJobType validates that a capability is supported for the given job type +func ValidateCapabilityForJobType(jobType teetypes.JobType, capability teetypes.Capability) error { + if capability == "" { + // Empty capability is allowed for some job types + return nil + } + + validCaps, exists := jobCapabilityMap[jobType] + if !exists { + return fmt.Errorf("unknown job type: %s", jobType) + } + + if !slices.Contains(validCaps, capability) { + return fmt.Errorf("capability '%s' is not valid for job type '%s'. 
Valid capabilities: %v", + capability, jobType, validCaps) + } + + return nil +} + +// GetValidCapabilitiesForJobType returns all valid capabilities for a given job type +func GetValidCapabilitiesForJobType(jobType teetypes.JobType) ([]teetypes.Capability, error) { + validCaps, exists := jobCapabilityMap[jobType] + if !exists { + return nil, fmt.Errorf("unknown job type: %s", jobType) + } + + // Return a copy to prevent external modification + result := make([]teetypes.Capability, len(validCaps)) + copy(result, validCaps) + return result, nil +} + +// IsCapabilityValidForJobType checks if a capability is valid for a job type without returning an error +func IsCapabilityValidForJobType(jobType teetypes.JobType, capability teetypes.Capability) bool { + return ValidateCapabilityForJobType(jobType, capability) == nil +} diff --git a/args/web.go b/args/web.go index 8d8782b..8318278 100644 --- a/args/web.go +++ b/args/web.go @@ -63,6 +63,16 @@ func (w *WebSearchArguments) Validate() error { return nil } +// ValidateForJobType validates Web arguments for a specific job type +func (w *WebSearchArguments) ValidateForJobType(jobType teetypes.JobType) error { + if err := w.Validate(); err != nil { + return err + } + + // Validate capability against job-specific capabilities + return ValidateCapabilityForJobType(jobType, w.GetCapability()) +} + // GetCapability returns the capability for web operations (always scraper) func (w *WebSearchArguments) GetCapability() teetypes.Capability { return teetypes.CapWebScraper diff --git a/types/jobs.go b/types/jobs.go index b746dbf..dd31f9c 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -22,6 +22,9 @@ const ( TwitterApiJob JobType = "twitter-api" // Twitter scraping with API keys TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify + // LinkedIn job types + LinkedInJob JobType = "linkedin" // LinkedIn scraping and profile operations + ) // Capability constants - typed to prevent typos and enable discoverability @@ -51,6 +54,9 @@ const ( CapGetFollowing Capability = "getfollowing" CapGetFollowers Capability = "getfollowers" CapGetSpace Capability = "getspace" + + // LinkedIn capabilities + CapGetProfile Capability = "getprofile" ) // Capability group constants for easy reuse @@ -78,6 +84,9 @@ var ( // TwitterApifyCaps are Twitter capabilities available with Apify TwitterApifyCaps = []Capability{CapGetFollowers, CapGetFollowing} + // LinkedInCaps are LinkedIn capabilities (basic set for future implementation) + LinkedInCaps = []Capability{CapSearchByQuery, CapGetProfile} + // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ WebJob: AlwaysAvailableWebCaps, From b3b2d471f1560a2be67b4b86b74bfac50dccf65c Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 31 Jul 2025 01:50:02 +0200 Subject: [PATCH 025/136] fix: remove linked in from job types --- types/jobs.go | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/types/jobs.go b/types/jobs.go index dd31f9c..612cc76 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -21,10 +21,6 @@ const ( TwitterCredentialJob JobType = "twitter-credential" // Twitter scraping with credentials TwitterApiJob JobType = "twitter-api" // Twitter scraping with API keys TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify - - // LinkedIn job types - LinkedInJob JobType = "linkedin" // LinkedIn scraping and profile operations - ) // Capability constants - typed to prevent typos and 
enable discoverability @@ -54,9 +50,6 @@ const ( CapGetFollowing Capability = "getfollowing" CapGetFollowers Capability = "getfollowers" CapGetSpace Capability = "getspace" - - // LinkedIn capabilities - CapGetProfile Capability = "getprofile" ) // Capability group constants for easy reuse @@ -84,9 +77,6 @@ var ( // TwitterApifyCaps are Twitter capabilities available with Apify TwitterApifyCaps = []Capability{CapGetFollowers, CapGetFollowing} - // LinkedInCaps are LinkedIn capabilities (basic set for future implementation) - LinkedInCaps = []Capability{CapSearchByQuery, CapGetProfile} - // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ WebJob: AlwaysAvailableWebCaps, From 193438818b2baaa8c9683458f1c2acd1c39f7768 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 31 Jul 2025 01:54:58 +0200 Subject: [PATCH 026/136] chore: cleanup linkedin and twitter --- args/twitter.go | 3 --- args/unmarshaller.go | 3 --- args/validation.go | 3 --- 3 files changed, 9 deletions(-) diff --git a/args/twitter.go b/args/twitter.go index 1e4bbec..2ec253e 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -73,15 +73,12 @@ func (t *TwitterSearchArguments) GetCapability() teetypes.Capability { } // IsNonTweetOperation returns true if the QueryType represents a non-tweet operation -// This replaces the manual string checking from the TODO comment -// NO STRING CASTING - uses capability constants directly func (t *TwitterSearchArguments) IsNonTweetOperation() bool { capability := t.GetCapability() return capability == teetypes.CapSearchByProfile || capability == teetypes.CapGetRetweeters || capability == teetypes.CapGetProfileById || - capability == teetypes.CapGetById || capability == teetypes.CapGetSpace || capability == teetypes.CapGetTrends || capability == teetypes.CapGetFollowing || diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 4bfd4c7..95bf3b7 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -56,9 +56,6 @@ func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgum case types.TwitterJob, types.TwitterCredentialJob, types.TwitterApiJob, types.TwitterApifyJob: return unmarshalTwitterArguments(jobType, args) - case types.LinkedInJob: - return unmarshalLinkedInArguments(jobType, args) - case types.TelemetryJob: return &TelemetryJobArguments{}, nil diff --git a/args/validation.go b/args/validation.go index 6d44d3a..b8b6e6e 100644 --- a/args/validation.go +++ b/args/validation.go @@ -31,9 +31,6 @@ var jobCapabilityMap = map[teetypes.JobType][]teetypes.Capability{ // Telemetry job capabilities teetypes.TelemetryJob: teetypes.AlwaysAvailableTelemetryCaps, - - // LinkedIn job capabilities - teetypes.LinkedInJob: teetypes.LinkedInCaps, } // ValidateCapabilityForJobType validates that a capability is supported for the given job type From e186771bede5d91bec2d8f75508a8773421ba8a4 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 31 Jul 2025 02:06:30 +0200 Subject: [PATCH 027/136] fix: don't require query on twitter --- args/twitter.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/args/twitter.go b/args/twitter.go index 2ec253e..588dacb 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -42,9 +42,7 @@ func (t *TwitterSearchArguments) UnmarshalJSON(data []byte) error { // Validate validates the Twitter arguments (general validation) func (t *TwitterSearchArguments) Validate() error { - if t.Query == "" { - return 
fmt.Errorf("query is required") - } + // note, query is not required for all capabilities if t.Count < 0 { return fmt.Errorf("count must be non-negative, got: %d", t.Count) From a91ba0558b34122a8405fde594becc5a2cae0203 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 31 Jul 2025 02:18:36 +0200 Subject: [PATCH 028/136] fix: adds single tweet operation --- args/twitter.go | 6 ++++++ args/unmarshaller.go | 1 + 2 files changed, 7 insertions(+) diff --git a/args/twitter.go b/args/twitter.go index 588dacb..12af8fd 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -82,3 +82,9 @@ func (t *TwitterSearchArguments) IsNonTweetOperation() bool { capability == teetypes.CapGetFollowing || capability == teetypes.CapGetFollowers } + +// IsSingleTweetOperation returns true if the QueryType represents an operation that returns a single tweet +func (t *TwitterSearchArguments) IsSingleTweetOperation() bool { + capability := t.GetCapability() + return capability == teetypes.CapGetById +} diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 95bf3b7..2fee88d 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -18,6 +18,7 @@ type TwitterJobArgumentsInterface interface { JobArgumentsInterface ValidateForJobType(jobType types.JobType) error IsNonTweetOperation() bool + IsSingleTweetOperation() bool } // WebJobArgumentsInterface extends JobArgumentsInterface for Web-specific methods From 7a109955c256af0780bf318dd56a7c2f19390a8e Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 31 Jul 2025 19:47:06 +0200 Subject: [PATCH 029/136] chore: cleanup arguments unmarshaller and job types --- args/jobs.go | 33 +++++++++++++ args/tiktok.go | 2 +- args/twitter.go | 44 ++++++++++++----- args/unmarshaller.go | 28 ++++++----- args/validation.go | 72 --------------------------- args/web.go | 2 +- types/jobs.go | 114 +++++++++++++++++++++++++++---------------- 7 files changed, 155 insertions(+), 140 deletions(-) create mode 100644 args/jobs.go delete mode 100644 args/validation.go diff --git a/args/jobs.go b/args/jobs.go new file mode 100644 index 0000000..1a6c89f --- /dev/null +++ b/args/jobs.go @@ -0,0 +1,33 @@ +package args + +import ( + "fmt" + "slices" + + teetypes "github.com/masa-finance/tee-types/types" +) + +// ValidateCapabilityForJobType validates that a capability is supported for the given job type +func ValidateCapabilityForJobType(jobType teetypes.JobType, capability teetypes.Capability) error { + if capability == "" { + // Empty capability is allowed for some job types + return nil + } + + validCaps, exists := teetypes.JobCapabilityMap[jobType] + if !exists { + return fmt.Errorf("unknown job type: %s", jobType) + } + + if !slices.Contains(validCaps, capability) { + return fmt.Errorf("capability '%s' is not valid for job type '%s'. 
Valid capabilities: %v", + capability, jobType, validCaps) + } + + return nil +} + +// IsCapabilityValidForJobType checks if a capability is valid for a job type without returning an error +func IsCapabilityValidForJobType(jobType teetypes.JobType, capability teetypes.Capability) bool { + return ValidateCapabilityForJobType(jobType, capability) == nil +} diff --git a/args/tiktok.go b/args/tiktok.go index 82bd79f..cffdbac 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -65,7 +65,7 @@ func (t *TikTokTranscriptionArguments) Validate() error { // GetCapability returns the capability for TikTok operations (always transcription) func (t *TikTokTranscriptionArguments) GetCapability() teetypes.Capability { - return teetypes.CapTiktokTranscription + return teetypes.CapTranscription } // IsTikTokURL validates if the URL is a TikTok URL diff --git a/args/twitter.go b/args/twitter.go index 12af8fd..8fce9a4 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -70,21 +70,41 @@ func (t *TwitterSearchArguments) GetCapability() teetypes.Capability { return teetypes.Capability(t.QueryType) } -// IsNonTweetOperation returns true if the QueryType represents a non-tweet operation -func (t *TwitterSearchArguments) IsNonTweetOperation() bool { +func (t *TwitterSearchArguments) IsSingleTweetOperation() bool { capability := t.GetCapability() + return capability == teetypes.CapGetById +} - return capability == teetypes.CapSearchByProfile || - capability == teetypes.CapGetRetweeters || - capability == teetypes.CapGetProfileById || - capability == teetypes.CapGetSpace || - capability == teetypes.CapGetTrends || - capability == teetypes.CapGetFollowing || - capability == teetypes.CapGetFollowers +func (t *TwitterSearchArguments) IsMultipleTweetOperation() bool { + capability := t.GetCapability() + return capability == teetypes.CapSearchByQuery || + capability == teetypes.CapSearchByFullArchive || + capability == teetypes.CapGetHomeTweets || + capability == teetypes.CapGetForYouTweets || + capability == teetypes.CapGetTweets || + capability == teetypes.CapGetReplies || + capability == teetypes.CapGetMedia } -// IsSingleTweetOperation returns true if the QueryType represents an operation that returns a single tweet -func (t *TwitterSearchArguments) IsSingleTweetOperation() bool { +func (t *TwitterSearchArguments) IsSingleProfileOperation() bool { capability := t.GetCapability() - return capability == teetypes.CapGetById + return capability == teetypes.CapGetProfileById || + capability == teetypes.CapSearchByProfile +} + +func (t *TwitterSearchArguments) IsMultipleProfileOperation() bool { + capability := t.GetCapability() + return capability == teetypes.CapGetFollowing || + capability == teetypes.CapGetFollowers || + capability == teetypes.CapGetRetweeters +} + +func (t *TwitterSearchArguments) IsSingleSpaceOperation() bool { + capability := t.GetCapability() + return capability == teetypes.CapGetSpace +} + +func (t *TwitterSearchArguments) IsTrendsOperation() bool { + capability := t.GetCapability() + return capability == teetypes.CapGetTrends } diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 2fee88d..79e273a 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -17,8 +17,12 @@ type JobArgumentsInterface interface { type TwitterJobArgumentsInterface interface { JobArgumentsInterface ValidateForJobType(jobType types.JobType) error - IsNonTweetOperation() bool IsSingleTweetOperation() bool + IsMultipleTweetOperation() bool + IsSingleProfileOperation() bool + IsMultipleProfileOperation() bool 
+ IsSingleSpaceOperation() bool + IsTrendsOperation() bool } // WebJobArgumentsInterface extends JobArgumentsInterface for Web-specific methods @@ -96,19 +100,19 @@ func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*Twi return twitterArgs, nil } -func unmarshalLinkedInArguments(jobType types.JobType, args map[string]any) (*LinkedInArguments, error) { - linkedInArgs := &LinkedInArguments{} - if err := unmarshalToStruct(args, linkedInArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal LinkedIn job arguments: %w", err) - } +// func unmarshalLinkedInArguments(jobType types.JobType, args map[string]any) (*LinkedInArguments, error) { +// linkedInArgs := &LinkedInArguments{} +// if err := unmarshalToStruct(args, linkedInArgs); err != nil { +// return nil, fmt.Errorf("failed to unmarshal LinkedIn job arguments: %w", err) +// } - // Perform job-type-specific validation for LinkedIn - if err := linkedInArgs.ValidateForJobType(jobType); err != nil { - return nil, fmt.Errorf("LinkedIn job validation failed: %w", err) - } +// // Perform job-type-specific validation for LinkedIn +// if err := linkedInArgs.ValidateForJobType(jobType); err != nil { +// return nil, fmt.Errorf("LinkedIn job validation failed: %w", err) +// } - return linkedInArgs, nil -} +// return linkedInArgs, nil +// } // unmarshalToStruct converts a map[string]any to a struct using JSON marshal/unmarshal // This provides the same functionality as the existing JobArguments.Unmarshal methods diff --git a/args/validation.go b/args/validation.go deleted file mode 100644 index b8b6e6e..0000000 --- a/args/validation.go +++ /dev/null @@ -1,72 +0,0 @@ -package args - -import ( - "fmt" - "slices" - - teetypes "github.com/masa-finance/tee-types/types" -) - -// jobCapabilityMap defines which capabilities are valid for each job type -var jobCapabilityMap = map[teetypes.JobType][]teetypes.Capability{ - // Twitter job types and their valid capabilities - teetypes.TwitterJob: append(append(append( - teetypes.TwitterCredentialCaps, - teetypes.TwitterAPICaps...), - teetypes.TwitterApifyCaps...), - teetypes.CapSearchByFullArchive, // Elevated API capability - ), - teetypes.TwitterCredentialJob: teetypes.TwitterCredentialCaps, - teetypes.TwitterApiJob: append( - teetypes.TwitterAPICaps, - teetypes.CapSearchByFullArchive, // Elevated API capability - ), - teetypes.TwitterApifyJob: teetypes.TwitterApifyCaps, - - // Web job capabilities - teetypes.WebJob: teetypes.AlwaysAvailableWebCaps, - - // TikTok job capabilities - teetypes.TiktokJob: teetypes.AlwaysAvailableTiktokCaps, - - // Telemetry job capabilities - teetypes.TelemetryJob: teetypes.AlwaysAvailableTelemetryCaps, -} - -// ValidateCapabilityForJobType validates that a capability is supported for the given job type -func ValidateCapabilityForJobType(jobType teetypes.JobType, capability teetypes.Capability) error { - if capability == "" { - // Empty capability is allowed for some job types - return nil - } - - validCaps, exists := jobCapabilityMap[jobType] - if !exists { - return fmt.Errorf("unknown job type: %s", jobType) - } - - if !slices.Contains(validCaps, capability) { - return fmt.Errorf("capability '%s' is not valid for job type '%s'. 
Valid capabilities: %v", - capability, jobType, validCaps) - } - - return nil -} - -// GetValidCapabilitiesForJobType returns all valid capabilities for a given job type -func GetValidCapabilitiesForJobType(jobType teetypes.JobType) ([]teetypes.Capability, error) { - validCaps, exists := jobCapabilityMap[jobType] - if !exists { - return nil, fmt.Errorf("unknown job type: %s", jobType) - } - - // Return a copy to prevent external modification - result := make([]teetypes.Capability, len(validCaps)) - copy(result, validCaps) - return result, nil -} - -// IsCapabilityValidForJobType checks if a capability is valid for a job type without returning an error -func IsCapabilityValidForJobType(jobType teetypes.JobType, capability teetypes.Capability) bool { - return ValidateCapabilityForJobType(jobType, capability) == nil -} diff --git a/args/web.go b/args/web.go index 8318278..fea71e0 100644 --- a/args/web.go +++ b/args/web.go @@ -75,7 +75,7 @@ func (w *WebSearchArguments) ValidateForJobType(jobType teetypes.JobType) error // GetCapability returns the capability for web operations (always scraper) func (w *WebSearchArguments) GetCapability() teetypes.Capability { - return teetypes.CapWebScraper + return teetypes.CapScraper } // IsDeepScrape returns true if this is a deep scraping operation diff --git a/types/jobs.go b/types/jobs.go index 612cc76..576854d 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -1,23 +1,37 @@ package types -type Capability string - -// JobType represents the type of job that can be executed type JobType string +type Capability string +type WorkerCapabilities map[JobType][]Capability -// Job type constants - centralized from tee-indexer and tee-worker -const ( - // Web scraping job type - WebJob JobType = "web" +// String returns the string representation of the JobType +func (j JobType) String() string { + return string(j) +} - // Telemetry job type for worker monitoring and stats - TelemetryJob JobType = "telemetry" +// combineCapabilities combines multiple capability slices and ensures uniqueness +func combineCapabilities(capSlices ...[]Capability) []Capability { + seen := make(map[Capability]bool) + var result []Capability + + for _, capSlice := range capSlices { + for _, cap := range capSlice { + if !seen[cap] { + seen[cap] = true + result = append(result, cap) + } + } + } - // TikTok transcription job type - TiktokJob JobType = "tiktok" + return result +} - // Twitter job types - TwitterJob JobType = "twitter" // General Twitter scraping (uses best available auth) +// Job type constants - centralized from tee-indexer and tee-worker +const ( + WebJob JobType = "web" + TelemetryJob JobType = "telemetry" + TiktokJob JobType = "tiktok" + TwitterJob JobType = "twitter" // General Twitter scraping (uses best available auth for capability) TwitterCredentialJob JobType = "twitter-credential" // Twitter scraping with credentials TwitterApiJob JobType = "twitter-api" // Twitter scraping with API keys TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify @@ -25,16 +39,9 @@ const ( // Capability constants - typed to prevent typos and enable discoverability const ( - // Web scraping capabilities - CapWebScraper Capability = "scraper" - - // Telemetry capabilities - CapTelemetry Capability = "telemetry" - - // TikTok capabilities - CapTiktokTranscription Capability = "transcription" - - // Twitter capabilities + CapScraper Capability = "scraper" + CapTelemetry Capability = "telemetry" + CapTranscription Capability = "transcription" CapSearchByQuery Capability = 
"searchbyquery" CapSearchByFullArchive Capability = "searchbyfullarchive" CapSearchByProfile Capability = "searchbyprofile" @@ -54,14 +61,16 @@ const ( // Capability group constants for easy reuse var ( - // AlwaysAvailableWebCaps are web capabilities always available - AlwaysAvailableWebCaps = []Capability{CapWebScraper} - - // AlwaysAvailableTelemetryCaps are telemetry capabilities always available + AlwaysAvailableWebCaps = []Capability{CapScraper} AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry} + AlwaysAvailableTiktokCaps = []Capability{CapTranscription} - // AlwaysAvailableTiktokCaps are TikTok capabilities always available - AlwaysAvailableTiktokCaps = []Capability{CapTiktokTranscription} + // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration + AlwaysAvailableCapabilities = WorkerCapabilities{ + WebJob: AlwaysAvailableWebCaps, + TelemetryJob: AlwaysAvailableTelemetryCaps, + TiktokJob: AlwaysAvailableTiktokCaps, + } // TwitterCredentialCaps are all Twitter capabilities available with credential-based auth TwitterCredentialCaps = []Capability{ @@ -76,20 +85,41 @@ var ( // TwitterApifyCaps are Twitter capabilities available with Apify TwitterApifyCaps = []Capability{CapGetFollowers, CapGetFollowing} - - // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration - AlwaysAvailableCapabilities = WorkerCapabilities{ - WebJob: AlwaysAvailableWebCaps, - TelemetryJob: AlwaysAvailableTelemetryCaps, - TiktokJob: AlwaysAvailableTiktokCaps, - } ) -// String returns the string representation of the JobType -func (j JobType) String() string { - return string(j) +// JobCapabilityMap defines which capabilities are valid for each job type +var JobCapabilityMap = map[JobType][]Capability{ + // Twitter job types and their valid capabilities + TwitterJob: combineCapabilities( + TwitterCredentialCaps, + TwitterAPICaps, + TwitterApifyCaps, + []Capability{CapSearchByFullArchive}, // Elevated API capability + ), + TwitterCredentialJob: TwitterCredentialCaps, + TwitterApiJob: combineCapabilities( + TwitterAPICaps, + []Capability{CapSearchByFullArchive}, // Elevated API capability + ), + TwitterApifyJob: TwitterApifyCaps, + + // Web job capabilities + WebJob: AlwaysAvailableWebCaps, + + // TikTok job capabilities + TiktokJob: AlwaysAvailableTiktokCaps, + + // Telemetry job capabilities + TelemetryJob: AlwaysAvailableTelemetryCaps, } -// WorkerCapabilities represents all capabilities available on a worker -// Maps JobType to the list of capabilities available for that job type -type WorkerCapabilities map[JobType][]Capability +// if no capability is specified, use the default capability for the job type +var JobDefaultCapabilityMap = map[JobType]Capability{ + TwitterJob: CapSearchByQuery, + TwitterCredentialJob: CapSearchByQuery, + TwitterApiJob: CapSearchByQuery, + TwitterApifyJob: CapGetFollowing, + WebJob: CapScraper, + TiktokJob: CapTranscription, + TelemetryJob: CapTelemetry, +} From 9d6bbebaf4c7d5d94fe40fca663924337f5c5c74 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 31 Jul 2025 20:43:30 +0200 Subject: [PATCH 030/136] fix: default apify to get followers --- types/jobs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/types/jobs.go b/types/jobs.go index 576854d..1ef8efb 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -118,7 +118,7 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TwitterJob: CapSearchByQuery, TwitterCredentialJob: 
CapSearchByQuery, TwitterApiJob: CapSearchByQuery, - TwitterApifyJob: CapGetFollowing, + TwitterApifyJob: CapGetFollowers, WebJob: CapScraper, TiktokJob: CapTranscription, TelemetryJob: CapTelemetry, From 6622c361c0b8e7486922190fe5268f84f078dae9 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 31 Jul 2025 21:44:58 +0200 Subject: [PATCH 031/136] chore: default capability with twitter --- args/unmarshaller.go | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 79e273a..7308411 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -92,6 +92,13 @@ func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*Twi return nil, fmt.Errorf("failed to unmarshal Twitter job arguments: %w", err) } + // If no QueryType is specified, use the default capability for this job type + if twitterArgs.QueryType == "" { + if defaultCap, exists := types.JobDefaultCapabilityMap[jobType]; exists { + twitterArgs.QueryType = string(defaultCap) + } + } + // Perform job-type-specific validation for Twitter if err := twitterArgs.ValidateForJobType(jobType); err != nil { return nil, fmt.Errorf("Twitter job validation failed: %w", err) @@ -100,20 +107,6 @@ func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*Twi return twitterArgs, nil } -// func unmarshalLinkedInArguments(jobType types.JobType, args map[string]any) (*LinkedInArguments, error) { -// linkedInArgs := &LinkedInArguments{} -// if err := unmarshalToStruct(args, linkedInArgs); err != nil { -// return nil, fmt.Errorf("failed to unmarshal LinkedIn job arguments: %w", err) -// } - -// // Perform job-type-specific validation for LinkedIn -// if err := linkedInArgs.ValidateForJobType(jobType); err != nil { -// return nil, fmt.Errorf("LinkedIn job validation failed: %w", err) -// } - -// return linkedInArgs, nil -// } - // unmarshalToStruct converts a map[string]any to a struct using JSON marshal/unmarshal // This provides the same functionality as the existing JobArguments.Unmarshal methods func unmarshalToStruct(args map[string]any, target any) error { From 069424e8f9a0b8985192626c2d2d69dc0c0bada8 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 31 Jul 2025 22:02:04 +0200 Subject: [PATCH 032/136] fix: tiktok validation --- args/tiktok.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/args/tiktok.go b/args/tiktok.go index cffdbac..a9d4bdf 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -28,10 +28,10 @@ func (t *TikTokTranscriptionArguments) UnmarshalJSON(data []byte) error { return fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) } - // Normalize language to lowercase if provided - if t.Language != "" { - t.Language = strings.ToLower(t.Language) - } + // // Normalize language to lowercase if provided + // if t.Language != "" { + // t.Language = strings.ToLower(t.Language) + // } return t.Validate() } @@ -102,14 +102,15 @@ func (t *TikTokTranscriptionArguments) ValidateForJobType(jobType teetypes.JobTy // validateLanguageCode validates the language code format func (t *TikTokTranscriptionArguments) validateLanguageCode() error { - // Basic validation for language codes like "en-us", "es-es", etc. + // Basic validation for language codes like "en-us", "eng-us", "es-es", etc. 
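+	// Illustrative behavior of the check below (editorial example, not part of
+	// the original patch): "en-us" and "eng-US" pass, while "en", "en_us",
+	// and "english-us" are rejected.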
parts := strings.Split(t.Language, "-") if len(parts) != 2 { - return fmt.Errorf("invalid language format '%s', expected format: 'lang-region' (e.g., 'en-us')", t.Language) + return fmt.Errorf("invalid language format '%s', expected format: 'lang-region' (e.g., 'en-us' or 'eng-us')", t.Language) } - if len(parts[0]) != 2 || len(parts[1]) != 2 { - return fmt.Errorf("invalid language format '%s', expected 2-letter language and region codes", t.Language) + // Language code can be 2 or 3 letters, region must be 2 letters + if (len(parts[0]) != 2 && len(parts[0]) != 3) || len(parts[1]) != 2 { + return fmt.Errorf("invalid language format '%s', expected 2-3 letter language code and 2-letter region code", t.Language) } return nil From df61fd0aacc61130242453b3d227080ac3e5b42b Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 31 Jul 2025 22:17:12 +0200 Subject: [PATCH 033/136] fix: language validation --- args/tiktok.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/tiktok.go b/args/tiktok.go index a9d4bdf..bce3e73 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -85,7 +85,7 @@ func (t *TikTokTranscriptionArguments) HasLanguagePreference() bool { // GetLanguageCode returns the language code, defaulting to "en-us" if not specified func (t *TikTokTranscriptionArguments) GetLanguageCode() string { if t.Language == "" { - return "en-us" + return "en-US" } return t.Language } From 33dabccdae9b5d3946bfd70259b8bd8a94c355b0 Mon Sep 17 00:00:00 2001 From: mcamou Date: Fri, 1 Aug 2025 15:41:46 +0200 Subject: [PATCH 034/136] Add Min and Max utility functions, and the Set type --- .github/dependabot.yml | 11 +++ .github/workflows/build.yaml | 19 ++++ .github/workflows/images.yml | 170 +++++++++++++++++++++++++++++++++++ .github/workflows/tests.yaml | 56 ++++++++++++ .gitignore | 1 + args/tiktok.go | 2 +- go.mod | 22 +++++ go.sum | 43 +++++++++ pkg/util/math.go | 27 ++++++ pkg/util/math_test.go | 24 +++++ pkg/util/set.go | 77 ++++++++++++++++ pkg/util/set_test.go | 63 +++++++++++++ types/tiktok.go | 2 +- 13 files changed, 515 insertions(+), 2 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/build.yaml create mode 100644 .github/workflows/images.yml create mode 100644 .github/workflows/tests.yaml create mode 100644 .gitignore create mode 100644 pkg/util/math.go create mode 100644 pkg/util/math_test.go create mode 100644 pkg/util/set.go create mode 100644 pkg/util/set_test.go diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..cd88554 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. 
+# Please see the documentation for all configuration options: +# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file + +version: 2 +updates: + - package-ecosystem: "gomod" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000..968f535 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,19 @@ +name: Build + +on: + pull_request: + +jobs: + build-pr: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Golang with cache + uses: magnetikonline/action-golang-cache@v5 + with: + go-version-file: go.mod + + - name: Build Binaries + run: | + make build diff --git a/.github/workflows/images.yml b/.github/workflows/images.yml new file mode 100644 index 0000000..27708a0 --- /dev/null +++ b/.github/workflows/images.yml @@ -0,0 +1,170 @@ +--- + name: 'build container images' + + on: + push: + branches: + - master + - main + tags: + - '*' + concurrency: + group: ci-image-${{ github.head_ref || github.ref }}-${{ github.repository }} + cancel-in-progress: true + jobs: + docker: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Cache Docker layers + uses: actions/cache@v3 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx-main + + - name: Prepare + id: prep + run: | + DOCKER_IMAGE=masaengineering/tee-indexer + # Use branch name as default + VERSION=${GITHUB_REF#refs/heads/} + BINARY_VERSION=$(git describe --always --tags --dirty) + SHORTREF=${GITHUB_SHA::8} + # If this is git tag, use the tag name as a docker tag + if [[ $GITHUB_REF == refs/tags/* ]]; then + VERSION=${GITHUB_REF#refs/tags/} + fi + TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}" + # If the VERSION looks like a version number, assume that + # this is the most recent version of the image and also + # tag it 'latest'. + if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then + TAGS="$TAGS,${DOCKER_IMAGE}:latest" + fi + # Set output parameters. 
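+          # (::set-output is the legacy workflow-command form for exposing step
+          # outputs; newer GitHub runners deprecate it in favor of appending to
+          # "$GITHUB_OUTPUT".)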
+ echo ::set-output name=binary_version::${BINARY_VERSION} + echo ::set-output name=tags::${TAGS} + echo ::set-output name=docker_image::${DOCKER_IMAGE} + + - name: Set up QEMU + uses: docker/setup-qemu-action@master + with: + platforms: all + + - name: Login to DockerHub + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@2a4836ac76fe8f5d0ee3a0d89aa12a80cc552ad3 + with: + images: masaengineering/tee-indexer + tags: | + type=ref,event=branch,suffix=-{{date 'YYYYMMDDHHmmss'}} + type=semver,pattern={{raw}} + type=sha,suffix=-{{date 'YYYYMMDDHHmmss'}} + type=ref,event=branch + flavor: | + latest=auto + prefix= + suffix= + + - name: Build + uses: docker/build-push-action@v6 + with: + builder: ${{ steps.buildx.outputs.name }} + build-args: | + VERSION=${{ steps.prep.outputs.binary_version }} + context: ./ + file: ./Dockerfile + platforms: linux/amd64 + push: true + #tags: ${{ steps.prep.outputs.tags }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + docker-broker: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Prepare + id: prep + run: | + DOCKER_IMAGE=masaengineering/tee-keybroker + # Use branch name as default + VERSION=${GITHUB_REF#refs/heads/} + BINARY_VERSION=$(git describe --always --tags --dirty) + SHORTREF=${GITHUB_SHA::8} + # If this is git tag, use the tag name as a docker tag + if [[ $GITHUB_REF == refs/tags/* ]]; then + VERSION=${GITHUB_REF#refs/tags/} + fi + TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}" + # If the VERSION looks like a version number, assume that + # this is the most recent version of the image and also + # tag it 'latest'. + if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then + TAGS="$TAGS,${DOCKER_IMAGE}:latest" + fi + # Set output parameters. 
+ echo ::set-output name=binary_version::${BINARY_VERSION} + echo ::set-output name=tags::${TAGS} + echo ::set-output name=docker_image::${DOCKER_IMAGE} + + - name: Set up QEMU + uses: docker/setup-qemu-action@master + with: + platforms: all + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@master + + - name: Login to DockerHub + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@2a4836ac76fe8f5d0ee3a0d89aa12a80cc552ad3 + with: + images: masaengineering/tee-keybroker + tags: | + type=ref,event=branch,suffix=-{{date 'YYYYMMDDHHmmss'}} + type=semver,pattern={{raw}} + type=sha,suffix=-{{date 'YYYYMMDDHHmmss'}} + type=ref,event=branch + flavor: | + latest=auto + prefix= + suffix= + + - name: Build + uses: docker/build-push-action@v6 + with: + builder: ${{ steps.buildx.outputs.name }} + build-args: | + VERSION=${{ steps.prep.outputs.binary_version }} + context: ./ + file: ./Dockerfile.keybroker + platforms: linux/amd64 + push: true + #tags: ${{ steps.prep.outputs.tags }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml new file mode 100644 index 0000000..31b72c5 --- /dev/null +++ b/.github/workflows/tests.yaml @@ -0,0 +1,56 @@ +name: Run Go Tests + +on: + push: + branches: + - '**' + pull_request: + branches: + - '**' + +jobs: + lint: + runs-on: ubuntu-latest + + steps: + - name: Install golangci-lint + run: sudo snap install golangci-lint --classic + + - name: Checkout code + uses: actions/checkout@v2 + + + - name: Setup Golang with cache + uses: magnetikonline/action-golang-cache@v5 + with: + go-version-file: go.mod + + - name: Install dependencies + run: | + go mod tidy + go install gofmt + + - name: Run tests + run: | + go mod tidy && git diff --exit-code + go mod download + go mod verify + gofmt -s -w . && git diff --exit-code + go vet ./... + golangci-lint run + + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Setup Golang with cache + uses: magnetikonline/action-golang-cache@v5 + with: + go-version-file: go.mod + + - name: Run unit tests + run: | + go test ./... 
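Beyond the workflow changes, this patch adds variadic Min/Max helpers and a generic Set type under pkg/util (shown below). A minimal usage sketch against the API as introduced in this commit; note that NewSet's first argument is a capacity hint at this stage, and that Min/Max ship with an assignment bug that a later patch in this series ("Fix", patch 036) corrects:

package main

import (
	"fmt"

	"github.com/masa-finance/tee-types/pkg/util"
)

func main() {
	// Min and Max are variadic over any constraints.Ordered type.
	fmt.Println(util.Min(2, 3, 8, -1, 42)) // -1 (once the later fix lands)
	fmt.Println(util.Max(2, 3, 8, -1, 42)) // 42 (once the later fix lands)

	// NewSet(capacity, items...) deduplicates its items.
	s1 := util.NewSet(4, 1, 2, 2, 3) // items {1, 2, 3}
	s2 := util.NewSet(0, 3, 4)       // items {3, 4}
	fmt.Println(s1.Contains(2))               // true
	fmt.Println(s1.Intersection(&s2).Items()) // [3]
}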
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b0ac3ed --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.aider* diff --git a/args/tiktok.go b/args/tiktok.go index 9a082d9..c55fe7e 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -4,4 +4,4 @@ package args type TikTokTranscriptionArguments struct { VideoURL string `json:"video_url"` Language string `json:"language,omitempty"` // e.g., "eng-US" -} \ No newline at end of file +} diff --git a/go.mod b/go.mod index 553d075..a578211 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,25 @@ module github.com/masa-finance/tee-types go 1.23.0 + +require ( + github.com/onsi/gomega v1.38.0 + golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 +) + +require ( + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect + github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect + go.uber.org/automaxprocs v1.6.0 // indirect + golang.org/x/sys v0.34.0 // indirect + golang.org/x/tools v0.35.0 // indirect +) + +require ( + github.com/google/go-cmp v0.7.0 // indirect + github.com/onsi/ginkgo/v2 v2.23.4 + golang.org/x/net v0.42.0 // indirect + golang.org/x/text v0.27.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum index e69de29..9d2dc6c 100644 --- a/go.sum +++ b/go.sum @@ -0,0 +1,43 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= +github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= +github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY= +github.com/onsi/gomega v1.38.0/go.mod h1:OcXcwId0b9QsE7Y49u+BTrL4IdKOBOKnD6VQNTJEB6o= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= +github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs 
v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= +golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= +golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= +golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= +golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= +golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= +golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= +golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/pkg/util/math.go b/pkg/util/math.go new file mode 100644 index 0000000..4900fb6 --- /dev/null +++ b/pkg/util/math.go @@ -0,0 +1,27 @@ +package util + +import "golang.org/x/exp/constraints" + +func Min[T constraints.Ordered](elements ...T) T { + ret := elements[0] + + for _, x := range elements { + if x < ret { + x = ret + } + } + + return ret +} + +func Max[T constraints.Ordered](elements ...T) T { + ret := elements[0] + + for _, x := range elements { + if x > ret { + x = ret + } + } + + return ret +} diff --git a/pkg/util/math_test.go b/pkg/util/math_test.go new file mode 100644 index 0000000..99c82c9 --- /dev/null +++ b/pkg/util/math_test.go @@ -0,0 +1,24 @@ +package util_test + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/pkg/util" +) + +var _ = Describe("Math functions", func() { + Describe("Min", func() { + It("should calculate the minimum of a series of orderable values regardless of parameter order", func() { + Expect(util.Min(1, 2, 3, 4, 5, 6)).To(Equal(1)) + Expect(util.Min(2, 3, 8, -1, 4, 42)).To(Equal(-1)) + }) + }) + + Describe("Max", func() { + It("should calculate the maximum of a series of orderable values regardless of parameter order", func() { + Expect(util.Max(1, 2, 3, 4, 5, 6)).To(Equal(6)) + Expect(util.Max(2, 3, 8, -12, 4, 42)).To(Equal(42)) + }) + }) +}) diff --git a/pkg/util/set.go b/pkg/util/set.go new file mode 100644 index 0000000..cb15d6a --- /dev/null +++ b/pkg/util/set.go @@ -0,0 +1,77 @@ +package util + +import ( + "iter" + "maps" + + xmaps "golang.org/x/exp/maps" +) + +type Set[T comparable] map[T]struct{} + +func NewSet[T comparable](capacity uint, items ...T) Set[T] { + capacity = Max(capacity, uint(len(items))) + + ret := make(Set[T], capacity) + for _, x := range items { + ret.Add(x) + } + return ret +} + +func (s *Set[T]) Contains(item T) bool { + _, exists := (*s)[item] + return exists +} + +func (s *Set[T]) Add(items ...T) { + for _, item := range items { + (*s)[item] = struct{}{} + } +} + +func (s *Set[T]) Delete(items ...T) { + for _, item := range items { + delete(*s, item) + } +} + +func (s *Set[T]) Items() []T { + return xmaps.Keys(*s) +} + +func (s *Set[T]) ItemsSeq() iter.Seq[T] { + return maps.Keys(*s) +} + +func (s *Set[T]) Union(s2 *Set[T]) *Set[T] { + ret := make(Set[T], len(*s)+len(*s2)) + for k := range *s { + ret.Add(k) + } + for k := range *s2 { + ret.Add(k) + } + return &ret +} + +func (s *Set[T]) Intersection(s2 *Set[T]) *Set[T] { + ret := make(Set[T], len(*s)+len(*s2)) + for k := range *s { + if s2.Contains(k) { + ret.Add(k) + } + } + return &ret +} + +func (s *Set[T]) Difference(s2 *Set[T]) *Set[T] { + ret := make(Set[T], len(*s)+len(*s2)) + for k := range *s { + if !s2.Contains(k) { + ret.Add(k) + } + } + + return &ret +} diff --git a/pkg/util/set_test.go b/pkg/util/set_test.go new file mode 100644 index 0000000..e1b6364 --- /dev/null +++ b/pkg/util/set_test.go @@ -0,0 +1,63 @@ +package util_test + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/pkg/util" +) + +var _ = Describe("Set", func() { + It("should return a slice of all its elements", func() { + s := util.NewSet(0, 1, 2, 3, 4, 5, 6) + Expect(s.Items()).To(ConsistOf(0, 1, 2, 3, 4, 5, 6)) + }) + + It("should check whether an item is included in the Set or not", func() { + s := util.NewSet(0, 1, 2, 3, 4, 5, 6) + Expect(s.Contains(2)).To(BeTrue()) + Expect(s.Contains(42)).To(BeFalse()) + }) + + It("should add items to the set without duplicating", func() { + s := util.NewSet(0, 1, 2, 3, 4, 5, 6) + s.Add(7, 8, 9, 2, 4) + Expect(s).To(ConsistOf(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)) + }) + + It("should delete items from the set if they exist", func() { + s := util.NewSet(0, 1, 2, 3, 4, 5, 6) + s.Delete(7, 8, 9, 2, 4, 42) + Expect(s).To(ConsistOf(0, 1, 3, 5, 6)) + }) + + It("should return a sequence of all its elements", func() { + s := util.NewSet(0, 0, 1, 2, 3, 4, 5, 6) + items := make([]int, 0) + for item := range s.ItemsSeq() { + items = append(items, item) + } + Expect(items).To(ConsistOf(0, 1, 2, 3, 4, 5, 6)) + }) + + It("should return the union of two sets", func() { + s1 := util.NewSet(0, 0, 1, 2, 3, 4) + s2 := util.NewSet(0, 3, 4, 5, 6, 7) + s3 := s1.Union(&s2) + Expect(*s3).To(ConsistOf(0, 1, 2, 3, 4, 5, 6, 7)) + }) + + It("should return the intersection of two sets", func() { + s1 := util.NewSet(0, 0, 1, 2, 3, 4) + s2 := util.NewSet(0, 3, 4, 5, 6, 7) + s3 := s1.Intersection(&s2) + Expect(*s3).To(ConsistOf(3, 4)) + }) + + It("should return the difference of two sets", func() { + s1 := util.NewSet(0, 0, 1, 2, 3, 4) + s2 := util.NewSet(0, 3, 4, 5, 6, 7) + s3 := s1.Difference(&s2) + Expect(*s3).To(ConsistOf(0, 1, 2)) + }) +}) diff --git a/types/tiktok.go b/types/tiktok.go index 39ceac4..94f9f65 100644 --- a/types/tiktok.go +++ b/types/tiktok.go @@ -8,4 +8,4 @@ type TikTokTranscriptionResult struct { VideoTitle string `json:"video_title,omitempty"` OriginalURL string `json:"original_url"` ThumbnailURL string `json:"thumbnail_url,omitempty"` -} \ No newline at end of file +} From 3d3669190d3aed606a4ef6bcedce004a7e66ec1e Mon Sep 17 00:00:00 2001 From: mcamou Date: Fri, 1 Aug 2025 15:47:19 +0200 Subject: [PATCH 035/136] Remove unneeded GH Actions workflows --- .github/workflows/build.yaml | 19 ---- .github/workflows/images.yml | 170 ----------------------------------- .github/workflows/tests.yaml | 2 - 3 files changed, 191 deletions(-) delete mode 100644 .github/workflows/build.yaml delete mode 100644 .github/workflows/images.yml diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml deleted file mode 100644 index 968f535..0000000 --- a/.github/workflows/build.yaml +++ /dev/null @@ -1,19 +0,0 @@ -name: Build - -on: - pull_request: - -jobs: - build-pr: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Setup Golang with cache - uses: magnetikonline/action-golang-cache@v5 - with: - go-version-file: go.mod - - - name: Build Binaries - run: | - make build diff --git a/.github/workflows/images.yml b/.github/workflows/images.yml deleted file mode 100644 index 27708a0..0000000 --- a/.github/workflows/images.yml +++ /dev/null @@ -1,170 +0,0 @@ ---- - name: 'build container images' - - on: - push: - branches: - - master - - main - tags: - - '*' - concurrency: - group: ci-image-${{ github.head_ref || github.ref }}-${{ github.repository }} - cancel-in-progress: true - jobs: - docker: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Docker 
Buildx - uses: docker/setup-buildx-action@v3 - - - name: Cache Docker layers - uses: actions/cache@v3 - with: - path: /tmp/.buildx-cache - key: ${{ runner.os }}-buildx-${{ github.sha }} - restore-keys: | - ${{ runner.os }}-buildx-main - - - name: Prepare - id: prep - run: | - DOCKER_IMAGE=masaengineering/tee-indexer - # Use branch name as default - VERSION=${GITHUB_REF#refs/heads/} - BINARY_VERSION=$(git describe --always --tags --dirty) - SHORTREF=${GITHUB_SHA::8} - # If this is git tag, use the tag name as a docker tag - if [[ $GITHUB_REF == refs/tags/* ]]; then - VERSION=${GITHUB_REF#refs/tags/} - fi - TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}" - # If the VERSION looks like a version number, assume that - # this is the most recent version of the image and also - # tag it 'latest'. - if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then - TAGS="$TAGS,${DOCKER_IMAGE}:latest" - fi - # Set output parameters. - echo ::set-output name=binary_version::${BINARY_VERSION} - echo ::set-output name=tags::${TAGS} - echo ::set-output name=docker_image::${DOCKER_IMAGE} - - - name: Set up QEMU - uses: docker/setup-qemu-action@master - with: - platforms: all - - - name: Login to DockerHub - if: github.event_name != 'pull_request' - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@2a4836ac76fe8f5d0ee3a0d89aa12a80cc552ad3 - with: - images: masaengineering/tee-indexer - tags: | - type=ref,event=branch,suffix=-{{date 'YYYYMMDDHHmmss'}} - type=semver,pattern={{raw}} - type=sha,suffix=-{{date 'YYYYMMDDHHmmss'}} - type=ref,event=branch - flavor: | - latest=auto - prefix= - suffix= - - - name: Build - uses: docker/build-push-action@v6 - with: - builder: ${{ steps.buildx.outputs.name }} - build-args: | - VERSION=${{ steps.prep.outputs.binary_version }} - context: ./ - file: ./Dockerfile - platforms: linux/amd64 - push: true - #tags: ${{ steps.prep.outputs.tags }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - - docker-broker: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Prepare - id: prep - run: | - DOCKER_IMAGE=masaengineering/tee-keybroker - # Use branch name as default - VERSION=${GITHUB_REF#refs/heads/} - BINARY_VERSION=$(git describe --always --tags --dirty) - SHORTREF=${GITHUB_SHA::8} - # If this is git tag, use the tag name as a docker tag - if [[ $GITHUB_REF == refs/tags/* ]]; then - VERSION=${GITHUB_REF#refs/tags/} - fi - TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}" - # If the VERSION looks like a version number, assume that - # this is the most recent version of the image and also - # tag it 'latest'. - if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then - TAGS="$TAGS,${DOCKER_IMAGE}:latest" - fi - # Set output parameters. 
- echo ::set-output name=binary_version::${BINARY_VERSION} - echo ::set-output name=tags::${TAGS} - echo ::set-output name=docker_image::${DOCKER_IMAGE} - - - name: Set up QEMU - uses: docker/setup-qemu-action@master - with: - platforms: all - - - name: Set up Docker Buildx - id: buildx - uses: docker/setup-buildx-action@master - - - name: Login to DockerHub - if: github.event_name != 'pull_request' - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@2a4836ac76fe8f5d0ee3a0d89aa12a80cc552ad3 - with: - images: masaengineering/tee-keybroker - tags: | - type=ref,event=branch,suffix=-{{date 'YYYYMMDDHHmmss'}} - type=semver,pattern={{raw}} - type=sha,suffix=-{{date 'YYYYMMDDHHmmss'}} - type=ref,event=branch - flavor: | - latest=auto - prefix= - suffix= - - - name: Build - uses: docker/build-push-action@v6 - with: - builder: ${{ steps.buildx.outputs.name }} - build-args: | - VERSION=${{ steps.prep.outputs.binary_version }} - context: ./ - file: ./Dockerfile.keybroker - platforms: linux/amd64 - push: true - #tags: ${{ steps.prep.outputs.tags }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 31b72c5..ad34dad 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -19,7 +19,6 @@ jobs: - name: Checkout code uses: actions/checkout@v2 - - name: Setup Golang with cache uses: magnetikonline/action-golang-cache@v5 with: @@ -28,7 +27,6 @@ jobs: - name: Install dependencies run: | go mod tidy - go install gofmt - name: Run tests run: | From 33f734768f2fb54a49ddd06bf4032e5946c516b3 Mon Sep 17 00:00:00 2001 From: mcamou Date: Fri, 1 Aug 2025 15:53:31 +0200 Subject: [PATCH 036/136] Fix --- pkg/util/math.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/util/math.go b/pkg/util/math.go index 4900fb6..4e6f971 100644 --- a/pkg/util/math.go +++ b/pkg/util/math.go @@ -7,7 +7,7 @@ func Min[T constraints.Ordered](elements ...T) T { for _, x := range elements { if x < ret { - x = ret + ret = x } } @@ -19,7 +19,7 @@ func Max[T constraints.Ordered](elements ...T) T { for _, x := range elements { if x > ret { - x = ret + ret = x } } From a798bbab38ef39c5e04451130f62686646bfb8cb Mon Sep 17 00:00:00 2001 From: mcamou Date: Fri, 1 Aug 2025 16:03:35 +0200 Subject: [PATCH 037/136] Remove dependency on x/exp --- pkg/util/set.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/util/set.go b/pkg/util/set.go index cb15d6a..d7d2e4d 100644 --- a/pkg/util/set.go +++ b/pkg/util/set.go @@ -3,8 +3,7 @@ package util import ( "iter" "maps" - - xmaps "golang.org/x/exp/maps" + "slices" ) type Set[T comparable] map[T]struct{} @@ -37,7 +36,7 @@ func (s *Set[T]) Delete(items ...T) { } func (s *Set[T]) Items() []T { - return xmaps.Keys(*s) + return slices.Collect(s.ItemsSeq()) } func (s *Set[T]) ItemsSeq() iter.Seq[T] { From 382b9a66f8f78ab8353dd128302a3060daf50c12 Mon Sep 17 00:00:00 2001 From: mcamou Date: Fri, 1 Aug 2025 16:07:31 +0200 Subject: [PATCH 038/136] Optimize the Set capacities a bit --- pkg/util/set.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/util/set.go b/pkg/util/set.go index d7d2e4d..726a535 100644 --- a/pkg/util/set.go +++ b/pkg/util/set.go @@ -55,7 +55,7 @@ func (s *Set[T]) Union(s2 *Set[T]) *Set[T] { } func (s *Set[T]) Intersection(s2 
*Set[T]) *Set[T] { - ret := make(Set[T], len(*s)+len(*s2)) + ret := make(Set[T], Min(len(*s), len(*s2))) for k := range *s { if s2.Contains(k) { ret.Add(k) @@ -65,7 +65,7 @@ func (s *Set[T]) Intersection(s2 *Set[T]) *Set[T] { } func (s *Set[T]) Difference(s2 *Set[T]) *Set[T] { - ret := make(Set[T], len(*s)+len(*s2)) + ret := make(Set[T], len(*s)) for k := range *s { if !s2.Contains(k) { ret.Add(k) From 5365f62812dbfdfaf8ed054af2d88ce62ef975dc Mon Sep 17 00:00:00 2001 From: mcamou Date: Fri, 1 Aug 2025 17:09:07 +0200 Subject: [PATCH 039/136] Union can receive multiple Sets, and NewSet returns a reference --- pkg/util/set.go | 23 +++++++++++++++++------ pkg/util/set_test.go | 35 ++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/pkg/util/set.go b/pkg/util/set.go index 726a535..e710d09 100644 --- a/pkg/util/set.go +++ b/pkg/util/set.go @@ -8,14 +8,14 @@ import ( type Set[T comparable] map[T]struct{} -func NewSet[T comparable](capacity uint, items ...T) Set[T] { +func NewSet[T comparable](capacity uint, items ...T) *Set[T] { capacity = Max(capacity, uint(len(items))) ret := make(Set[T], capacity) for _, x := range items { ret.Add(x) } - return ret + return &ret } func (s *Set[T]) Contains(item T) bool { @@ -35,6 +35,10 @@ func (s *Set[T]) Delete(items ...T) { } } +func (s *Set[T]) Length() int { + return len(*s) +} + func (s *Set[T]) Items() []T { return slices.Collect(s.ItemsSeq()) } @@ -43,13 +47,20 @@ func (s *Set[T]) ItemsSeq() iter.Seq[T] { return maps.Keys(*s) } -func (s *Set[T]) Union(s2 *Set[T]) *Set[T] { - ret := make(Set[T], len(*s)+len(*s2)) +func (s *Set[T]) Union(sets ...*Set[T]) *Set[T] { + sum := len(*s) + for _, ss := range sets { + sum = sum + len(*ss) + + } + ret := make(Set[T], sum) for k := range *s { ret.Add(k) } - for k := range *s2 { - ret.Add(k) + for _, ss := range sets { + for k := range *ss { + ret.Add(k) + } } return &ret } diff --git a/pkg/util/set_test.go b/pkg/util/set_test.go index e1b6364..9303808 100644 --- a/pkg/util/set_test.go +++ b/pkg/util/set_test.go @@ -1,6 +1,8 @@ package util_test import ( + "slices" + . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -22,42 +24,49 @@ var _ = Describe("Set", func() { It("should add items to the set without duplicating", func() { s := util.NewSet(0, 1, 2, 3, 4, 5, 6) s.Add(7, 8, 9, 2, 4) - Expect(s).To(ConsistOf(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)) + Expect(s.Items()).To(ConsistOf(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)) }) It("should delete items from the set if they exist", func() { s := util.NewSet(0, 1, 2, 3, 4, 5, 6) s.Delete(7, 8, 9, 2, 4, 42) - Expect(s).To(ConsistOf(0, 1, 3, 5, 6)) + Expect(s.Items()).To(ConsistOf(0, 1, 3, 5, 6)) + }) + + It("should return the number of items in the set", func() { + s := util.NewSet(0, 0, 1, 2, 3, 4, 5, 6) + Expect(s.Length()).To(Equal(7)) + s.Add(7, 8, 9, 2, 4) + Expect(s.Length()).To(Equal(10)) + s.Delete(0, 1) + Expect(s.Length()).To(Equal(8)) }) It("should return a sequence of all its elements", func() { s := util.NewSet(0, 0, 1, 2, 3, 4, 5, 6) - items := make([]int, 0) - for item := range s.ItemsSeq() { - items = append(items, item) - } + items := slices.Collect(s.ItemsSeq()) Expect(items).To(ConsistOf(0, 1, 2, 3, 4, 5, 6)) }) - It("should return the union of two sets", func() { + It("should return the union of multiple sets", func() { s1 := util.NewSet(0, 0, 1, 2, 3, 4) s2 := util.NewSet(0, 3, 4, 5, 6, 7) - s3 := s1.Union(&s2) - Expect(*s3).To(ConsistOf(0, 1, 2, 3, 4, 5, 6, 7)) + s3 := util.NewSet(0, 8, 9, 0) + s4 := s1.Union(s2, s3) + Expect(s4.Items()).To(ConsistOf(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)) }) It("should return the intersection of two sets", func() { s1 := util.NewSet(0, 0, 1, 2, 3, 4) s2 := util.NewSet(0, 3, 4, 5, 6, 7) - s3 := s1.Intersection(&s2) - Expect(*s3).To(ConsistOf(3, 4)) + s3 := s1.Intersection(s2) + Expect(s3.Items()).To(ConsistOf(3, 4)) }) It("should return the difference of two sets", func() { s1 := util.NewSet(0, 0, 1, 2, 3, 4) s2 := util.NewSet(0, 3, 4, 5, 6, 7) - s3 := s1.Difference(&s2) - Expect(*s3).To(ConsistOf(0, 1, 2)) + s3 := s1.Difference(s2) + Expect(s3.Items()).To(ConsistOf(0, 1, 2)) }) }) From 08f0c8b9ff8ee9902e685e33d815feb6a98e6772 Mon Sep 17 00:00:00 2001 From: mcamou Date: Fri, 1 Aug 2025 17:23:09 +0200 Subject: [PATCH 040/136] More stuff in .gitignore --- .gitignore | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.gitignore b/.gitignore index b0ac3ed..9954081 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,12 @@ +*.out +*.log + +go.work + +# Backup and IDE-related files +.idea/** +.vscode/** +*~ +*.log +.DS_Store .aider* From 0d374a83ad96d949f33df5ff237562afa5bc04f1 Mon Sep 17 00:00:00 2001 From: mcamou Date: Fri, 1 Aug 2025 18:46:18 +0200 Subject: [PATCH 041/136] Some more streamlining of the API --- pkg/util/set.go | 55 +++++++++++++++++++++++-------------- pkg/util/set_test.go | 31 +++++++++++---------- pkg/util/util_suite_test.go | 13 +++++++++ 3 files changed, 64 insertions(+), 35 deletions(-) create mode 100644 pkg/util/util_suite_test.go diff --git a/pkg/util/set.go b/pkg/util/set.go index e710d09..fe237be 100644 --- a/pkg/util/set.go +++ b/pkg/util/set.go @@ -6,82 +6,97 @@ import ( "slices" ) +// Set is a generic collection of unique items. type Set[T comparable] map[T]struct{} -func NewSet[T comparable](capacity uint, items ...T) *Set[T] { - capacity = Max(capacity, uint(len(items))) - - ret := make(Set[T], capacity) - for _, x := range items { - ret.Add(x) - } +// NewSet creates and returns a new Set with the given items, deduplicating them. +func NewSet[T comparable](items ...T) *Set[T] { + ret := make(Set[T], len(items)) + ret.Add(items...) 
return &ret } +// Contains checks if an item is present in the set. func (s *Set[T]) Contains(item T) bool { _, exists := (*s)[item] return exists } +// Add inserts the given items into the set, deduplicating them. func (s *Set[T]) Add(items ...T) { for _, item := range items { (*s)[item] = struct{}{} } } +// Delete removes the given items from the set if it contains them. func (s *Set[T]) Delete(items ...T) { for _, item := range items { - delete(*s, item) + delete((*s), item) } } +// Length returns the number of items in the set. func (s *Set[T]) Length() int { return len(*s) } +// Items returns a slice containing all the items in the set. +// The order of items in the slice is not guaranteed. func (s *Set[T]) Items() []T { return slices.Collect(s.ItemsSeq()) } +// ItemsSeq returns an iterator that yields all the items in the set. +// The order of items is not guaranteed. func (s *Set[T]) ItemsSeq() iter.Seq[T] { return maps.Keys(*s) } +// Union returns a new set containing all the items from the original set and all the provided sets, deduplicating them. func (s *Set[T]) Union(sets ...*Set[T]) *Set[T] { - sum := len(*s) + sum := s.Length() for _, ss := range sets { - sum = sum + len(*ss) + sum = sum + ss.Length() } - ret := make(Set[T], sum) + + ret := make(map[T]struct{}, sum) for k := range *s { - ret.Add(k) + ret[k] = struct{}{} } for _, ss := range sets { for k := range *ss { - ret.Add(k) + ret[k] = struct{}{} } } - return &ret + + rs := Set[T](ret) + return &rs } +// Intersection returns a new set containing only the items that are present in both the original set and s2. func (s *Set[T]) Intersection(s2 *Set[T]) *Set[T] { - ret := make(Set[T], Min(len(*s), len(*s2))) + ret := make(map[T]struct{}, Min(s.Length(), s2.Length())) for k := range *s { if s2.Contains(k) { - ret.Add(k) + ret[k] = struct{}{} } } - return &ret + + rs := Set[T](ret) + return &rs } +// Difference returns a new set containing items that are in the original set but not in s2. 
func (s *Set[T]) Difference(s2 *Set[T]) *Set[T] { - ret := make(Set[T], len(*s)) + ret := make(map[T]struct{}, s.Length()) for k := range *s { if !s2.Contains(k) { - ret.Add(k) + ret[k] = struct{}{} } } - return &ret + rs := Set[T](ret) + return &rs } diff --git a/pkg/util/set_test.go b/pkg/util/set_test.go index 9303808..b03fc42 100644 --- a/pkg/util/set_test.go +++ b/pkg/util/set_test.go @@ -11,30 +11,31 @@ import ( var _ = Describe("Set", func() { It("should return a slice of all its elements", func() { - s := util.NewSet(0, 1, 2, 3, 4, 5, 6) + s := util.NewSet(0, 1, 2, 2, 2, 3, 4, 5, 5, 5, 5, 6) Expect(s.Items()).To(ConsistOf(0, 1, 2, 3, 4, 5, 6)) + Expect(s.Length()).To(Equal(7)) }) It("should check whether an item is included in the Set or not", func() { - s := util.NewSet(0, 1, 2, 3, 4, 5, 6) + s := util.NewSet(1, 2, 3, 4, 5, 6) Expect(s.Contains(2)).To(BeTrue()) Expect(s.Contains(42)).To(BeFalse()) }) It("should add items to the set without duplicating", func() { - s := util.NewSet(0, 1, 2, 3, 4, 5, 6) + s := util.NewSet(1, 2, 3, 4, 5, 6) s.Add(7, 8, 9, 2, 4) - Expect(s.Items()).To(ConsistOf(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)) + Expect(s.Items()).To(ConsistOf(1, 2, 3, 4, 5, 6, 7, 8, 9)) }) It("should delete items from the set if they exist", func() { - s := util.NewSet(0, 1, 2, 3, 4, 5, 6) + s := util.NewSet(1, 2, 3, 4, 5, 6) s.Delete(7, 8, 9, 2, 4, 42) - Expect(s.Items()).To(ConsistOf(0, 1, 3, 5, 6)) + Expect(s.Items()).To(ConsistOf(1, 3, 5, 6)) }) It("should return the number of items in the set", func() { - s := util.NewSet(0, 0, 1, 2, 3, 4, 5, 6) + s := util.NewSet(0, 1, 2, 3, 4, 5, 6) Expect(s.Length()).To(Equal(7)) s.Add(7, 8, 9, 2, 4) Expect(s.Length()).To(Equal(10)) @@ -43,29 +44,29 @@ var _ = Describe("Set", func() { }) It("should return a sequence of all its elements", func() { - s := util.NewSet(0, 0, 1, 2, 3, 4, 5, 6) + s := util.NewSet(0, 1, 2, 3, 4, 5, 6) items := slices.Collect(s.ItemsSeq()) Expect(items).To(ConsistOf(0, 1, 2, 3, 4, 5, 6)) }) It("should return the union of multiple sets", func() { - s1 := util.NewSet(0, 0, 1, 2, 3, 4) - s2 := util.NewSet(0, 3, 4, 5, 6, 7) - s3 := util.NewSet(0, 8, 9, 0) + s1 := util.NewSet(0, 1, 2, 3, 4) + s2 := util.NewSet(3, 4, 5, 6, 7) + s3 := util.NewSet(8, 9, 0) s4 := s1.Union(s2, s3) Expect(s4.Items()).To(ConsistOf(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)) }) It("should return the intersection of two sets", func() { - s1 := util.NewSet(0, 0, 1, 2, 3, 4) - s2 := util.NewSet(0, 3, 4, 5, 6, 7) + s1 := util.NewSet(0, 1, 2, 3, 4) + s2 := util.NewSet(3, 4, 5, 6, 7) s3 := s1.Intersection(s2) Expect(s3.Items()).To(ConsistOf(3, 4)) }) It("should return the difference of two sets", func() { - s1 := util.NewSet(0, 0, 1, 2, 3, 4) - s2 := util.NewSet(0, 3, 4, 5, 6, 7) + s1 := util.NewSet(0, 1, 2, 3, 4) + s2 := util.NewSet(3, 4, 5, 6, 7) s3 := s1.Difference(s2) Expect(s3.Items()).To(ConsistOf(0, 1, 2)) }) diff --git a/pkg/util/util_suite_test.go b/pkg/util/util_suite_test.go new file mode 100644 index 0000000..6d6903e --- /dev/null +++ b/pkg/util/util_suite_test.go @@ -0,0 +1,13 @@ +package util_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestUtil(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Util Suite") +} From 04572316be66e1691a6e713d3b46e932b08ac6ab Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 1 Aug 2025 20:06:47 +0200 Subject: [PATCH 042/136] chore: captures empty capability option for job types with default fallbacks --- args/jobs.go | 10 ---------- types/jobs.go | 12 +++++++----- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/args/jobs.go b/args/jobs.go index 1a6c89f..d51701d 100644 --- a/args/jobs.go +++ b/args/jobs.go @@ -9,11 +9,6 @@ import ( // ValidateCapabilityForJobType validates that a capability is supported for the given job type func ValidateCapabilityForJobType(jobType teetypes.JobType, capability teetypes.Capability) error { - if capability == "" { - // Empty capability is allowed for some job types - return nil - } - validCaps, exists := teetypes.JobCapabilityMap[jobType] if !exists { return fmt.Errorf("unknown job type: %s", jobType) @@ -26,8 +21,3 @@ func ValidateCapabilityForJobType(jobType teetypes.JobType, capability teetypes. return nil } - -// IsCapabilityValidForJobType checks if a capability is valid for a job type without returning an error -func IsCapabilityValidForJobType(jobType teetypes.JobType, capability teetypes.Capability) bool { - return ValidateCapabilityForJobType(jobType, capability) == nil -} diff --git a/types/jobs.go b/types/jobs.go index 1ef8efb..c2373fb 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -57,13 +57,14 @@ const ( CapGetFollowing Capability = "getfollowing" CapGetFollowers Capability = "getfollowers" CapGetSpace Capability = "getspace" + CapEmpty Capability = "" ) // Capability group constants for easy reuse var ( - AlwaysAvailableWebCaps = []Capability{CapScraper} - AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry} - AlwaysAvailableTiktokCaps = []Capability{CapTranscription} + AlwaysAvailableWebCaps = []Capability{CapScraper, CapEmpty} + AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry, CapEmpty} + AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapEmpty} // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ @@ -78,13 +79,14 @@ var ( CapGetById, CapGetReplies, CapGetRetweeters, CapGetTweets, CapGetMedia, CapGetHomeTweets, CapGetForYouTweets, CapGetProfileById, CapGetTrends, CapGetFollowing, CapGetFollowers, CapGetSpace, + CapEmpty, } // TwitterAPICaps are basic Twitter capabilities available with API keys - TwitterAPICaps = []Capability{CapSearchByQuery, CapGetById, CapGetProfileById} + TwitterAPICaps = []Capability{CapSearchByQuery, CapGetById, CapGetProfileById, CapEmpty} // TwitterApifyCaps are Twitter capabilities available with Apify - TwitterApifyCaps = []Capability{CapGetFollowers, CapGetFollowing} + TwitterApifyCaps = []Capability{CapGetFollowers, CapGetFollowing, CapEmpty} ) // JobCapabilityMap defines which capabilities are valid for each job type From d8f7dff9e03c6654481abf203e8e57205514ffbf Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 1 Aug 2025 20:26:09 +0200 Subject: [PATCH 043/136] fix: refactor valid capability checking to job type --- args/jobs.go | 23 ----------------------- args/linkedin.go | 2 +- args/tiktok.go | 2 +- args/twitter.go | 2 +- args/web.go | 2 +- types/jobs.go | 20 ++++++++++++++++++++ 6 files changed, 24 insertions(+), 27 deletions(-) delete mode 100644 args/jobs.go diff --git a/args/jobs.go b/args/jobs.go 
deleted file mode 100644 index d51701d..0000000 --- a/args/jobs.go +++ /dev/null @@ -1,23 +0,0 @@ -package args - -import ( - "fmt" - "slices" - - teetypes "github.com/masa-finance/tee-types/types" -) - -// ValidateCapabilityForJobType validates that a capability is supported for the given job type -func ValidateCapabilityForJobType(jobType teetypes.JobType, capability teetypes.Capability) error { - validCaps, exists := teetypes.JobCapabilityMap[jobType] - if !exists { - return fmt.Errorf("unknown job type: %s", jobType) - } - - if !slices.Contains(validCaps, capability) { - return fmt.Errorf("capability '%s' is not valid for job type '%s'. Valid capabilities: %v", - capability, jobType, validCaps) - } - - return nil -} diff --git a/args/linkedin.go b/args/linkedin.go index dcdad5f..721f06b 100644 --- a/args/linkedin.go +++ b/args/linkedin.go @@ -76,7 +76,7 @@ func (l *LinkedInArguments) ValidateForJobType(jobType teetypes.JobType) error { } // Validate QueryType against job-specific capabilities - return ValidateCapabilityForJobType(jobType, teetypes.Capability(l.QueryType)) + return jobType.ValidateCapability(teetypes.Capability(l.QueryType)) } // GetCapability returns the QueryType as a typed Capability diff --git a/args/tiktok.go b/args/tiktok.go index bce3e73..4ce185a 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -97,7 +97,7 @@ func (t *TikTokTranscriptionArguments) ValidateForJobType(jobType teetypes.JobTy } // Validate capability against job-specific capabilities - return ValidateCapabilityForJobType(jobType, t.GetCapability()) + return jobType.ValidateCapability(t.GetCapability()) } // validateLanguageCode validates the language code format diff --git a/args/twitter.go b/args/twitter.go index 8fce9a4..55d875a 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -62,7 +62,7 @@ func (t *TwitterSearchArguments) ValidateForJobType(jobType teetypes.JobType) er } // Validate QueryType against job-specific capabilities - return ValidateCapabilityForJobType(jobType, teetypes.Capability(t.QueryType)) + return jobType.ValidateCapability(teetypes.Capability(t.QueryType)) } // GetCapability returns the QueryType as a typed Capability diff --git a/args/web.go b/args/web.go index fea71e0..39b0b7a 100644 --- a/args/web.go +++ b/args/web.go @@ -70,7 +70,7 @@ func (w *WebSearchArguments) ValidateForJobType(jobType teetypes.JobType) error } // Validate capability against job-specific capabilities - return ValidateCapabilityForJobType(jobType, w.GetCapability()) + return jobType.ValidateCapability(w.GetCapability()) } // GetCapability returns the capability for web operations (always scraper) diff --git a/types/jobs.go b/types/jobs.go index c2373fb..0174f5a 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -1,5 +1,10 @@ package types +import ( + "fmt" + "slices" +) + type JobType string type Capability string type WorkerCapabilities map[JobType][]Capability @@ -9,6 +14,21 @@ func (j JobType) String() string { return string(j) } +// ValidateCapability validates that a capability is supported for this job type +func (j JobType) ValidateCapability(capability Capability) error { + validCaps, exists := JobCapabilityMap[j] + if !exists { + return fmt.Errorf("unknown job type: %s", j) + } + + if !slices.Contains(validCaps, capability) { + return fmt.Errorf("capability '%s' is not valid for job type '%s'. 
Valid capabilities: %v", + capability, j, validCaps) + } + + return nil +} + // combineCapabilities combines multiple capability slices and ensures uniqueness func combineCapabilities(capSlices ...[]Capability) []Capability { seen := make(map[Capability]bool) From 219c68d833c13933477882ed7cdcc3a11cbbc8fc Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 1 Aug 2025 20:31:36 +0200 Subject: [PATCH 044/136] chore: simplified tiktok url validation --- args/tiktok.go | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/args/tiktok.go b/args/tiktok.go index 4ce185a..4b57ed2 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -28,11 +28,6 @@ func (t *TikTokTranscriptionArguments) UnmarshalJSON(data []byte) error { return fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) } - // // Normalize language to lowercase if provided - // if t.Language != "" { - // t.Language = strings.ToLower(t.Language) - // } - return t.Validate() } @@ -71,10 +66,7 @@ func (t *TikTokTranscriptionArguments) GetCapability() teetypes.Capability { // IsTikTokURL validates if the URL is a TikTok URL func (t *TikTokTranscriptionArguments) IsTikTokURL(parsedURL *url.URL) bool { host := strings.ToLower(parsedURL.Host) - return host == "tiktok.com" || - host == "www.tiktok.com" || - host == "vm.tiktok.com" || - strings.HasSuffix(host, ".tiktok.com") + return host == "tiktok.com" || strings.HasSuffix(host, ".tiktok.com") } // HasLanguagePreference returns true if a language preference is specified From b5867b56549dcf02d91e7aeed4501a59442f4d17 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 1 Aug 2025 20:32:32 +0200 Subject: [PATCH 045/136] fix: remove unecessary ifs --- args/twitter.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/args/twitter.go b/args/twitter.go index 55d875a..a32f024 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -33,9 +33,7 @@ func (t *TwitterSearchArguments) UnmarshalJSON(data []byte) error { } // Normalize QueryType to lowercase - if t.QueryType != "" { - t.QueryType = strings.ToLower(t.QueryType) - } + t.QueryType = strings.ToLower(t.QueryType) return t.Validate() } From 3469af4935386ef4dceb52aac5f4d37ea3c4dba3 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 1 Aug 2025 20:47:24 +0200 Subject: [PATCH 046/136] fix: removes interface --- args/unmarshaller.go | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 7308411..efd1b78 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -7,15 +7,15 @@ import ( "github.com/masa-finance/tee-types/types" ) -// JobArgumentsInterface defines the interface that all job arguments must implement -type JobArgumentsInterface interface { +// JobArguments defines the interface that all job arguments must implement +type JobArguments interface { Validate() error GetCapability() types.Capability } -// TwitterJobArgumentsInterface extends JobArgumentsInterface for Twitter-specific methods -type TwitterJobArgumentsInterface interface { - JobArgumentsInterface +// TwitterJobArguments extends JobArguments for Twitter-specific methods +type TwitterJobArguments interface { + JobArguments ValidateForJobType(jobType types.JobType) error IsSingleTweetOperation() bool IsMultipleTweetOperation() bool @@ -25,32 +25,32 @@ type TwitterJobArgumentsInterface interface { IsTrendsOperation() bool } -// WebJobArgumentsInterface extends JobArgumentsInterface for Web-specific methods -type 
WebJobArgumentsInterface interface { - JobArgumentsInterface +// WebJobArguments extends JobArguments for Web-specific methods +type WebJobArguments interface { + JobArguments ValidateForJobType(jobType types.JobType) error IsDeepScrape() bool HasSelector() bool GetEffectiveMaxDepth() int } -// TikTokJobArgumentsInterface extends JobArgumentsInterface for TikTok-specific methods -type TikTokJobArgumentsInterface interface { - JobArgumentsInterface +// TikTokJobArguments extends JobArguments for TikTok-specific methods +type TikTokJobArguments interface { + JobArguments ValidateForJobType(jobType types.JobType) error HasLanguagePreference() bool GetLanguageCode() string } -// LinkedInJobArgumentsInterface extends JobArgumentsInterface for LinkedIn-specific methods -type LinkedInJobArgumentsInterface interface { - JobArgumentsInterface +// LinkedInJobArguments extends JobArguments for LinkedIn-specific methods +type LinkedInJobArguments interface { + JobArguments ValidateForJobType(jobType types.JobType) error } // UnmarshalJobArguments unmarshals job arguments from a generic map into the appropriate typed struct // This works with both tee-indexer and tee-worker JobArguments types -func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgumentsInterface, error) { +func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArguments, error) { switch jobType { case types.WebJob: return unmarshalWebArguments(args) @@ -135,7 +135,7 @@ func (t *TelemetryJobArguments) GetCapability() types.Capability { } // Type assertion helpers -func AsWebArguments(args JobArgumentsInterface) (WebJobArgumentsInterface, bool) { +func AsWebArguments(args JobArguments) (WebJobArguments, bool) { webArgs, ok := args.(*WebSearchArguments) if !ok { return nil, false @@ -143,7 +143,7 @@ func AsWebArguments(args JobArgumentsInterface) (WebJobArgumentsInterface, bool) return webArgs, true } -func AsTwitterArguments(args JobArgumentsInterface) (TwitterJobArgumentsInterface, bool) { +func AsTwitterArguments(args JobArguments) (TwitterJobArguments, bool) { twitterArgs, ok := args.(*TwitterSearchArguments) if !ok { return nil, false @@ -151,7 +151,7 @@ func AsTwitterArguments(args JobArgumentsInterface) (TwitterJobArgumentsInterfac return twitterArgs, true } -func AsTikTokArguments(args JobArgumentsInterface) (TikTokJobArgumentsInterface, bool) { +func AsTikTokArguments(args JobArguments) (TikTokJobArguments, bool) { tiktokArgs, ok := args.(*TikTokTranscriptionArguments) if !ok { return nil, false @@ -159,12 +159,12 @@ func AsTikTokArguments(args JobArgumentsInterface) (TikTokJobArgumentsInterface, return tiktokArgs, true } -func AsTelemetryArguments(args JobArgumentsInterface) (*TelemetryJobArguments, bool) { +func AsTelemetryArguments(args JobArguments) (*TelemetryJobArguments, bool) { telemetryArgs, ok := args.(*TelemetryJobArguments) return telemetryArgs, ok } -func AsLinkedInArguments(args JobArgumentsInterface) (LinkedInJobArgumentsInterface, bool) { +func AsLinkedInArguments(args JobArguments) (LinkedInJobArguments, bool) { linkedInArgs, ok := args.(*LinkedInArguments) if !ok { return nil, false From ca00ba2cd813e8d7e0ab12b3665334dd9fc4326e Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 5 Aug 2025 21:07:56 +0200 Subject: [PATCH 047/136] fix: favors builtin maps and go idioms for combined capabilities --- types/jobs.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/types/jobs.go b/types/jobs.go index 0174f5a..668a45f 100644 --- a/types/jobs.go 
+++ b/types/jobs.go @@ -3,6 +3,8 @@ package types import ( "fmt" "slices" + + "golang.org/x/exp/maps" ) type JobType string @@ -31,19 +33,15 @@ func (j JobType) ValidateCapability(capability Capability) error { // combineCapabilities combines multiple capability slices and ensures uniqueness func combineCapabilities(capSlices ...[]Capability) []Capability { - seen := make(map[Capability]bool) - var result []Capability + capMap := make(map[Capability]struct{}) for _, capSlice := range capSlices { for _, cap := range capSlice { - if !seen[cap] { - seen[cap] = true - result = append(result, cap) - } + capMap[cap] = struct{}{} } } - return result + return maps.Keys(capMap) } // Job type constants - centralized from tee-indexer and tee-worker From 208cbacd6456ab455a1f20661eccf65e14759f5a Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 5 Aug 2025 21:10:41 +0200 Subject: [PATCH 048/136] chore: lowercase error messaging --- args/unmarshaller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/unmarshaller.go b/args/unmarshaller.go index efd1b78..4ca6635 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -101,7 +101,7 @@ func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*Twi // Perform job-type-specific validation for Twitter if err := twitterArgs.ValidateForJobType(jobType); err != nil { - return nil, fmt.Errorf("Twitter job validation failed: %w", err) + return nil, fmt.Errorf("twitter job validation failed: %w", err) } return twitterArgs, nil From 86fcdf3564b3ae8466fb98fa1fa43f0df25e05a4 Mon Sep 17 00:00:00 2001 From: Grant Foster Date: Thu, 7 Aug 2025 11:24:21 -0700 Subject: [PATCH 049/136] update args/tiktok.go Co-authored-by: Mario Camou --- args/tiktok.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/tiktok.go b/args/tiktok.go index 4b57ed2..2ea223f 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -34,7 +34,7 @@ func (t *TikTokTranscriptionArguments) UnmarshalJSON(data []byte) error { // Validate validates the TikTok arguments func (t *TikTokTranscriptionArguments) Validate() error { if t.VideoURL == "" { - return fmt.Errorf("video_url is required") + return errors.New("video_url is required") } // Validate URL format From 13127c424a1d72e7166a1793198080f729f300bf Mon Sep 17 00:00:00 2001 From: Grant Foster Date: Thu, 7 Aug 2025 11:24:38 -0700 Subject: [PATCH 050/136] update args/tiktok.go Co-authored-by: Mario Camou --- args/tiktok.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/tiktok.go b/args/tiktok.go index 2ea223f..d094fec 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -40,7 +40,7 @@ func (t *TikTokTranscriptionArguments) Validate() error { // Validate URL format parsedURL, err := url.Parse(t.VideoURL) if err != nil { - return fmt.Errorf("invalid video_url format: %w", err) + return errors.New("invalid video_url format: %w", err) } // Basic TikTok URL validation From 507d86b0d3f26c1ad1c56ca88cf493ed8be6063a Mon Sep 17 00:00:00 2001 From: Grant Foster Date: Thu, 7 Aug 2025 11:24:57 -0700 Subject: [PATCH 051/136] update args/tiktok.go Co-authored-by: Mario Camou --- args/tiktok.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/tiktok.go b/args/tiktok.go index d094fec..188ae56 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -45,7 +45,7 @@ func (t *TikTokTranscriptionArguments) Validate() error { // Basic TikTok URL validation if !t.IsTikTokURL(parsedURL) { - return fmt.Errorf("URL must be a valid TikTok video URL") + return 
errors.New("URL must be a valid TikTok video URL") } // Validate language format if provided From a25f9c1afb7961c9dba96a6bf6432bc1e7ff8261 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 7 Aug 2025 20:32:20 +0200 Subject: [PATCH 052/136] fix: uses utils and improves linked unmarshalling --- args/linkedin.go | 49 ++++++++++++++++++++++++++------------------ args/unmarshaller.go | 24 ++++++++++++++++++++++ types/jobs.go | 20 ++++++++++-------- 3 files changed, 65 insertions(+), 28 deletions(-) diff --git a/args/linkedin.go b/args/linkedin.go index 721f06b..ceab1ad 100644 --- a/args/linkedin.go +++ b/args/linkedin.go @@ -32,31 +32,15 @@ func (l *LinkedInArguments) UnmarshalJSON(data []byte) error { } // Normalize QueryType to lowercase - if l.QueryType != "" { - l.QueryType = strings.ToLower(l.QueryType) - } + l.QueryType = strings.ToLower(l.QueryType) return l.Validate() } -// Validate validates the LinkedIn arguments +// Validate validates the LinkedIn arguments (general validation) func (l *LinkedInArguments) Validate() error { - if l.QueryType == "" { - return fmt.Errorf("type is required") - } - - // Validate query type - validTypes := map[string]bool{ - "searchbyquery": true, - "getprofile": true, - } - if !validTypes[l.QueryType] { - return fmt.Errorf("invalid type: %s, must be one of: searchbyquery, getprofile", l.QueryType) - } - - if l.Query == "" { - return fmt.Errorf("query is required") - } + // Note: QueryType is not required for all capabilities, similar to Twitter pattern + // Query is also not required for all capabilities if l.MaxResults < 0 { return fmt.Errorf("max_results must be non-negative, got: %d", l.MaxResults) @@ -84,6 +68,31 @@ func (l *LinkedInArguments) GetCapability() teetypes.Capability { return teetypes.Capability(l.QueryType) } +// IsSearchOperation returns true if this is a search operation +func (l *LinkedInArguments) IsSearchOperation() bool { + capability := l.GetCapability() + return capability == teetypes.CapSearchByQuery +} + +// IsProfileOperation returns true if this is a profile operation +func (l *LinkedInArguments) IsProfileOperation() bool { + capability := l.GetCapability() + return capability == teetypes.CapGetProfile +} + +// HasNetworkFilters returns true if network filters are specified +func (l *LinkedInArguments) HasNetworkFilters() bool { + return len(l.NetworkFilters) > 0 +} + +// GetEffectiveMaxResults returns the effective maximum results, defaulting to a reasonable limit +func (l *LinkedInArguments) GetEffectiveMaxResults() int { + if l.MaxResults <= 0 { + return 10 // Default to 10 results + } + return l.MaxResults +} + // LinkedInSearchArguments is an alias for LinkedInArguments for backward compatibility. // Deprecated: use LinkedInArguments instead. 
type LinkedInSearchArguments = LinkedInArguments diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 4ca6635..e7adf95 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -61,6 +61,9 @@ func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgum case types.TwitterJob, types.TwitterCredentialJob, types.TwitterApiJob, types.TwitterApifyJob: return unmarshalTwitterArguments(jobType, args) + case types.LinkedInJob: + return unmarshalLinkedInArguments(jobType, args) + case types.TelemetryJob: return &TelemetryJobArguments{}, nil @@ -107,6 +110,27 @@ func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*Twi return twitterArgs, nil } +func unmarshalLinkedInArguments(jobType types.JobType, args map[string]any) (*LinkedInArguments, error) { + linkedInArgs := &LinkedInArguments{} + if err := unmarshalToStruct(args, linkedInArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal LinkedIn job arguments: %w", err) + } + + // If no QueryType is specified, use the default capability for this job type + if linkedInArgs.QueryType == "" { + if defaultCap, exists := types.JobDefaultCapabilityMap[jobType]; exists { + linkedInArgs.QueryType = string(defaultCap) + } + } + + // Perform job-type-specific validation for LinkedIn + if err := linkedInArgs.ValidateForJobType(jobType); err != nil { + return nil, fmt.Errorf("linkedin job validation failed: %w", err) + } + + return linkedInArgs, nil +} + // unmarshalToStruct converts a map[string]any to a struct using JSON marshal/unmarshal // This provides the same functionality as the existing JobArguments.Unmarshal methods func unmarshalToStruct(args map[string]any, target any) error { diff --git a/types/jobs.go b/types/jobs.go index 668a45f..0f47f14 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -4,7 +4,7 @@ import ( "fmt" "slices" - "golang.org/x/exp/maps" + "github.com/masa-finance/tee-types/pkg/util" ) type JobType string @@ -33,15 +33,11 @@ func (j JobType) ValidateCapability(capability Capability) error { // combineCapabilities combines multiple capability slices and ensures uniqueness func combineCapabilities(capSlices ...[]Capability) []Capability { - capMap := make(map[Capability]struct{}) - + caps := util.NewSet[Capability]() for _, capSlice := range capSlices { - for _, cap := range capSlice { - capMap[cap] = struct{}{} - } + caps.Add(capSlice...) 
} - - return maps.Keys(capMap) + return caps.Items() } // Job type constants - centralized from tee-indexer and tee-worker @@ -53,6 +49,7 @@ const ( TwitterCredentialJob JobType = "twitter-credential" // Twitter scraping with credentials TwitterApiJob JobType = "twitter-api" // Twitter scraping with API keys TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify + LinkedInJob JobType = "linkedin" // LinkedIn scraping and profile operations ) // Capability constants - typed to prevent typos and enable discoverability @@ -75,6 +72,7 @@ const ( CapGetFollowing Capability = "getfollowing" CapGetFollowers Capability = "getfollowers" CapGetSpace Capability = "getspace" + CapGetProfile Capability = "getprofile" // LinkedIn get profile capability CapEmpty Capability = "" ) @@ -83,12 +81,14 @@ var ( AlwaysAvailableWebCaps = []Capability{CapScraper, CapEmpty} AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry, CapEmpty} AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapEmpty} + AlwaysAvailableLinkedInCaps = []Capability{CapSearchByQuery, CapGetProfile, CapEmpty} // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ WebJob: AlwaysAvailableWebCaps, TelemetryJob: AlwaysAvailableTelemetryCaps, TiktokJob: AlwaysAvailableTiktokCaps, + LinkedInJob: AlwaysAvailableLinkedInCaps, } // TwitterCredentialCaps are all Twitter capabilities available with credential-based auth @@ -123,6 +123,9 @@ var JobCapabilityMap = map[JobType][]Capability{ ), TwitterApifyJob: TwitterApifyCaps, + // LinkedIn job capabilities + LinkedInJob: AlwaysAvailableLinkedInCaps, + // Web job capabilities WebJob: AlwaysAvailableWebCaps, @@ -139,6 +142,7 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TwitterCredentialJob: CapSearchByQuery, TwitterApiJob: CapSearchByQuery, TwitterApifyJob: CapGetFollowers, + LinkedInJob: CapSearchByQuery, WebJob: CapScraper, TiktokJob: CapTranscription, TelemetryJob: CapTelemetry, From cb4ed485deb154de440ce8fdbadf45526491ae2d Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 7 Aug 2025 20:42:04 +0200 Subject: [PATCH 053/136] fix: adds max util where possible --- args/linkedin.go | 6 ++---- args/tiktok.go | 3 ++- args/web.go | 6 ++---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/args/linkedin.go b/args/linkedin.go index ceab1ad..b7df066 100644 --- a/args/linkedin.go +++ b/args/linkedin.go @@ -5,6 +5,7 @@ import ( "fmt" "strings" + "github.com/masa-finance/tee-types/pkg/util" teetypes "github.com/masa-finance/tee-types/types" ) @@ -87,10 +88,7 @@ func (l *LinkedInArguments) HasNetworkFilters() bool { // GetEffectiveMaxResults returns the effective maximum results, defaulting to a reasonable limit func (l *LinkedInArguments) GetEffectiveMaxResults() int { - if l.MaxResults <= 0 { - return 10 // Default to 10 results - } - return l.MaxResults + return util.Max(l.MaxResults, 10) } // LinkedInSearchArguments is an alias for LinkedInArguments for backward compatibility. 
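A minimal sketch of the floor semantics that util.Max gives GetEffectiveMaxResults above (illustrative only; it assumes the module's public args package import path and direct struct construction rather than JSON unmarshalling):

    package main

    import (
        "fmt"

        "github.com/masa-finance/tee-types/args"
    )

    func main() {
        la := &args.LinkedInArguments{QueryType: "searchbyquery", Query: "golang"}
        // MaxResults is zero, so util.Max(0, 10) supplies the default floor of 10.
        fmt.Println(la.GetEffectiveMaxResults()) // 10
        la.MaxResults = 25
        // Values at or above the floor pass through unchanged.
        fmt.Println(la.GetEffectiveMaxResults()) // 25
    }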
diff --git a/args/tiktok.go b/args/tiktok.go index 188ae56..9fa5aad 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -2,6 +2,7 @@ package args import ( "encoding/json" + "errors" "fmt" "net/url" "strings" @@ -40,7 +41,7 @@ func (t *TikTokTranscriptionArguments) Validate() error { // Validate URL format parsedURL, err := url.Parse(t.VideoURL) if err != nil { - return errors.New("invalid video_url format: %w", err) + return errors.New("invalid video_url format") } // Basic TikTok URL validation diff --git a/args/web.go b/args/web.go index 39b0b7a..06a5699 100644 --- a/args/web.go +++ b/args/web.go @@ -6,6 +6,7 @@ import ( "net/url" teetypes "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-types/pkg/util" ) type WebSearchArguments struct { @@ -90,8 +91,5 @@ func (w *WebSearchArguments) HasSelector() bool { // GetEffectiveMaxDepth returns the effective maximum depth for scraping func (w *WebSearchArguments) GetEffectiveMaxDepth() int { - if w.MaxDepth <= 0 { - return 1 // Default to single page - } - return w.MaxDepth + return util.Max(w.MaxDepth, 1) } From d9c63c756203b70404f64bc49946be9ab3c29181 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 7 Aug 2025 20:59:30 +0200 Subject: [PATCH 054/136] fix: reorder imports --- args/web.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/web.go b/args/web.go index 06a5699..f1f473f 100644 --- a/args/web.go +++ b/args/web.go @@ -5,8 +5,8 @@ import ( "fmt" "net/url" - teetypes "github.com/masa-finance/tee-types/types" "github.com/masa-finance/tee-types/pkg/util" + teetypes "github.com/masa-finance/tee-types/types" ) type WebSearchArguments struct { From 7c4bb8de7192531caf8624064e6e5dc7aa3ef10a Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 7 Aug 2025 21:32:51 +0200 Subject: [PATCH 055/136] fix: removes linked from available capabilities --- types/jobs.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/types/jobs.go b/types/jobs.go index 0f47f14..6683411 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -49,7 +49,7 @@ const ( TwitterCredentialJob JobType = "twitter-credential" // Twitter scraping with credentials TwitterApiJob JobType = "twitter-api" // Twitter scraping with API keys TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify - LinkedInJob JobType = "linkedin" // LinkedIn scraping and profile operations + LinkedInJob JobType = "linkedin" // LinkedIn scraping, keeping for unmarshalling logic ) // Capability constants - typed to prevent typos and enable discoverability @@ -88,7 +88,7 @@ var ( WebJob: AlwaysAvailableWebCaps, TelemetryJob: AlwaysAvailableTelemetryCaps, TiktokJob: AlwaysAvailableTiktokCaps, - LinkedInJob: AlwaysAvailableLinkedInCaps, + // LinkedInJob: AlwaysAvailableLinkedInCaps, } // TwitterCredentialCaps are all Twitter capabilities available with credential-based auth @@ -124,7 +124,7 @@ var JobCapabilityMap = map[JobType][]Capability{ TwitterApifyJob: TwitterApifyCaps, // LinkedIn job capabilities - LinkedInJob: AlwaysAvailableLinkedInCaps, + // LinkedInJob: AlwaysAvailableLinkedInCaps, // Web job capabilities WebJob: AlwaysAvailableWebCaps, @@ -142,8 +142,8 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TwitterCredentialJob: CapSearchByQuery, TwitterApiJob: CapSearchByQuery, TwitterApifyJob: CapGetFollowers, - LinkedInJob: CapSearchByQuery, - WebJob: CapScraper, - TiktokJob: CapTranscription, - TelemetryJob: CapTelemetry, + // LinkedInJob: CapSearchByQuery, + WebJob: CapScraper, + TiktokJob: 
CapTranscription, + TelemetryJob: CapTelemetry, } From 979587f78799886f76ef383d497001fdd016b9dc Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 7 Aug 2025 21:34:50 +0200 Subject: [PATCH 056/136] fix: clean up linked capability --- types/jobs.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/types/jobs.go b/types/jobs.go index 6683411..945eb63 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -88,7 +88,6 @@ var ( WebJob: AlwaysAvailableWebCaps, TelemetryJob: AlwaysAvailableTelemetryCaps, TiktokJob: AlwaysAvailableTiktokCaps, - // LinkedInJob: AlwaysAvailableLinkedInCaps, } // TwitterCredentialCaps are all Twitter capabilities available with credential-based auth @@ -123,9 +122,6 @@ var JobCapabilityMap = map[JobType][]Capability{ ), TwitterApifyJob: TwitterApifyCaps, - // LinkedIn job capabilities - // LinkedInJob: AlwaysAvailableLinkedInCaps, - // Web job capabilities WebJob: AlwaysAvailableWebCaps, @@ -142,8 +138,7 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TwitterCredentialJob: CapSearchByQuery, TwitterApiJob: CapSearchByQuery, TwitterApifyJob: CapGetFollowers, - // LinkedInJob: CapSearchByQuery, - WebJob: CapScraper, - TiktokJob: CapTranscription, - TelemetryJob: CapTelemetry, + WebJob: CapScraper, + TiktokJob: CapTranscription, + TelemetryJob: CapTelemetry, } From 5876a14e8cd62aa854bf3cbf631e5de27bbba02f Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 8 Aug 2025 20:53:38 +0200 Subject: [PATCH 057/136] chore: address final comments in tiktok and twitter --- args/tiktok.go | 2 +- types/twitter.go | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/args/tiktok.go b/args/tiktok.go index 9fa5aad..dd68e6f 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -41,7 +41,7 @@ func (t *TikTokTranscriptionArguments) Validate() error { // Validate URL format parsedURL, err := url.Parse(t.VideoURL) if err != nil { - return errors.New("invalid video_url format") + return fmt.Errorf("invalid video_url format: %w", err) } // Basic TikTok URL validation diff --git a/types/twitter.go b/types/twitter.go index 74b812b..cb020e6 100644 --- a/types/twitter.go +++ b/types/twitter.go @@ -138,9 +138,17 @@ type ProfileResultApify struct { } type ProfileEntities struct { - Description DescriptionEntities `json:"description"` + URL *URLEntities `json:"url,omitempty"` + Description *URLEntities `json:"description,omitempty"` } -type DescriptionEntities struct { - URLs []any `json:"urls"` +type URLEntities struct { + URLs []URLEntity `json:"urls,omitempty"` +} + +type URLEntity struct { + URL string `json:"url"` + ExpandedURL string `json:"expanded_url"` + DisplayURL string `json:"display_url"` + Indices []int `json:"indices"` } From b8bfa5fe2247e8dd923c99e088fab2e418c06905 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 12 Aug 2025 02:51:29 +0200 Subject: [PATCH 058/136] fix: video url getter on tiktok tee types --- args/tiktok.go | 5 +++++ args/unmarshaller.go | 1 + 2 files changed, 6 insertions(+) diff --git a/args/tiktok.go b/args/tiktok.go index dd68e6f..cfe60fc 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -75,6 +75,11 @@ func (t *TikTokTranscriptionArguments) HasLanguagePreference() bool { return t.Language != "" } +// GetVideoURL returns the source video URL +func (t *TikTokTranscriptionArguments) GetVideoURL() string { + return t.VideoURL +} + // GetLanguageCode returns the language code, defaulting to "en-us" if not specified func (t *TikTokTranscriptionArguments) GetLanguageCode() string { if t.Language == 
"" { diff --git a/args/unmarshaller.go b/args/unmarshaller.go index e7adf95..f8d0319 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -39,6 +39,7 @@ type TikTokJobArguments interface { JobArguments ValidateForJobType(jobType types.JobType) error HasLanguagePreference() bool + GetVideoURL() string GetLanguageCode() string } From 482839a9a84112d4012d1cdcdb1eaeb6a8c473fe Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 12 Aug 2025 03:04:32 +0200 Subject: [PATCH 059/136] fix: web unmarshaller --- args/unmarshaller.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/args/unmarshaller.go b/args/unmarshaller.go index f8d0319..353bf8f 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -160,12 +160,9 @@ func (t *TelemetryJobArguments) GetCapability() types.Capability { } // Type assertion helpers -func AsWebArguments(args JobArguments) (WebJobArguments, bool) { +func AsWebArguments(args JobArguments) (*WebSearchArguments, bool) { webArgs, ok := args.(*WebSearchArguments) - if !ok { - return nil, false - } - return webArgs, true + return webArgs, ok } func AsTwitterArguments(args JobArguments) (TwitterJobArguments, bool) { From 6df9f912720262a9f6d0cc7896ca0864a8f7fac7 Mon Sep 17 00:00:00 2001 From: mcamou Date: Tue, 12 Aug 2025 19:55:27 +0200 Subject: [PATCH 060/136] Add DO NOT MERGE / FIXME test for merging --- .github/workflows/tests.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index ad34dad..d8135e3 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -52,3 +52,16 @@ jobs: - name: Run unit tests run: | go test ./... + + ready-to-merge: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Verify that merging is OK + run: | + if grep -rE 'DO[ ]NOT[ ]MERGE|[F]IXME' .; then + exit 1 + fi From e2e1f30e1d242353236c44e6d7baa74d5def2e3f Mon Sep 17 00:00:00 2001 From: mcamou Date: Tue, 12 Aug 2025 20:00:56 +0200 Subject: [PATCH 061/136] Add space before FIXME --- .github/workflows/tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index d8135e3..3e6b924 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -62,6 +62,6 @@ jobs: - name: Verify that merging is OK run: | - if grep -rE 'DO[ ]NOT[ ]MERGE|[F]IXME' .; then + if grep -rE 'DO[ ]NOT[ ]MERGE|[ ]FIXME' .; then exit 1 fi From 5d2c4db0a5efff8e09988b2b7505e075baa4b84d Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 14 Aug 2025 19:37:22 +0200 Subject: [PATCH 062/136] feat: adds profile result type from scraper and json tags for twitter types --- types/twitter.go | 129 +++++++++++++++++++++++++++++------------------ 1 file changed, 81 insertions(+), 48 deletions(-) diff --git a/types/twitter.go b/types/twitter.go index cb020e6..e699fbd 100644 --- a/types/twitter.go +++ b/types/twitter.go @@ -4,72 +4,72 @@ package types import "time" type TweetResult struct { - ID int64 `json:"id"` - TweetID string - ConversationID string - UserID string - Text string - CreatedAt time.Time - Timestamp int64 + ID int64 `json:"id"` + TweetID string `json:"tweet_id"` + ConversationID string `json:"conversation_id"` + UserID string `json:"user_id"` + Text string `json:"text"` + CreatedAt time.Time `json:"created_at"` + Timestamp int64 `json:"timestamp"` ThreadCursor struct { - FocalTweetID string - ThreadID string - Cursor string - CursorType 
string + FocalTweetID string `json:"focal_tweet_id"` + ThreadID string `json:"thread_id"` + Cursor string `json:"cursor"` + CursorType string `json:"cursor_type"` } - IsQuoted bool - IsPin bool - IsReply bool - IsRetweet bool - IsSelfThread bool - Likes int - Hashtags []string - HTML string - Replies int - Retweets int - URLs []string - Username string + IsQuoted bool `json:"is_quoted"` + IsPin bool `json:"is_pin"` + IsReply bool `json:"is_reply"` + IsRetweet bool `json:"is_retweet"` + IsSelfThread bool `json:"is_self_thread"` + Likes int `json:"likes"` + Hashtags []string `json:"hashtags"` + HTML string `json:"html"` + Replies int `json:"replies"` + Retweets int `json:"retweets"` + URLs []string `json:"urls"` + Username string `json:"username"` - Photos []Photo + Photos []Photo `json:"photos"` // Video type. - Videos []Video + Videos []Video `json:"videos"` - RetweetedStatusID string - Views int - SensitiveContent bool + RetweetedStatusID string `json:"retweeted_status_id"` + Views int `json:"views"` + SensitiveContent bool `json:"sensitive_content"` // from twitterx - AuthorID string - PublicMetrics PublicMetrics - PossiblySensitive bool - Lang string - NewestID string - OldestID string - ResultCount int + AuthorID string `json:"author_id"` + PublicMetrics PublicMetrics `json:"public_metrics"` + PossiblySensitive bool `json:"possibly_sensitive"` + Lang string `json:"lang"` + NewestID string `json:"newest_id"` + OldestID string `json:"oldest_id"` + ResultCount int `json:"result_count"` - Error error + Error error `json:"error"` } type PublicMetrics struct { - RetweetCount int - ReplyCount int - LikeCount int - QuoteCount int - BookmarkCount int - ImpressionCount int + RetweetCount int `json:"retweet_count"` + ReplyCount int `json:"reply_count"` + LikeCount int `json:"like_count"` + QuoteCount int `json:"quote_count"` + BookmarkCount int `json:"bookmark_count"` + ImpressionCount int `json:"impression_count"` } type Photo struct { - ID string - URL string + ID string `json:"id"` + URL string `json:"url"` } type Video struct { - ID string - Preview string - URL string - HLSURL string + ID string `json:"id"` + Preview string `json:"preview"` + URL string `json:"url"` + HLSURL string `json:"hls_url"` } type ProfileResultApify struct { @@ -152,3 +152,36 @@ type URLEntity struct { DisplayURL string `json:"display_url"` Indices []int `json:"indices"` } + +type ProfileResultScraper struct { + Avatar string `json:"avatar"` + Banner string `json:"banner"` + Biography string `json:"biography"` + Birthday string `json:"birthday"` + FollowersCount int `json:"followers_count"` + FollowingCount int `json:"following_count"` + FriendsCount int `json:"friends_count"` + IsPrivate bool `json:"is_private"` + IsVerified bool `json:"is_verified"` + IsBlueVerified bool `json:"is_blue_verified"` + Joined *time.Time `json:"joined"` + LikesCount int `json:"likes_count"` + ListedCount int `json:"listed_count"` + Location string `json:"location"` + Name string `json:"name"` + PinnedTweetIDs []string `json:"pinned_tweet_ids"` + TweetsCount int `json:"tweets_count"` + URL string `json:"url"` + UserID string `json:"user_id"` + Username string `json:"username"` + Website string `json:"website"` + Sensitive bool `json:"sensitive"` + Following bool `json:"following"` + FollowedBy bool `json:"followed_by"` + MediaCount int `json:"media_count"` + FastFollowersCount int `json:"fast_followers_count"` + NormalFollowersCount int `json:"normal_followers_count"` + ProfileImageShape string `json:"profile_image_shape"` + 
HasGraduatedAccess bool `json:"has_graduated_access"` + CanHighlightTweets bool `json:"can_highlight_tweets"` +} From 31c068560772cb65f57ffdae98cb1d762654ddb5 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 15 Aug 2025 03:37:33 +0200 Subject: [PATCH 063/136] fix: eng-US default --- args/tiktok.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/tiktok.go b/args/tiktok.go index cfe60fc..6c487e6 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -83,7 +83,7 @@ func (t *TikTokTranscriptionArguments) GetVideoURL() string { // GetLanguageCode returns the language code, defaulting to "en-us" if not specified func (t *TikTokTranscriptionArguments) GetLanguageCode() string { if t.Language == "" { - return "en-US" + return "eng-US" } return t.Language } From 9a02ea545cb08a61d9f18215218f33eedaf28b12 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 15 Aug 2025 19:24:50 +0200 Subject: [PATCH 064/136] chore: adds searchbyquery for tiktok job type --- types/jobs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/types/jobs.go b/types/jobs.go index 945eb63..9e0ea4c 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -80,7 +80,7 @@ const ( var ( AlwaysAvailableWebCaps = []Capability{CapScraper, CapEmpty} AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry, CapEmpty} - AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapEmpty} + AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapSearchByQuery, CapEmpty} AlwaysAvailableLinkedInCaps = []Capability{CapSearchByQuery, CapGetProfile, CapEmpty} // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration From 90efdd2fcda5ce6de7ebe652f80bd7b46b62e7a2 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 15 Aug 2025 19:26:54 +0200 Subject: [PATCH 065/136] fix: updates default tiktok capability --- types/jobs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/types/jobs.go b/types/jobs.go index 9e0ea4c..2f1f66d 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -139,6 +139,6 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TwitterApiJob: CapSearchByQuery, TwitterApifyJob: CapGetFollowers, WebJob: CapScraper, - TiktokJob: CapTranscription, + TiktokJob: CapSearchByQuery, TelemetryJob: CapTelemetry, } From 5211f8e420d68ccfa93753d5577bcabb849f0f21 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 15 Aug 2025 19:41:40 +0200 Subject: [PATCH 066/136] feat: adds types for search by query on tiktok --- types/tiktok.go | 152 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/types/tiktok.go b/types/tiktok.go index 94f9f65..ab41efd 100644 --- a/types/tiktok.go +++ b/types/tiktok.go @@ -9,3 +9,155 @@ type TikTokTranscriptionResult struct { OriginalURL string `json:"original_url"` ThumbnailURL string `json:"thumbnail_url,omitempty"` } + +type TikTokSearchByQueryResult struct { + URL string `json:"url"` + ID string `json:"id"` + Desc string `json:"desc"` + CreateTime string `json:"create_time"` + ScheduleTime int64 `json:"schedule_time"` + Video TikTokVideo `json:"video"` + Author string `json:"author"` + Music TikTokMusic `json:"music"` + Challenges []any `json:"challenges"` // we don't have examples of this data yet... + Stats TikTokStats `json:"stats"` + IsActivityItem bool `json:"is_activity_item"` + DuetInfo TikTokDuetInfo `json:"duet_info"` + WarnInfo []any `json:"warn_info"` // we don't have examples of this data yet... 
+ OriginalItem bool `json:"original_item"` + OfficalItem bool `json:"offical_item"` + TextExtra []TikTokTextExtra `json:"text_extra"` + Secret bool `json:"secret"` + ForFriend bool `json:"for_friend"` + Digged bool `json:"digged"` + ItemCommentStatus int `json:"item_comment_status"` + ShowNotPass bool `json:"show_not_pass"` + VL1 bool `json:"vl1"` + TakeDown int `json:"take_down"` + ItemMute bool `json:"item_mute"` + EffectStickers []any `json:"effect_stickers"` // we don't have examples of this data yet... + AuthorStats TikTokAuthorStats `json:"author_stats"` + PrivateItem bool `json:"private_item"` + DuetEnabled bool `json:"duet_enabled"` + StitchEnabled bool `json:"stitch_enabled"` + StickersOnItem []any `json:"stickers_on_item"` // we don't have examples of this data yet... + IsAd bool `json:"is_ad"` + ShareEnabled bool `json:"share_enabled"` + Comments []any `json:"comments"` // we don't have examples of this data yet... + DuetDisplay int `json:"duet_display"` + StitchDisplay int `json:"stitch_display"` + IndexEnabled bool `json:"index_enabled"` + DiversificationLabels []string `json:"diversification_labels"` + AdAuthorization bool `json:"ad_authorization"` + AdLabelVersion int `json:"ad_label_version"` + LocationCreated string `json:"location_created"` + Nickname string `json:"nickname"` + AuthorID string `json:"author_id"` + AuthorSecID string `json:"author_sec_id"` + AvatarThumb string `json:"avatar_thumb"` + DownloadSetting int `json:"download_setting"` + AuthorPrivate bool `json:"author_private"` +} + +type TikTokVideo struct { + ID string `json:"id"` + Height int `json:"height"` + Width int `json:"width"` + Duration int `json:"duration"` + Ratio string `json:"ratio"` + Cover string `json:"cover"` + OriginCover string `json:"origin_cover"` + DynamicCover string `json:"dynamic_cover"` + PlayAddr string `json:"play_addr"` + DownloadAddr string `json:"download_addr"` + ShareCover []string `json:"share_cover"` + ReflowCover string `json:"reflow_cover"` + Bitrate int `json:"bitrate"` + EncodedType string `json:"encoded_type"` + Format string `json:"format"` + VideoQuality string `json:"video_quality"` + EncodeUserTag string `json:"encode_user_tag"` + CodecType string `json:"codec_type"` + Definition string `json:"definition"` + SubtitleInfos []any `json:"subtitle_infos"` // we don't have examples of this data yet... 
+ ZoomCover TikTokZoomCover `json:"zoom_cover"` + VolumeInfo TikTokVolumeInfo `json:"volume_info"` + BitrateInfo []TikTokBitrateInfo `json:"bitrate_info"` +} + +type TikTokZoomCover struct { + Cover240 string `json:"240"` + Cover480 string `json:"480"` + Cover720 string `json:"720"` + Cover960 string `json:"960"` +} + +type TikTokVolumeInfo struct { + Loudness float64 `json:"loudness"` + Peak float64 `json:"peak"` +} + +type TikTokBitrateInfo struct { + GearName string `json:"gear_name"` + Bitrate int `json:"bitrate"` + QualityType int `json:"quality_type"` + PlayAddr TikTokPlayAddr `json:"play_addr"` + CodecType string `json:"codec_type"` +} + +type TikTokPlayAddr struct { + Uri string `json:"uri"` + UrlList []string `json:"url_list"` + DataSize string `json:"data_size"` + UrlKey string `json:"url_key"` + FileHash string `json:"file_hash"` + FileCs string `json:"file_cs"` +} + +type TikTokMusic struct { + ID string `json:"id"` + Title string `json:"title"` + PlayURL string `json:"play_url"` + CoverLarge string `json:"cover_large"` + CoverMedium string `json:"cover_medium"` + CoverThumb string `json:"cover_thumb"` + AuthorName string `json:"author_name"` + Original bool `json:"original"` + Duration int `json:"duration"` + Album string `json:"album"` + ScheduleSearchTime int64 `json:"schedule_search_time"` +} + +type TikTokStats struct { + DiggCount int64 `json:"digg_count"` + ShareCount int64 `json:"share_count"` + CommentCount int64 `json:"comment_count"` + PlayCount int64 `json:"play_count"` +} + +type TikTokDuetInfo struct { + DuetFromID string `json:"duet_from_id"` +} + +type TikTokTextExtra struct { + AwemeID string `json:"aweme_id"` + Start int `json:"start"` + End int `json:"end"` + HashtagID string `json:"hashtag_id"` + HashtagName string `json:"hashtag_name"` + Type int `json:"type"` + SubType int `json:"sub_type"` + UserID string `json:"user_id"` + IsCommerce bool `json:"is_commerce"` + UserUniqueID string `json:"user_unique_id"` + SecUID string `json:"sec_uid"` +} + +type TikTokAuthorStats struct { + FollowerCount int64 `json:"follower_count"` + FollowingCount int64 `json:"following_count"` + Heart int64 `json:"heart"` + HeartCount int64 `json:"heart_count"` + VideoCount int64 `json:"video_count"` + DiggCount int64 `json:"digg_count"` +} From f949598c28535e094d0e8fce0f93f94661789e29 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 15 Aug 2025 19:45:14 +0200 Subject: [PATCH 067/136] feat: adds search by trending type --- types/tiktok.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/types/tiktok.go b/types/tiktok.go index ab41efd..938bb2a 100644 --- a/types/tiktok.go +++ b/types/tiktok.go @@ -59,6 +59,17 @@ type TikTokSearchByQueryResult struct { AuthorPrivate bool `json:"author_private"` } +type TikTokSearchByTrending struct { + CountryCode string `json:"country_code"` + Cover string `json:"cover"` + Duration int `json:"duration"` + ID string `json:"id"` + ItemID string `json:"item_id"` + ItemURL string `json:"item_url"` + Region string `json:"region"` + Title string `json:"title"` +} + type TikTokVideo struct { ID string `json:"id"` Height int `json:"height"` From 4d116a2f75c2a3c30293c399d6aef1f0313ce01c Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 15 Aug 2025 19:52:52 +0200 Subject: [PATCH 068/136] feat: adds searchbytrending type --- types/jobs.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/types/jobs.go b/types/jobs.go index 2f1f66d..55e13eb 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -58,6 +58,7 @@ const ( 
CapTelemetry Capability = "telemetry" CapTranscription Capability = "transcription" CapSearchByQuery Capability = "searchbyquery" + CapSearchByTrending Capability = "searchbytrending" CapSearchByFullArchive Capability = "searchbyfullarchive" CapSearchByProfile Capability = "searchbyprofile" CapGetById Capability = "getbyid" @@ -80,7 +81,7 @@ const ( var ( AlwaysAvailableWebCaps = []Capability{CapScraper, CapEmpty} AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry, CapEmpty} - AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapSearchByQuery, CapEmpty} + AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapSearchByQuery, CapSearchByTrending, CapEmpty} AlwaysAvailableLinkedInCaps = []Capability{CapSearchByQuery, CapGetProfile, CapEmpty} // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration From ec331a786a856c8a1a4042d022390de21c9663ca Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 15 Aug 2025 20:23:52 +0200 Subject: [PATCH 069/136] feat: adds tiktok arg unmarshalling --- args/tiktok.go | 116 +++++++++++++++++++++++++++++++++++++++++++ args/unmarshaller.go | 63 +++++++++++++++++------ 2 files changed, 164 insertions(+), 15 deletions(-) diff --git a/args/tiktok.go b/args/tiktok.go index cfe60fc..cf81a4d 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -113,3 +113,119 @@ func (t *TikTokTranscriptionArguments) validateLanguageCode() error { return nil } + +// Proxy settings used by Apify input +type TikTokApifyProxySetting struct { + UseApifyProxy bool `json:"use_apify_proxy"` +} + +// TikTokSearchByQueryArguments defines args for epctex/tiktok-search-scraper +type TikTokSearchByQueryArguments struct { + QueryType string `json:"type"` + + Search []string `json:"search,omitempty"` + StartUrls []string `json:"start_urls,omitempty"` + MaxItems int `json:"max_items,omitempty"` + EndPage int `json:"end_page,omitempty"` + Proxy *TikTokApifyProxySetting `json:"proxy,omitempty"` +} + +func (t *TikTokSearchByQueryArguments) UnmarshalJSON(data []byte) error { + type Alias TikTokSearchByQueryArguments + aux := &struct{ *Alias }{Alias: (*Alias)(t)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("failed to unmarshal TikTok searchbyquery arguments: %w", err) + } + t.QueryType = strings.ToLower(t.QueryType) + if t.Proxy == nil { + t.Proxy = &TikTokApifyProxySetting{UseApifyProxy: true} + } + return t.Validate() +} + +func (t *TikTokSearchByQueryArguments) Validate() error { + if len(t.Search) == 0 && len(t.StartUrls) == 0 { + return errors.New("either 'search' or 'start_urls' is required for searchbyquery") + } + if t.MaxItems < 0 { + return fmt.Errorf("max_items must be non-negative, got: %d", t.MaxItems) + } + if t.EndPage < 0 { + return fmt.Errorf("end_page must be non-negative, got: %d", t.EndPage) + } + return nil +} + +func (t *TikTokSearchByQueryArguments) ValidateForJobType(jobType teetypes.JobType) error { + if err := jobType.ValidateCapability(teetypes.CapSearchByQuery); err != nil { + return err + } + return t.Validate() +} + +func (t *TikTokSearchByQueryArguments) GetCapability() teetypes.Capability { + return teetypes.CapSearchByQuery +} + +// TikTokSearchByTrendingArguments defines args for lexis-solutions/tiktok-trending-videos-scraper +type TikTokSearchByTrendingArguments struct { + QueryType string `json:"type"` + CountryCode string `json:"country_code,omitempty"` + SortBy string `json:"sort_by,omitempty"` + MaxItems int `json:"max_items,omitempty"` + Period string 
`json:"period,omitempty"` // "7" or "30" +} + +func (t *TikTokSearchByTrendingArguments) UnmarshalJSON(data []byte) error { + type Alias TikTokSearchByTrendingArguments + aux := &struct{ *Alias }{Alias: (*Alias)(t)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("failed to unmarshal TikTok searchbytrending arguments: %w", err) + } + t.QueryType = strings.ToLower(t.QueryType) + if t.CountryCode == "" { + t.CountryCode = "US" + } + if t.SortBy == "" { + t.SortBy = "vv" + } + if t.Period == "" { + t.Period = "7" + } + return t.Validate() +} + +func (t *TikTokSearchByTrendingArguments) Validate() error { + allowedCountries := map[string]struct{}{ + "AU": {}, "BR": {}, "CA": {}, "EG": {}, "FR": {}, "DE": {}, "ID": {}, "IL": {}, "IT": {}, "JP": {}, + "MY": {}, "PH": {}, "RU": {}, "SA": {}, "SG": {}, "KR": {}, "ES": {}, "TW": {}, "TH": {}, "TR": {}, + "AE": {}, "GB": {}, "US": {}, "VN": {}, + } + if _, ok := allowedCountries[strings.ToUpper(t.CountryCode)]; !ok { + return fmt.Errorf("invalid country_code '%s'", t.CountryCode) + } + allowedSorts := map[string]struct{}{ + "vv": {}, "like": {}, "comment": {}, "repost": {}, + } + if _, ok := allowedSorts[strings.ToLower(t.SortBy)]; !ok { + return fmt.Errorf("invalid sort_by '%s'", t.SortBy) + } + if t.Period != "7" && t.Period != "30" { + return fmt.Errorf("invalid period '%s' (allowed: '7','30')", t.Period) + } + if t.MaxItems < 0 { + return fmt.Errorf("max_items must be non-negative, got: %d", t.MaxItems) + } + return nil +} + +func (t *TikTokSearchByTrendingArguments) ValidateForJobType(jobType teetypes.JobType) error { + if err := jobType.ValidateCapability(teetypes.CapSearchByTrending); err != nil { + return err + } + return t.Validate() +} + +func (t *TikTokSearchByTrendingArguments) GetCapability() teetypes.Capability { + return teetypes.CapSearchByTrending +} diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 353bf8f..c9f8085 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -3,6 +3,7 @@ package args import ( "encoding/json" "fmt" + "strings" "github.com/masa-finance/tee-types/types" ) @@ -82,12 +83,42 @@ func unmarshalWebArguments(args map[string]any) (*WebSearchArguments, error) { return webArgs, nil } -func unmarshalTikTokArguments(args map[string]any) (*TikTokTranscriptionArguments, error) { - tiktokArgs := &TikTokTranscriptionArguments{} - if err := unmarshalToStruct(args, tiktokArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal TikTok job arguments: %w", err) +func unmarshalTikTokArguments(args map[string]any) (JobArguments, error) { + if v, ok := args["type"]; ok { + if s, ok := v.(string); ok { + capability := types.Capability(strings.ToLower(s)) + if capability == types.CapSearchByQuery { + searchArgs := &TikTokSearchByQueryArguments{} + if err := unmarshalToStruct(args, searchArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal TikTok searchbyquery arguments: %w", err) + } + if err := searchArgs.ValidateForJobType(types.TiktokJob); err != nil { + return nil, fmt.Errorf("tiktok job validation failed: %w", err) + } + return searchArgs, nil + } + if capability == types.CapSearchByTrending { + searchArgs := &TikTokSearchByTrendingArguments{} + if err := unmarshalToStruct(args, searchArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal TikTok searchbytrending arguments: %w", err) + } + if err := searchArgs.ValidateForJobType(types.TiktokJob); err != nil { + return nil, fmt.Errorf("tiktok job validation failed: %w", err) + } + return searchArgs, 
nil + } + } + } + + // Default to transcription args + transcriptionArgs := &TikTokTranscriptionArguments{} + if err := unmarshalToStruct(args, transcriptionArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal TikTok transcription arguments: %w", err) } - return tiktokArgs, nil + if err := transcriptionArgs.ValidateForJobType(types.TiktokJob); err != nil { + return nil, fmt.Errorf("tiktok job validation failed: %w", err) + } + return transcriptionArgs, nil } func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*TwitterSearchArguments, error) { @@ -173,7 +204,7 @@ func AsTwitterArguments(args JobArguments) (TwitterJobArguments, bool) { return twitterArgs, true } -func AsTikTokArguments(args JobArguments) (TikTokJobArguments, bool) { +func AsTikTokArguments(args JobArguments) (TikTokJobArguments, bool) { // Backward compat helper for transcription tiktokArgs, ok := args.(*TikTokTranscriptionArguments) if !ok { return nil, false @@ -181,15 +212,17 @@ func AsTikTokArguments(args JobArguments) (TikTokJobArguments, bool) { return tiktokArgs, true } -func AsTelemetryArguments(args JobArguments) (*TelemetryJobArguments, bool) { - telemetryArgs, ok := args.(*TelemetryJobArguments) - return telemetryArgs, ok +func AsTikTokTranscriptionArguments(args JobArguments) (*TikTokTranscriptionArguments, bool) { + v, ok := args.(*TikTokTranscriptionArguments) + return v, ok } -func AsLinkedInArguments(args JobArguments) (LinkedInJobArguments, bool) { - linkedInArgs, ok := args.(*LinkedInArguments) - if !ok { - return nil, false - } - return linkedInArgs, true +func AsTikTokSearchByQueryArguments(args JobArguments) (*TikTokSearchByQueryArguments, bool) { + v, ok := args.(*TikTokSearchByQueryArguments) + return v, ok +} + +func AsTikTokSearchByTrendingArguments(args JobArguments) (*TikTokSearchByTrendingArguments, bool) { + v, ok := args.(*TikTokSearchByTrendingArguments) + return v, ok } From b7e7dd9dcba1954f22a2a13acc525a8d0e6ff741 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 15 Aug 2025 20:52:02 +0200 Subject: [PATCH 070/136] feat: adds tiktok args --- args/tiktok.go | 15 ++++++++ args/unmarshaller.go | 83 ++++++++++++++++++++++++-------------------- 2 files changed, 60 insertions(+), 38 deletions(-) diff --git a/args/tiktok.go b/args/tiktok.go index cf81a4d..4d6d8ee 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -88,6 +88,21 @@ func (t *TikTokTranscriptionArguments) GetLanguageCode() string { return t.Language } +// TikTokArguments provides a minimal structure to extract the QueryType (json "type") +type TikTokArguments struct { + QueryType string `json:"type"` +} + +func (t *TikTokArguments) UnmarshalJSON(data []byte) error { + type Alias TikTokArguments + aux := &struct{ *Alias }{Alias: (*Alias)(t)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) + } + t.QueryType = strings.ToLower(t.QueryType) + return nil +} + // ValidateForJobType validates TikTok arguments for a specific job type func (t *TikTokTranscriptionArguments) ValidateForJobType(jobType teetypes.JobType) error { if err := t.Validate(); err != nil { diff --git a/args/unmarshaller.go b/args/unmarshaller.go index c9f8085..02aa18b 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -84,41 +84,51 @@ func unmarshalWebArguments(args map[string]any) (*WebSearchArguments, error) { } func unmarshalTikTokArguments(args map[string]any) (JobArguments, error) { - if v, ok := args["type"]; ok { - if s, ok := v.(string); ok { 
- capability := types.Capability(strings.ToLower(s)) - if capability == types.CapSearchByQuery { - searchArgs := &TikTokSearchByQueryArguments{} - if err := unmarshalToStruct(args, searchArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal TikTok searchbyquery arguments: %w", err) - } - if err := searchArgs.ValidateForJobType(types.TiktokJob); err != nil { - return nil, fmt.Errorf("tiktok job validation failed: %w", err) - } - return searchArgs, nil - } - if capability == types.CapSearchByTrending { - searchArgs := &TikTokSearchByTrendingArguments{} - if err := unmarshalToStruct(args, searchArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal TikTok searchbytrending arguments: %w", err) - } - if err := searchArgs.ValidateForJobType(types.TiktokJob); err != nil { - return nil, fmt.Errorf("tiktok job validation failed: %w", err) - } - return searchArgs, nil - } + // Unmarshal minimally to read QueryType like we do for Twitter + minimal := &TikTokArguments{} + if err := unmarshalToStruct(args, minimal); err != nil { + return nil, fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) + } + capability := types.Capability(strings.ToLower(minimal.QueryType)) + if capability == types.Capability("") { + defaultCap, exists := types.JobDefaultCapabilityMap[types.TiktokJob] + if !exists { + return nil, fmt.Errorf("no default capability configured for job type: %s", types.TiktokJob) } + capability = defaultCap } - // Default to transcription args - transcriptionArgs := &TikTokTranscriptionArguments{} - if err := unmarshalToStruct(args, transcriptionArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal TikTok transcription arguments: %w", err) - } - if err := transcriptionArgs.ValidateForJobType(types.TiktokJob); err != nil { - return nil, fmt.Errorf("tiktok job validation failed: %w", err) + switch capability { + case types.CapSearchByQuery: + searchArgs := &TikTokSearchByQueryArguments{} + if err := unmarshalToStruct(args, searchArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal TikTok searchbyquery arguments: %w", err) + } + if err := searchArgs.ValidateForJobType(types.TiktokJob); err != nil { + return nil, fmt.Errorf("tiktok job validation failed: %w", err) + } + return searchArgs, nil + case types.CapSearchByTrending: + searchArgs := &TikTokSearchByTrendingArguments{} + if err := unmarshalToStruct(args, searchArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal TikTok searchbytrending arguments: %w", err) + } + if err := searchArgs.ValidateForJobType(types.TiktokJob); err != nil { + return nil, fmt.Errorf("tiktok job validation failed: %w", err) + } + return searchArgs, nil + case types.CapTranscription: + transcriptionArgs := &TikTokTranscriptionArguments{} + if err := unmarshalToStruct(args, transcriptionArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal TikTok transcription arguments: %w", err) + } + if err := transcriptionArgs.ValidateForJobType(types.TiktokJob); err != nil { + return nil, fmt.Errorf("tiktok job validation failed: %w", err) + } + return transcriptionArgs, nil + default: + return nil, fmt.Errorf("unknown tiktok type: %s", capability) } - return transcriptionArgs, nil } func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*TwitterSearchArguments, error) { @@ -204,13 +214,10 @@ func AsTwitterArguments(args JobArguments) (TwitterJobArguments, bool) { return twitterArgs, true } -func AsTikTokArguments(args JobArguments) (TikTokJobArguments, bool) { // Backward compat helper 
for transcription - tiktokArgs, ok := args.(*TikTokTranscriptionArguments) - if !ok { - return nil, false - } - return tiktokArgs, true -} +// Use specific helpers for TikTok argument types: +// - AsTikTokTranscriptionArguments +// - AsTikTokSearchByQueryArguments +// - AsTikTokSearchByTrendingArguments func AsTikTokTranscriptionArguments(args JobArguments) (*TikTokTranscriptionArguments, bool) { v, ok := args.(*TikTokTranscriptionArguments) From 781daf34657182b0666e54882d3e7a431f942aba Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 15 Aug 2025 21:25:51 +0200 Subject: [PATCH 071/136] fix: move tiktok caps --- types/jobs.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/types/jobs.go b/types/jobs.go index 55e13eb..6d38df8 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -81,7 +81,7 @@ const ( var ( AlwaysAvailableWebCaps = []Capability{CapScraper, CapEmpty} AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry, CapEmpty} - AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapSearchByQuery, CapSearchByTrending, CapEmpty} + AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapEmpty} AlwaysAvailableLinkedInCaps = []Capability{CapSearchByQuery, CapGetProfile, CapEmpty} // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration @@ -105,6 +105,9 @@ var ( // TwitterApifyCaps are Twitter capabilities available with Apify TwitterApifyCaps = []Capability{CapGetFollowers, CapGetFollowing, CapEmpty} + + // TiktokSearchCaps are Tiktok capabilities available with Apify + TiktokSearchCaps = []Capability{CapSearchByQuery, CapSearchByTrending} ) // JobCapabilityMap defines which capabilities are valid for each job type @@ -127,7 +130,10 @@ var JobCapabilityMap = map[JobType][]Capability{ WebJob: AlwaysAvailableWebCaps, // TikTok job capabilities - TiktokJob: AlwaysAvailableTiktokCaps, + TiktokJob: combineCapabilities( + AlwaysAvailableTiktokCaps, + TiktokSearchCaps, + ), // Telemetry job capabilities TelemetryJob: AlwaysAvailableTelemetryCaps, From 8f9eca99d989e31aaa9d8062f869708d73221d1b Mon Sep 17 00:00:00 2001 From: mcamou Date: Thu, 21 Aug 2025 13:27:44 +0200 Subject: [PATCH 072/136] Add types for Reddit scraper --- .gitignore | 3 + args/args_suite_test.go | 13 +++ args/reddit.go | 150 ++++++++++++++++++++++++++++++ args/reddit_test.go | 178 +++++++++++++++++++++++++++++++++++ args/unmarshaller.go | 30 ++++++ args/unmarshaller_test.go | 105 +++++++++++++++++++++ pkg/util/set.go | 6 +- types/jobs.go | 16 +++- types/reddit.go | 190 ++++++++++++++++++++++++++++++++++++++ types/reddit_test.go | 93 +++++++++++++++++++ types/types_suite_test.go | 13 +++ 11 files changed, 794 insertions(+), 3 deletions(-) create mode 100644 args/args_suite_test.go create mode 100644 args/reddit.go create mode 100644 args/reddit_test.go create mode 100644 args/unmarshaller_test.go create mode 100644 types/reddit.go create mode 100644 types/reddit_test.go create mode 100644 types/types_suite_test.go diff --git a/.gitignore b/.gitignore index 9954081..f5f7bd6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,7 @@ go.work *~ *.log .DS_Store + +# LLM-related files .aider* +GEMINI.md diff --git a/args/args_suite_test.go b/args/args_suite_test.go new file mode 100644 index 0000000..861e0bf --- /dev/null +++ b/args/args_suite_test.go @@ -0,0 +1,13 @@ +package args_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestArgs(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Args Suite") +} diff --git a/args/reddit.go b/args/reddit.go new file mode 100644 index 0000000..fad60f6 --- /dev/null +++ b/args/reddit.go @@ -0,0 +1,150 @@ +package args + +import ( + "encoding/json" + "errors" + "fmt" + "net/url" + "strings" + "time" + + "github.com/masa-finance/tee-types/pkg/util" + teetypes "github.com/masa-finance/tee-types/types" +) + +var ( + ErrRedditInvalidType = errors.New("invalid type") + ErrRedditInvalidSort = errors.New("invalid sort") + ErrRedditTimeInTheFuture = errors.New("after field is in the future") + ErrRedditNoQueries = errors.New("queries must be provided for all query types except scrapeurls") + ErrRedditNoUrls = errors.New("urls must be provided for scrapeurls query type") + ErrRedditQueriesNotAllowed = errors.New("the scrapeurls query type does not admit queries") + ErrRedditUrlsNotAllowed = errors.New("urls can only be provided for the scrapeurls query type") +) + +const ( + // These reflect the default values in https://apify.com/trudax/reddit-scraper/input-schema + redditDefaultMaxItems = 10 + redditDefaultMaxPosts = 10 + redditDefaultMaxComments = 10 + redditDefaultMaxCommunities = 2 + redditDefaultMaxUsers = 2 + redditDefaultSort = teetypes.RedditSortNew +) + +// RedditArguments defines args for Reddit scrapes +// see https://apify.com/trudax/reddit-scraper +type RedditArguments struct { + QueryType teetypes.RedditQueryType `json:"type"` + Queries []string `json:"queries"` + URLs []teetypes.RedditStartURL `json:"urls"` + Sort teetypes.RedditSortType `json:"sort"` + IncludeNSFW bool `json:"include_nsfw"` + SkipPosts bool `json:"skip_posts"` // Valid only for searchusers + After time.Time `json:"after"` // valid only for scrapeurls and searchposts + MaxItems uint `json:"max_items"` // Max number of items to scrape (total), default 10 + MaxResults uint `json:"max_results"` // Max number of results per page, default MaxItems + MaxPosts uint `json:"max_posts"` // Max number of posts per page, default 10 + MaxComments uint `json:"max_comments"` // Max number of comments per page, default 10 + MaxCommunities uint `json:"max_communities"` // Max number of communities per page, default 2 + MaxUsers uint `json:"max_users"` // Max number of users per page, default 2 + NextCursor string `json:"next_cursor"` +} + +func (r *RedditArguments) UnmarshalJSON(data []byte) error { + type Alias RedditArguments + + // Set default values. They will be overridden if present in the JSON. 
+	r.MaxItems = redditDefaultMaxItems
+	r.MaxPosts = redditDefaultMaxPosts
+	r.MaxComments = redditDefaultMaxComments
+	r.MaxCommunities = redditDefaultMaxCommunities
+	r.MaxUsers = redditDefaultMaxUsers
+	r.Sort = redditDefaultSort
+
+	aux := &struct {
+		*Alias
+	}{
+		Alias: (*Alias)(r),
+	}
+
+	if err := json.Unmarshal(data, aux); err != nil {
+		return fmt.Errorf("failed to unmarshal Reddit arguments: %w", err)
+	}
+
+	if r.MaxResults == 0 {
+		r.MaxResults = r.MaxItems
+	}
+
+	return r.Validate()
+}
+
+var allowedHttpMethods = util.NewSet("GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS")
+
+const redditDomainSuffix = "reddit.com"
+
+func (r *RedditArguments) Validate() error {
+	var errs []error
+	if !teetypes.AllRedditQueryTypes.Contains(r.QueryType) {
+		errs = append(errs, ErrRedditInvalidType)
+	}
+
+	if !teetypes.AllRedditSortTypes.Contains(r.Sort) {
+		errs = append(errs, ErrRedditInvalidSort)
+	}
+
+	if time.Now().Before(r.After) {
+		errs = append(errs, ErrRedditTimeInTheFuture)
+	}
+
+	if r.QueryType == teetypes.RedditScrapeUrls {
+		if len(r.URLs) == 0 {
+			errs = append(errs, ErrRedditNoUrls)
+		}
+		if len(r.Queries) > 0 {
+			errs = append(errs, ErrRedditQueriesNotAllowed)
+		}
+
+		for _, q := range r.URLs {
+			if !allowedHttpMethods.Contains(q.Method) {
+				errs = append(errs, fmt.Errorf("%s is not a valid HTTP method", q.Method))
+			}
+			u, err := url.Parse(q.URL)
+			if err != nil {
+				errs = append(errs, fmt.Errorf("%s is not a valid URL", q.URL))
+			} else {
+				if !strings.HasSuffix(u.Host, redditDomainSuffix) {
+					errs = append(errs, fmt.Errorf("Invalid Reddit URL %s", q.URL))
+				}
+			}
+		}
+	} else {
+		if len(r.Queries) == 0 {
+			errs = append(errs, ErrRedditNoQueries)
+		}
+		if len(r.URLs) > 0 {
+			errs = append(errs, ErrRedditUrlsNotAllowed)
+		}
+	}
+
+	if len(errs) > 0 {
+		return errors.Join(errs...)
+	}
+
+	return nil
+}
+
+// ValidateForJobType validates Reddit arguments for a specific job type
+func (r *RedditArguments) ValidateForJobType(jobType teetypes.JobType) error {
+	if err := r.Validate(); err != nil {
+		return err
+	}
+
+	// Validate QueryType against job-specific capabilities
+	return jobType.ValidateCapability(teetypes.Capability(r.QueryType))
+}
+
+// GetCapability returns the QueryType as a typed Capability
+func (r *RedditArguments) GetCapability() teetypes.Capability {
+	return teetypes.Capability(r.QueryType)
+}
diff --git a/args/reddit_test.go b/args/reddit_test.go
new file mode 100644
index 0000000..f158de3
--- /dev/null
+++ b/args/reddit_test.go
@@ -0,0 +1,178 @@
+package args_test
+
+import (
+	"encoding/json"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. 
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/args" + "github.com/masa-finance/tee-types/types" +) + +var _ = Describe("RedditArguments", func() { + Describe("Unmarshalling", func() { + It("should set default values", func() { + redditArgs := &args.RedditArguments{} + jsonData := `{"type": "searchposts", "queries": ["test"]}` + err := json.Unmarshal([]byte(jsonData), redditArgs) + Expect(err).ToNot(HaveOccurred()) + Expect(redditArgs.MaxItems).To(Equal(uint(10))) + Expect(redditArgs.MaxPosts).To(Equal(uint(10))) + Expect(redditArgs.MaxComments).To(Equal(uint(10))) + Expect(redditArgs.MaxCommunities).To(Equal(uint(2))) + Expect(redditArgs.MaxUsers).To(Equal(uint(2))) + Expect(redditArgs.Sort).To(Equal(types.RedditSortNew)) + Expect(redditArgs.MaxResults).To(Equal(redditArgs.MaxItems)) + }) + + It("should override default values", func() { + redditArgs := &args.RedditArguments{} + jsonData := `{"type": "searchposts", "queries": ["test"], "max_items": 20, "sort": "top"}` + err := json.Unmarshal([]byte(jsonData), redditArgs) + Expect(err).ToNot(HaveOccurred()) + Expect(redditArgs.MaxItems).To(Equal(uint(20))) + Expect(redditArgs.Sort).To(Equal(types.RedditSortTop)) + Expect(redditArgs.MaxResults).To(Equal(uint(20))) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid arguments", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditSearchPosts, + Queries: []string{"test"}, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should succeed with valid scrapeurls arguments", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditScrapeUrls, + URLs: []types.RedditStartURL{ + {URL: "https://www.reddit.com/r/golang/", Method: "GET"}, + }, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail with an invalid type", func() { + redditArgs := &args.RedditArguments{ + QueryType: "invalidtype", + Queries: []string{"test"}, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).To(MatchError(args.ErrRedditInvalidType)) + }) + + It("should fail with an invalid sort", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditSearchPosts, + Queries: []string{"test"}, + Sort: "invalidsort", + } + err := redditArgs.Validate() + Expect(err).To(MatchError(args.ErrRedditInvalidSort)) + }) + + It("should fail if the after time is in the future", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditSearchPosts, + Queries: []string{"test"}, + Sort: types.RedditSortNew, + After: time.Now().Add(24 * time.Hour), + } + err := redditArgs.Validate() + Expect(err).To(MatchError(args.ErrRedditTimeInTheFuture)) + }) + + It("should fail if queries are not provided for searchposts", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditSearchPosts, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).To(MatchError(args.ErrRedditNoQueries)) + }) + + It("should fail if urls are not provided for scrapeurls", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditScrapeUrls, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).To(MatchError(args.ErrRedditNoUrls)) + }) + + It("should fail if queries are provided for scrapeurls", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditScrapeUrls, + Queries: []string{"test"}, + URLs: []types.RedditStartURL{ + {URL: 
"https://www.reddit.com/r/golang/", Method: "GET"}, + }, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).To(MatchError(args.ErrRedditQueriesNotAllowed)) + }) + + It("should fail if urls are provided for searchposts", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditSearchPosts, + Queries: []string{"test"}, + URLs: []types.RedditStartURL{ + {URL: "https://www.reddit.com/r/golang/", Method: "GET"}, + }, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).To(MatchError(args.ErrRedditUrlsNotAllowed)) + }) + + It("should fail with an invalid URL", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditScrapeUrls, + URLs: []types.RedditStartURL{ + {URL: "ht tp://invalid-url.com", Method: "GET"}, + }, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("is not a valid URL")) + }) + + It("should fail with an invalid domain", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditScrapeUrls, + URLs: []types.RedditStartURL{ + {URL: "https://www.google.com", Method: "GET"}, + }, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("Invalid Reddit URL")) + }) + + It("should fail with an invalid HTTP method", func() { + redditArgs := &args.RedditArguments{ + QueryType: types.RedditScrapeUrls, + URLs: []types.RedditStartURL{ + {URL: "https://www.reddit.com/r/golang/", Method: "INVALID"}, + }, + Sort: types.RedditSortNew, + } + err := redditArgs.Validate() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("is not a valid HTTP method")) + }) + }) +}) diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 353bf8f..15c8f4d 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -49,6 +49,12 @@ type LinkedInJobArguments interface { ValidateForJobType(jobType types.JobType) error } +// RedditJobArguments extends JobArguments for Reddit-specific methods +type RedditJobArguments interface { + JobArguments + ValidateForJobType(jobType types.JobType) error +} + // UnmarshalJobArguments unmarshals job arguments from a generic map into the appropriate typed struct // This works with both tee-indexer and tee-worker JobArguments types func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArguments, error) { @@ -65,6 +71,9 @@ func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgum case types.LinkedInJob: return unmarshalLinkedInArguments(jobType, args) + case types.RedditJob: + return unmarshalRedditArguments(jobType, args) + case types.TelemetryJob: return &TelemetryJobArguments{}, nil @@ -132,6 +141,27 @@ func unmarshalLinkedInArguments(jobType types.JobType, args map[string]any) (*Li return linkedInArgs, nil } +func unmarshalRedditArguments(jobType types.JobType, args map[string]any) (*RedditArguments, error) { + redditArgs := &RedditArguments{} + if err := unmarshalToStruct(args, redditArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal Reddit job arguments: %w", err) + } + + // If no QueryType is specified, use the default capability for this job type + if redditArgs.QueryType == "" { + if defaultCap, exists := types.JobDefaultCapabilityMap[jobType]; exists { + redditArgs.QueryType = types.RedditQueryType(defaultCap) + } + } + + // Perform job-type-specific validation for Reddit + if err := redditArgs.ValidateForJobType(jobType); 
err != nil { + return nil, fmt.Errorf("reddit job validation failed: %w", err) + } + + return redditArgs, nil +} + // unmarshalToStruct converts a map[string]any to a struct using JSON marshal/unmarshal // This provides the same functionality as the existing JobArguments.Unmarshal methods func unmarshalToStruct(args map[string]any, target any) error { diff --git a/args/unmarshaller_test.go b/args/unmarshaller_test.go new file mode 100644 index 0000000..04e784f --- /dev/null +++ b/args/unmarshaller_test.go @@ -0,0 +1,105 @@ +package args_test + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/args" + "github.com/masa-finance/tee-types/types" +) + +var _ = Describe("Unmarshaller", func() { + Describe("UnmarshalJobArguments", func() { + Context("with a WebJob", func() { + It("should unmarshal the arguments correctly", func() { + argsMap := map[string]any{ + "url": "https://example.com", + "selector": "h1", + "max_depth": 2, + } + jobArgs, err := args.UnmarshalJobArguments(types.WebJob, argsMap) + Expect(err).ToNot(HaveOccurred()) + webArgs, ok := jobArgs.(*args.WebSearchArguments) + Expect(ok).To(BeTrue()) + Expect(webArgs.URL).To(Equal("https://example.com")) + Expect(webArgs.Selector).To(Equal("h1")) + Expect(webArgs.MaxDepth).To(Equal(2)) + }) + }) + + Context("with a TiktokJob", func() { + It("should unmarshal the arguments correctly", func() { + argsMap := map[string]any{ + "video_url": "https://www.tiktok.com/@user/video/123", + "language": "en-us", + } + jobArgs, err := args.UnmarshalJobArguments(types.TiktokJob, argsMap) + Expect(err).ToNot(HaveOccurred()) + tiktokArgs, ok := jobArgs.(*args.TikTokTranscriptionArguments) + Expect(ok).To(BeTrue()) + Expect(tiktokArgs.VideoURL).To(Equal("https://www.tiktok.com/@user/video/123")) + Expect(tiktokArgs.Language).To(Equal("en-us")) + }) + }) + + Context("with a TwitterJob", func() { + It("should unmarshal the arguments correctly", func() { + argsMap := map[string]any{ + "type": "searchbyquery", + "query": "golang", + "count": 10, + } + jobArgs, err := args.UnmarshalJobArguments(types.TwitterJob, argsMap) + Expect(err).ToNot(HaveOccurred()) + twitterArgs, ok := jobArgs.(*args.TwitterSearchArguments) + Expect(ok).To(BeTrue()) + Expect(twitterArgs.QueryType).To(Equal("searchbyquery")) + Expect(twitterArgs.Query).To(Equal("golang")) + Expect(twitterArgs.Count).To(Equal(10)) + }) + + It("should set the default capability for TwitterApifyJob", func() { + argsMap := map[string]any{"query": "masa-finance"} + jobArgs, err := args.UnmarshalJobArguments(types.TwitterApifyJob, argsMap) + Expect(err).ToNot(HaveOccurred()) + twitterArgs, ok := jobArgs.(*args.TwitterSearchArguments) + Expect(ok).To(BeTrue()) + Expect(twitterArgs.GetCapability()).To(Equal(types.CapGetFollowers)) + }) + }) + + Context("with a RedditJob", func() { + It("should unmarshal the arguments correctly", func() { + argsMap := map[string]any{ + "type": "searchposts", + "queries": []string{"golang"}, + "sort": "new", + } + jobArgs, err := args.UnmarshalJobArguments(types.RedditJob, argsMap) + Expect(err).ToNot(HaveOccurred()) + redditArgs, ok := jobArgs.(*args.RedditArguments) + Expect(ok).To(BeTrue()) + Expect(redditArgs.QueryType).To(Equal(types.RedditQueryType("searchposts"))) + }) + }) + + Context("with a TelemetryJob", func() { + It("should return a TelemetryJobArguments struct", func() { + argsMap := map[string]any{} + jobArgs, err := args.UnmarshalJobArguments(types.TelemetryJob, argsMap) + 
Expect(err).ToNot(HaveOccurred()) + _, ok := jobArgs.(*args.TelemetryJobArguments) + Expect(ok).To(BeTrue()) + }) + }) + + Context("with an unknown job type", func() { + It("should return an error", func() { + argsMap := map[string]any{} + _, err := args.UnmarshalJobArguments("unknown", argsMap) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("unknown job type")) + }) + }) + }) +}) diff --git a/pkg/util/set.go b/pkg/util/set.go index fe237be..33b907d 100644 --- a/pkg/util/set.go +++ b/pkg/util/set.go @@ -23,17 +23,19 @@ func (s *Set[T]) Contains(item T) bool { } // Add inserts the given items into the set, deduplicating them. -func (s *Set[T]) Add(items ...T) { +func (s *Set[T]) Add(items ...T) *Set[T] { for _, item := range items { (*s)[item] = struct{}{} } + return s } // Delete removes the given items from the set if it contains them. -func (s *Set[T]) Delete(items ...T) { +func (s *Set[T]) Delete(items ...T) *Set[T] { for _, item := range items { delete((*s), item) } + return s } // Length returns the number of items in the set. diff --git a/types/jobs.go b/types/jobs.go index 945eb63..9c1cce5 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -50,6 +50,7 @@ const ( TwitterApiJob JobType = "twitter-api" // Twitter scraping with API keys TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify LinkedInJob JobType = "linkedin" // LinkedIn scraping, keeping for unmarshalling logic + RedditJob JobType = "reddit" // Reddit scraping with Apify ) // Capability constants - typed to prevent typos and enable discoverability @@ -73,7 +74,13 @@ const ( CapGetFollowers Capability = "getfollowers" CapGetSpace Capability = "getspace" CapGetProfile Capability = "getprofile" // LinkedIn get profile capability - CapEmpty Capability = "" + // Reddit capabilities + CapScrapeUrls Capability = "scrapeurls" + CapSearchPosts Capability = "searchposts" + CapSearchUsers Capability = "searchusers" + CapSearchCommunities Capability = "searchcommunities" + + CapEmpty Capability = "" ) // Capability group constants for easy reuse @@ -104,6 +111,9 @@ var ( // TwitterApifyCaps are Twitter capabilities available with Apify TwitterApifyCaps = []Capability{CapGetFollowers, CapGetFollowing, CapEmpty} + + // RedditCaps are all the Reddit capabilities (only available with Apify) + RedditCaps = []Capability{CapScrapeUrls, CapSearchPosts, CapSearchUsers, CapSearchCommunities} ) // JobCapabilityMap defines which capabilities are valid for each job type @@ -128,6 +138,9 @@ var JobCapabilityMap = map[JobType][]Capability{ // TikTok job capabilities TiktokJob: AlwaysAvailableTiktokCaps, + // Reddit job capabilities + RedditJob: RedditCaps, + // Telemetry job capabilities TelemetryJob: AlwaysAvailableTelemetryCaps, } @@ -140,5 +153,6 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TwitterApifyJob: CapGetFollowers, WebJob: CapScraper, TiktokJob: CapTranscription, + RedditJob: CapScrapeUrls, TelemetryJob: CapTelemetry, } diff --git a/types/reddit.go b/types/reddit.go new file mode 100644 index 0000000..7b2ed5e --- /dev/null +++ b/types/reddit.go @@ -0,0 +1,190 @@ +package types + +import ( + "encoding/json" + "fmt" + "time" + + "github.com/masa-finance/tee-types/pkg/util" +) + +type RedditQueryType string + +const ( + RedditScrapeUrls RedditQueryType = "scrapeurls" + RedditSearchPosts RedditQueryType = "searchposts" + RedditSearchUsers RedditQueryType = "searchusers" + RedditSearchCommunities RedditQueryType = "searchcommunities" +) + +var AllRedditQueryTypes = 
util.NewSet(RedditScrapeUrls, RedditSearchPosts, RedditSearchUsers, RedditSearchCommunities) + +type RedditSortType string + +const ( + RedditSortRelevance RedditSortType = "relevance" + RedditSortHot RedditSortType = "hot" + RedditSortTop RedditSortType = "top" + RedditSortNew RedditSortType = "new" + RedditSortRising RedditSortType = "rising" + RedditSortComments RedditSortType = "comments" +) + +var AllRedditSortTypes = util.NewSet( + RedditSortRelevance, + RedditSortHot, + RedditSortTop, + RedditSortNew, + RedditSortRising, + RedditSortComments, +) + +// RedditStartURL represents a single start URL for the Apify Reddit scraper. +type RedditStartURL struct { + URL string `json:"url"` + Method string `json:"method"` +} + +type RedditResponseType string + +const ( + RedditUserResponse RedditResponseType = "user" + RedditPostResponse RedditResponseType = "post" + RedditCommentResponse RedditResponseType = "comment" + RedditCommunityResponse RedditResponseType = "community" +) + +// RedditUser represents the data structure for a Reddit user from the Apify scraper. +type RedditUser struct { + ID string `json:"id"` + URL string `json:"url"` + Username string `json:"username"` + UserIcon string `json:"userIcon"` + PostKarma int `json:"postKarma"` + CommentKarma int `json:"commentKarma"` + Description string `json:"description"` + Over18 bool `json:"over18"` + CreatedAt time.Time `json:"createdAt"` + ScrapedAt time.Time `json:"scrapedAt"` + DataType string `json:"dataType"` +} + +// RedditPost represents the data structure for a Reddit post from the Apify scraper. +type RedditPost struct { + ID string `json:"id"` + ParsedID string `json:"parsedId"` + URL string `json:"url"` + Username string `json:"username"` + Title string `json:"title"` + CommunityName string `json:"communityName"` + ParsedCommunityName string `json:"parsedCommunityName"` + Body string `json:"body"` + HTML *string `json:"html"` + NumberOfComments int `json:"numberOfComments"` + UpVotes int `json:"upVotes"` + IsVideo bool `json:"isVideo"` + IsAd bool `json:"isAd"` + Over18 bool `json:"over18"` + CreatedAt time.Time `json:"createdAt"` + ScrapedAt time.Time `json:"scrapedAt"` + DataType string `json:"dataType"` +} + +// RedditComment represents the data structure for a Reddit comment from the Apify scraper. +type RedditComment struct { + ID string `json:"id"` + ParsedID string `json:"parsedId"` + URL string `json:"url"` + ParentID string `json:"parentId"` + Username string `json:"username"` + Category string `json:"category"` + CommunityName string `json:"communityName"` + Body string `json:"body"` + CreatedAt time.Time `json:"createdAt"` + ScrapedAt time.Time `json:"scrapedAt"` + UpVotes int `json:"upVotes"` + NumberOfReplies int `json:"numberOfreplies"` + HTML string `json:"html"` + DataType string `json:"dataType"` +} + +// RedditCommunity represents the data structure for a Reddit community from the Apify scraper. 
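+// CreatedAt and ScrapedAt are decoded from the scraper's RFC3339 timestamps by encoding/json.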
+type RedditCommunity struct { + ID string `json:"id"` + Name string `json:"name"` + Title string `json:"title"` + HeaderImage string `json:"headerImage"` + Description string `json:"description"` + Over18 bool `json:"over18"` + CreatedAt time.Time `json:"createdAt"` + ScrapedAt time.Time `json:"scrapedAt"` + NumberOfMembers int `json:"numberOfMembers"` + URL string `json:"url"` + DataType string `json:"dataType"` +} + +type RedditTypeSwitch struct { + Type RedditResponseType `json:"type"` +} + +type RedditResponse struct { + TypeSwitch *RedditTypeSwitch + User *RedditUser + Post *RedditPost + Comment *RedditComment + Community *RedditCommunity +} + +func (t *RedditResponse) UnmarshalJSON(data []byte) error { + t.TypeSwitch = &RedditTypeSwitch{} + if err := json.Unmarshal(data, &t.TypeSwitch); err != nil { + return fmt.Errorf("failed to unmarshal reddit response type: %w", err) + } + + switch t.TypeSwitch.Type { + case RedditUserResponse: + t.User = &RedditUser{} + if err := json.Unmarshal(data, t.User); err != nil { + return fmt.Errorf("failed to unmarshal reddit user: %w", err) + } + case RedditPostResponse: + t.Post = &RedditPost{} + if err := json.Unmarshal(data, t.Post); err != nil { + return fmt.Errorf("failed to unmarshal reddit post: %w", err) + } + case RedditCommentResponse: + t.Comment = &RedditComment{} + if err := json.Unmarshal(data, t.Comment); err != nil { + return fmt.Errorf("failed to unmarshal reddit comment: %w", err) + } + case RedditCommunityResponse: + t.Community = &RedditCommunity{} + if err := json.Unmarshal(data, t.Community); err != nil { + return fmt.Errorf("failed to unmarshal reddit community: %w", err) + } + default: + return fmt.Errorf("unknown Reddit response type: %s", t.TypeSwitch.Type) + } + return nil +} + +// MarshalJSON implements the json.Marshaler interface for RedditResponse. +// It unwraps the inner struct (User, Post, Comment, or Community) and marshals it directly. +func (t *RedditResponse) MarshalJSON() ([]byte, error) { + if t.TypeSwitch == nil { + return []byte("null"), nil + } + + switch t.TypeSwitch.Type { + case RedditUserResponse: + return json.Marshal(t.User) + case RedditPostResponse: + return json.Marshal(t.Post) + case RedditCommentResponse: + return json.Marshal(t.Comment) + case RedditCommunityResponse: + return json.Marshal(t.Community) + default: + return nil, fmt.Errorf("unknown Reddit response type: %s", t.TypeSwitch.Type) + } +} diff --git a/types/reddit_test.go b/types/reddit_test.go new file mode 100644 index 0000000..05a7173 --- /dev/null +++ b/types/reddit_test.go @@ -0,0 +1,93 @@ +package types_test + +import ( + "encoding/json" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/types" +) + +var _ = Describe("RedditResponse", func() { + Describe("Unmarshalling", func() { + It("should unmarshal a user response", func() { + jsonData := `{"type": "user", "id": "user123", "username": "testuser"}` + var resp types.RedditResponse + err := json.Unmarshal([]byte(jsonData), &resp) + Expect(err).ToNot(HaveOccurred()) + Expect(resp.User).ToNot(BeNil()) + Expect(resp.Post).To(BeNil()) + Expect(resp.User.ID).To(Equal("user123")) + Expect(resp.User.Username).To(Equal("testuser")) + }) + + It("should unmarshal a post response", func() { + jsonData := `{"type": "post", "id": "post123", "title": "Test Post"}` + var resp types.RedditResponse + err := json.Unmarshal([]byte(jsonData), &resp) + Expect(err).ToNot(HaveOccurred()) + Expect(resp.Post).ToNot(BeNil()) + Expect(resp.User).To(BeNil()) + Expect(resp.Post.ID).To(Equal("post123")) + Expect(resp.Post.Title).To(Equal("Test Post")) + }) + + It("should return an error for an unknown type", func() { + jsonData := `{"type": "unknown", "id": "123"}` + var resp types.RedditResponse + err := json.Unmarshal([]byte(jsonData), &resp) + Expect(err).To(MatchError("unknown Reddit response type: unknown")) + }) + }) + + Describe("Marshalling", func() { + It("should marshal a user response", func() { + now := time.Now() + resp := types.RedditResponse{ + TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditUserResponse}, + User: &types.RedditUser{ + ID: "user123", + Username: "testuser", + CreatedAt: now, + CommentKarma: 10, + }, + } + + expectedJSON, err := json.Marshal(resp.User) + Expect(err).ToNot(HaveOccurred()) + + actualJSON, err := json.Marshal(&resp) + Expect(err).ToNot(HaveOccurred()) + + Expect(actualJSON).To(MatchJSON(expectedJSON)) + }) + + It("should marshal a post response", func() { + resp := types.RedditResponse{ + TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditPostResponse}, + Post: &types.RedditPost{ + ID: "post123", + Title: "Test Post", + }, + } + + expectedJSON, err := json.Marshal(resp.Post) + Expect(err).ToNot(HaveOccurred()) + + actualJSON, err := json.Marshal(&resp) + Expect(err).ToNot(HaveOccurred()) + + Expect(actualJSON).To(MatchJSON(expectedJSON)) + }) + + It("should return an error for an unknown type", func() { + resp := types.RedditResponse{ + TypeSwitch: &types.RedditTypeSwitch{Type: "unknown"}, + } + _, err := json.Marshal(&resp) + Expect(err).To(HaveOccurred()) + }) + }) +}) diff --git a/types/types_suite_test.go b/types/types_suite_test.go new file mode 100644 index 0000000..3356638 --- /dev/null +++ b/types/types_suite_test.go @@ -0,0 +1,13 @@ +package types_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestTypes(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Types Suite") +} From 3a5a22710aa33bb769235c8c5b71c1ea7ab85aad Mon Sep 17 00:00:00 2001 From: mcamou Date: Thu, 21 Aug 2025 13:30:38 +0200 Subject: [PATCH 073/136] lint --- args/reddit.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/reddit.go b/args/reddit.go index fad60f6..0d60bb6 100644 --- a/args/reddit.go +++ b/args/reddit.go @@ -114,7 +114,7 @@ func (r *RedditArguments) Validate() error { errs = append(errs, fmt.Errorf("%s is not a valid URL", q.URL)) } else { if !strings.HasSuffix(u.Host, redditDomainSuffix) { - errs = append(errs, fmt.Errorf("Invalid Reddit URL %s", q.URL)) + errs = append(errs, fmt.Errorf("invalid Reddit URL %s", q.URL)) } } } From 49f78bf111b2e2d672527a0e96465df04978a297 Mon Sep 17 00:00:00 2001 From: mcamou Date: Thu, 21 Aug 2025 13:33:54 +0200 Subject: [PATCH 074/136] test fix --- args/reddit_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/reddit_test.go b/args/reddit_test.go index f158de3..525b032 100644 --- a/args/reddit_test.go +++ b/args/reddit_test.go @@ -159,7 +159,7 @@ var _ = Describe("RedditArguments", func() { } err := redditArgs.Validate() Expect(err).To(HaveOccurred()) - Expect(err.Error()).To(ContainSubstring("Invalid Reddit URL")) + Expect(err.Error()).To(ContainSubstring("invalid Reddit URL")) }) It("should fail with an invalid HTTP method", func() { From edd7c8e0937d6a99b6ecfda66c938e7feb29f4df Mon Sep 17 00:00:00 2001 From: mcamou Date: Thu, 21 Aug 2025 13:59:56 +0200 Subject: [PATCH 075/136] Fix Reddit args validation --- args/reddit.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/args/reddit.go b/args/reddit.go index 0d60bb6..f7842e3 100644 --- a/args/reddit.go +++ b/args/reddit.go @@ -97,6 +97,10 @@ func (r *RedditArguments) Validate() error { errs = append(errs, ErrRedditTimeInTheFuture) } + if len(errs) > 0 { + return errors.Join(errs...) 
+ } + if r.QueryType == teetypes.RedditScrapeUrls { if len(r.URLs) == 0 { errs = append(errs, ErrRedditNoUrls) From 570c8fb39313b1ee326df9d553b4df8546bd62a8 Mon Sep 17 00:00:00 2001 From: mcamou Date: Thu, 21 Aug 2025 18:52:53 +0200 Subject: [PATCH 076/136] Add default URL method --- args/reddit.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/args/reddit.go b/args/reddit.go index f7842e3..6dafa8c 100644 --- a/args/reddit.go +++ b/args/reddit.go @@ -110,6 +110,10 @@ func (r *RedditArguments) Validate() error { } for _, q := range r.URLs { + q.Method = strings.ToUpper(q.Method) + if q.Method == "" { + q.Method = "GET" + } if !allowedHttpMethods.Contains(q.Method) { errs = append(errs, fmt.Errorf("%s is not a valid HTTP method", q.Method)) } From 06dd40db19ea7a7bffa17236524d8601a63c276d Mon Sep 17 00:00:00 2001 From: mcamou Date: Thu, 21 Aug 2025 18:55:09 +0200 Subject: [PATCH 077/136] Lowercase query type and sort --- args/reddit.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/args/reddit.go b/args/reddit.go index 6dafa8c..1c7fe53 100644 --- a/args/reddit.go +++ b/args/reddit.go @@ -85,10 +85,12 @@ const redditDomainSuffix = "reddit.com" func (r *RedditArguments) Validate() error { var errs []error + r.QueryType = teetypes.RedditQueryType(strings.ToLower(string(r.QueryType))) if !teetypes.AllRedditQueryTypes.Contains(r.QueryType) { errs = append(errs, ErrRedditInvalidType) } + r.Sort = teetypes.RedditSortType(strings.ToLower(string(r.Sort))) if !teetypes.AllRedditSortTypes.Contains(r.Sort) { errs = append(errs, ErrRedditInvalidSort) } From 3cac2f13bc757d2af855596a89aebc1a9ad1af2b Mon Sep 17 00:00:00 2001 From: mcamou Date: Thu, 21 Aug 2025 19:27:34 +0200 Subject: [PATCH 078/136] Some renaming of the Reddit types, and adding RedditResult --- types/reddit.go | 42 ++++++++++++++++++++++++------------------ types/reddit_test.go | 16 ++++++++-------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/types/reddit.go b/types/reddit.go index 7b2ed5e..ea4a9d3 100644 --- a/types/reddit.go +++ b/types/reddit.go @@ -39,19 +39,25 @@ var AllRedditSortTypes = util.NewSet( RedditSortComments, ) +// RedditResult represents the response sent back from a Reddit query +type RedditResult struct { + Items []*RedditItem `json:"items"` + NextCursor string `json:"next_cursor"` +} + // RedditStartURL represents a single start URL for the Apify Reddit scraper. type RedditStartURL struct { URL string `json:"url"` Method string `json:"method"` } -type RedditResponseType string +type RedditItemType string const ( - RedditUserResponse RedditResponseType = "user" - RedditPostResponse RedditResponseType = "post" - RedditCommentResponse RedditResponseType = "comment" - RedditCommunityResponse RedditResponseType = "community" + RedditUserItem RedditItemType = "user" + RedditPostItem RedditItemType = "post" + RedditCommentItem RedditItemType = "comment" + RedditCommunityItem RedditItemType = "community" ) // RedditUser represents the data structure for a Reddit user from the Apify scraper. 
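After the rename, a scraped record decodes into a `RedditItem` whose populated pointer matches the `type` discriminator, and marshalling unwraps the inner struct again. A round-trip sketch (module path as in the README; sample values invented):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/masa-finance/tee-types/types"
)

func main() {
	raw := `{"type":"post","id":"post123","title":"Test Post"}`

	var item types.RedditItem
	if err := json.Unmarshal([]byte(raw), &item); err != nil {
		panic(err)
	}
	fmt.Println(item.TypeSwitch.Type, item.Post.Title) // post Test Post

	// MarshalJSON (below) unwraps the inner struct, so this prints the flat
	// post object rather than the RedditItem wrapper.
	out, err := json.Marshal(&item)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
```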
@@ -124,10 +130,10 @@ type RedditCommunity struct {
 }
 
 type RedditTypeSwitch struct {
-	Type RedditResponseType `json:"type"`
+	Type RedditItemType `json:"type"`
 }
 
-type RedditResponse struct {
+type RedditItem struct {
 	TypeSwitch *RedditTypeSwitch
 	User       *RedditUser
 	Post       *RedditPost
@@ -135,29 +141,29 @@ type RedditResponse struct {
 	Community  *RedditCommunity
 }
 
-func (t *RedditResponse) UnmarshalJSON(data []byte) error {
+func (t *RedditItem) UnmarshalJSON(data []byte) error {
 	t.TypeSwitch = &RedditTypeSwitch{}
 	if err := json.Unmarshal(data, &t.TypeSwitch); err != nil {
 		return fmt.Errorf("failed to unmarshal reddit response type: %w", err)
 	}
 
 	switch t.TypeSwitch.Type {
-	case RedditUserResponse:
+	case RedditUserItem:
 		t.User = &RedditUser{}
 		if err := json.Unmarshal(data, t.User); err != nil {
 			return fmt.Errorf("failed to unmarshal reddit user: %w", err)
 		}
-	case RedditPostResponse:
+	case RedditPostItem:
 		t.Post = &RedditPost{}
 		if err := json.Unmarshal(data, t.Post); err != nil {
 			return fmt.Errorf("failed to unmarshal reddit post: %w", err)
 		}
-	case RedditCommentResponse:
+	case RedditCommentItem:
 		t.Comment = &RedditComment{}
 		if err := json.Unmarshal(data, t.Comment); err != nil {
 			return fmt.Errorf("failed to unmarshal reddit comment: %w", err)
 		}
-	case RedditCommunityResponse:
+	case RedditCommunityItem:
 		t.Community = &RedditCommunity{}
 		if err := json.Unmarshal(data, t.Community); err != nil {
 			return fmt.Errorf("failed to unmarshal reddit community: %w", err)
@@ -168,21 +174,21 @@ func (t *RedditResponse) UnmarshalJSON(data []byte) error {
 	return nil
 }
 
-// MarshalJSON implements the json.Marshaler interface for RedditResponse.
+// MarshalJSON implements the json.Marshaler interface for RedditItem.
 // It unwraps the inner struct (User, Post, Comment, or Community) and marshals it directly. 
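+// A nil TypeSwitch marshals as JSON null.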
-func (t *RedditResponse) MarshalJSON() ([]byte, error) { +func (t *RedditItem) MarshalJSON() ([]byte, error) { if t.TypeSwitch == nil { return []byte("null"), nil } switch t.TypeSwitch.Type { - case RedditUserResponse: + case RedditUserItem: return json.Marshal(t.User) - case RedditPostResponse: + case RedditPostItem: return json.Marshal(t.Post) - case RedditCommentResponse: + case RedditCommentItem: return json.Marshal(t.Comment) - case RedditCommunityResponse: + case RedditCommunityItem: return json.Marshal(t.Community) default: return nil, fmt.Errorf("unknown Reddit response type: %s", t.TypeSwitch.Type) diff --git a/types/reddit_test.go b/types/reddit_test.go index 05a7173..c6cf7a3 100644 --- a/types/reddit_test.go +++ b/types/reddit_test.go @@ -14,7 +14,7 @@ var _ = Describe("RedditResponse", func() { Describe("Unmarshalling", func() { It("should unmarshal a user response", func() { jsonData := `{"type": "user", "id": "user123", "username": "testuser"}` - var resp types.RedditResponse + var resp types.RedditItem err := json.Unmarshal([]byte(jsonData), &resp) Expect(err).ToNot(HaveOccurred()) Expect(resp.User).ToNot(BeNil()) @@ -25,7 +25,7 @@ var _ = Describe("RedditResponse", func() { It("should unmarshal a post response", func() { jsonData := `{"type": "post", "id": "post123", "title": "Test Post"}` - var resp types.RedditResponse + var resp types.RedditItem err := json.Unmarshal([]byte(jsonData), &resp) Expect(err).ToNot(HaveOccurred()) Expect(resp.Post).ToNot(BeNil()) @@ -36,7 +36,7 @@ var _ = Describe("RedditResponse", func() { It("should return an error for an unknown type", func() { jsonData := `{"type": "unknown", "id": "123"}` - var resp types.RedditResponse + var resp types.RedditItem err := json.Unmarshal([]byte(jsonData), &resp) Expect(err).To(MatchError("unknown Reddit response type: unknown")) }) @@ -45,8 +45,8 @@ var _ = Describe("RedditResponse", func() { Describe("Marshalling", func() { It("should marshal a user response", func() { now := time.Now() - resp := types.RedditResponse{ - TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditUserResponse}, + resp := types.RedditItem{ + TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditUserItem}, User: &types.RedditUser{ ID: "user123", Username: "testuser", @@ -65,8 +65,8 @@ var _ = Describe("RedditResponse", func() { }) It("should marshal a post response", func() { - resp := types.RedditResponse{ - TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditPostResponse}, + resp := types.RedditItem{ + TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditPostItem}, Post: &types.RedditPost{ ID: "post123", Title: "Test Post", @@ -83,7 +83,7 @@ var _ = Describe("RedditResponse", func() { }) It("should return an error for an unknown type", func() { - resp := types.RedditResponse{ + resp := types.RedditItem{ TypeSwitch: &types.RedditTypeSwitch{Type: "unknown"}, } _, err := json.Marshal(&resp) From 63523ac82ad020359e88dc384af2002cba4b04fb Mon Sep 17 00:00:00 2001 From: mcamou Date: Thu, 21 Aug 2025 19:30:53 +0200 Subject: [PATCH 079/136] Remove RedditResult (the cursor is in JobResult) --- types/reddit.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/types/reddit.go b/types/reddit.go index ea4a9d3..296557e 100644 --- a/types/reddit.go +++ b/types/reddit.go @@ -39,12 +39,6 @@ var AllRedditSortTypes = util.NewSet( RedditSortComments, ) -// RedditResult represents the response sent back from a Reddit query -type RedditResult struct { - Items []*RedditItem `json:"items"` - NextCursor string `json:"next_cursor"` -} - // 
RedditStartURL represents a single start URL for the Apify Reddit scraper.
 type RedditStartURL struct {
 	URL    string `json:"url"`
 	Method string `json:"method"`

From e480a88219291c81b9509c24a68ee2334d9612d1 Mon Sep 17 00:00:00 2001
From: mcamou
Date: Fri, 22 Aug 2025 11:39:51 +0200
Subject: [PATCH 080/136] PR comments

---
 args/linkedin.go |  1 +
 args/reddit.go   | 16 ++++++----------
 args/tiktok.go   |  1 +
 args/twitter.go  |  1 +
 args/web.go      |  1 +
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/args/linkedin.go b/args/linkedin.go
index b7df066..dc3ba93 100644
--- a/args/linkedin.go
+++ b/args/linkedin.go
@@ -21,6 +21,7 @@ type LinkedInArguments struct {
 
 // UnmarshalJSON implements custom JSON unmarshaling with validation
 func (l *LinkedInArguments) UnmarshalJSON(data []byte) error {
+	// Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...)
 	type Alias LinkedInArguments
 	aux := &struct {
 		*Alias
diff --git a/args/reddit.go b/args/reddit.go
index 1c7fe53..05bc694 100644
--- a/args/reddit.go
+++ b/args/reddit.go
@@ -52,8 +52,6 @@ type RedditArguments struct {
 }
 
 func (r *RedditArguments) UnmarshalJSON(data []byte) error {
-	type Alias RedditArguments
-
 	// Set default values. They will be overridden if present in the JSON.
 	r.MaxItems = redditDefaultMaxItems
@@ -62,6 +60,8 @@ func (r *RedditArguments) UnmarshalJSON(data []byte) error {
 	r.MaxUsers = redditDefaultMaxUsers
 	r.Sort = redditDefaultSort
 
+	// Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...)
+	type Alias RedditArguments
 	aux := &struct {
 		*Alias
 	}{
@@ -111,10 +111,10 @@ func (r *RedditArguments) Validate() error {
 			errs = append(errs, ErrRedditQueriesNotAllowed)
 		}
 
-		for _, q := range r.URLs {
-			q.Method = strings.ToUpper(q.Method)
+		for i, q := range r.URLs {
+			r.URLs[i].Method = strings.ToUpper(q.Method)
 			if q.Method == "" {
-				q.Method = "GET"
+				r.URLs[i].Method = "GET"
 			}
 			if !allowedHttpMethods.Contains(q.Method) {
 				errs = append(errs, fmt.Errorf("%s is not a valid HTTP method", q.Method))
@@ -137,11 +137,7 @@ func (r *RedditArguments) Validate() error {
 		}
 	}
 
-	if len(errs) > 0 {
-		return errors.Join(errs...)
-	}
-
-	return nil
+	return errors.Join(errs...)
 }
 
 // ValidateForJobType validates Reddit arguments for a specific job type
diff --git a/args/tiktok.go b/args/tiktok.go
index 6c487e6..5f3687d 100644
--- a/args/tiktok.go
+++ b/args/tiktok.go
@@ -18,6 +18,7 @@ type TikTokTranscriptionArguments struct {
 
 // UnmarshalJSON implements custom JSON unmarshaling with validation
 func (t *TikTokTranscriptionArguments) UnmarshalJSON(data []byte) error {
+	// Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...)
 	type Alias TikTokTranscriptionArguments
 	aux := &struct {
 		*Alias
diff --git a/args/twitter.go b/args/twitter.go
index a32f024..18c6773 100644
--- a/args/twitter.go
+++ b/args/twitter.go
@@ -21,6 +21,7 @@ type TwitterSearchArguments struct {
 
 // UnmarshalJSON implements custom JSON unmarshaling with validation
 func (t *TwitterSearchArguments) UnmarshalJSON(data []byte) error {
+	// Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...)
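+	// (The local Alias type keeps the fields but drops the methods, so the
+	// inner json.Unmarshal cannot re-enter this method.)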
 	type Alias TwitterSearchArguments
 	aux := &struct {
 		*Alias
diff --git a/args/web.go b/args/web.go
index f1f473f..33a466d 100644
--- a/args/web.go
+++ b/args/web.go
@@ -18,6 +18,7 @@ type WebSearchArguments struct {
 
 // UnmarshalJSON implements custom JSON unmarshaling with validation
 func (w *WebSearchArguments) UnmarshalJSON(data []byte) error {
+	// Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...)
 	type Alias WebSearchArguments
 	aux := &struct {
 		*Alias

From 52f564464188cec747f28f50a0953ddb8d446b9b Mon Sep 17 00:00:00 2001
From: mcamou
Date: Mon, 25 Aug 2025 18:45:45 +0200
Subject: [PATCH 081/136] Fix Reddit unmarshal: zero values are
 indistinguishable from null

---
 args/reddit.go      | 31 +++++++++++++++++++++++--------
 args/reddit_test.go | 35 ++++++++++++++++++++++++++++---------
 2 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/args/reddit.go b/args/reddit.go
index 05bc694..d7d26fe 100644
--- a/args/reddit.go
+++ b/args/reddit.go
@@ -52,14 +52,6 @@ type RedditArguments struct {
 }
 
 func (r *RedditArguments) UnmarshalJSON(data []byte) error {
-	// Set default values. They will be overridden if present in the JSON.
-	r.MaxItems = redditDefaultMaxItems
-	r.MaxPosts = redditDefaultMaxPosts
-	r.MaxComments = redditDefaultMaxComments
-	r.MaxCommunities = redditDefaultMaxCommunities
-	r.MaxUsers = redditDefaultMaxUsers
-	r.Sort = redditDefaultSort
-
 	// Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...)
 	type Alias RedditArguments
 	aux := &struct {
@@ -85,6 +77,29 @@ const redditDomainSuffix = "reddit.com"
 
 func (r *RedditArguments) Validate() error {
 	var errs []error
+
+	if r.MaxItems == 0 {
+		r.MaxItems = redditDefaultMaxItems
+	}
+	if r.MaxPosts == 0 {
+		r.MaxPosts = redditDefaultMaxPosts
+	}
+	if r.MaxComments == 0 {
+		r.MaxComments = redditDefaultMaxComments
+	}
+	if r.MaxCommunities == 0 {
+		r.MaxCommunities = redditDefaultMaxCommunities
+	}
+	if r.MaxUsers == 0 {
+		r.MaxUsers = redditDefaultMaxUsers
+	}
+	if r.MaxResults == 0 {
+		r.MaxResults = r.MaxItems
+	}
+	if r.Sort == "" {
+		r.Sort = redditDefaultSort
+	}
+
 	r.QueryType = teetypes.RedditQueryType(strings.ToLower(string(r.QueryType)))
 	if !teetypes.AllRedditQueryTypes.Contains(r.QueryType) {
 		errs = append(errs, ErrRedditInvalidType)
diff --git a/args/reddit_test.go b/args/reddit_test.go
index 525b032..4a6666a 100644
--- a/args/reddit_test.go
+++ b/args/reddit_test.go
@@ -12,11 +12,15 @@ import (
 )
 
 var _ = Describe("RedditArguments", func() {
-	Describe("Unmarshalling", func() {
+	Describe("Marshalling and unmarshalling", func() {
 		It("should set default values", func() {
-			redditArgs := &args.RedditArguments{}
-			jsonData := `{"type": "searchposts", "queries": ["test"]}`
-			err := json.Unmarshal([]byte(jsonData), redditArgs)
+			redditArgs := args.RedditArguments{
+				QueryType: types.RedditSearchPosts,
+				Queries:   []string{"Zaphod", "Ford"},
+			}
+			jsonData, err := json.Marshal(redditArgs)
+			Expect(err).ToNot(HaveOccurred())
+			err = json.Unmarshal([]byte(jsonData), &redditArgs)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(redditArgs.MaxItems).To(Equal(uint(10)))
 			Expect(redditArgs.MaxPosts).To(Equal(uint(10)))
@@ -28,14 +32,29 @@ var _ = Describe("RedditArguments", func() {
 		})
 
 		It("should override default values", func() {
-			redditArgs := &args.RedditArguments{}
-			jsonData := `{"type": "searchposts", "queries": ["test"], "max_items": 20, "sort": "top"}`
-			err := json.Unmarshal([]byte(jsonData), 
redditArgs) + redditArgs := args.RedditArguments{ + QueryType: types.RedditSearchPosts, + Queries: []string{"Zaphod", "Ford"}, + MaxItems: 20, + MaxPosts: 21, + MaxComments: 22, + MaxCommunities: 23, + MaxUsers: 24, + Sort: types.RedditSortTop, + } + jsonData, err := json.Marshal(redditArgs) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &redditArgs) Expect(err).ToNot(HaveOccurred()) Expect(redditArgs.MaxItems).To(Equal(uint(20))) - Expect(redditArgs.Sort).To(Equal(types.RedditSortTop)) + Expect(redditArgs.MaxPosts).To(Equal(uint(21))) + Expect(redditArgs.MaxComments).To(Equal(uint(22))) + Expect(redditArgs.MaxCommunities).To(Equal(uint(23))) + Expect(redditArgs.MaxUsers).To(Equal(uint(24))) Expect(redditArgs.MaxResults).To(Equal(uint(20))) + Expect(redditArgs.Sort).To(Equal(types.RedditSortTop)) }) + }) Describe("Validation", func() { From 4725cd96cfceaec4b3fa0c5f825a9036fd31020c Mon Sep 17 00:00:00 2001 From: mcamou Date: Tue, 26 Aug 2025 11:00:24 +0200 Subject: [PATCH 082/136] Move default values and canonicalization to a separate method --- args/reddit.go | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/args/reddit.go b/args/reddit.go index d7d26fe..41bc17e 100644 --- a/args/reddit.go +++ b/args/reddit.go @@ -32,6 +32,10 @@ const ( redditDefaultSort = teetypes.RedditSortNew ) +const redditDomainSuffix = "reddit.com" + +var allowedHttpMethods = util.NewSet("GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS") + // RedditArguments defines args for Reddit scrapes // see https://apify.com/trudax/reddit-scraper type RedditArguments struct { @@ -64,20 +68,13 @@ func (r *RedditArguments) UnmarshalJSON(data []byte) error { return fmt.Errorf("failed to unmarshal Reddit arguments: %w", err) } - if r.MaxResults == 0 { - r.MaxResults = r.MaxItems - } + r.setDefaultValues() return r.Validate() } -var allowedHttpMethods = util.NewSet("GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS") - -const redditDomainSuffix = "reddit.com" - -func (r *RedditArguments) Validate() error { - var errs []error - +// setDefaultValues sets the default values for the parameters that were not provided and canonicalizes the strings for later validation +func (r *RedditArguments) setDefaultValues() { if r.MaxItems == 0 { r.MaxItems = redditDefaultMaxItems } @@ -101,11 +98,25 @@ func (r *RedditArguments) Validate() error { } r.QueryType = teetypes.RedditQueryType(strings.ToLower(string(r.QueryType))) + r.Sort = teetypes.RedditSortType(strings.ToLower(string(r.Sort))) + + if r.QueryType == teetypes.RedditScrapeUrls { + for i, q := range r.URLs { + r.URLs[i].Method = strings.ToUpper(q.Method) + if q.Method == "" { + r.URLs[i].Method = "GET" + } + } + } +} + +func (r *RedditArguments) Validate() error { + var errs []error + if !teetypes.AllRedditQueryTypes.Contains(r.QueryType) { errs = append(errs, ErrRedditInvalidType) } - r.Sort = teetypes.RedditSortType(strings.ToLower(string(r.Sort))) if !teetypes.AllRedditSortTypes.Contains(r.Sort) { errs = append(errs, ErrRedditInvalidSort) } @@ -126,11 +137,7 @@ func (r *RedditArguments) Validate() error { errs = append(errs, ErrRedditQueriesNotAllowed) } - for i, q := range r.URLs { - r.URLs[i].Method = strings.ToUpper(q.Method) - if q.Method == "" { - r.URLs[i].Method = "GET" - } + for _, q := range r.URLs { if !allowedHttpMethods.Contains(q.Method) { errs = append(errs, fmt.Errorf("%s is not a valid HTTP method", q.Method)) } @@ -138,7 +145,7 @@ func (r *RedditArguments) Validate() error { if err 
!= nil { errs = append(errs, fmt.Errorf("%s is not a valid URL", q.URL)) } else { - if !strings.HasSuffix(u.Host, redditDomainSuffix) { + if !strings.HasSuffix(strings.ToLower(u.Host), redditDomainSuffix) { errs = append(errs, fmt.Errorf("invalid Reddit URL %s", q.URL)) } } From 0f8ead970fa4ee1d1b09ad991417473c056aa9f9 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 26 Aug 2025 19:01:22 +0200 Subject: [PATCH 083/136] fix: tiktok json labeling for search by query types --- types/tiktok.go | 156 ++++++++++++++++++++++++------------------------ 1 file changed, 78 insertions(+), 78 deletions(-) diff --git a/types/tiktok.go b/types/tiktok.go index 938bb2a..ed657b7 100644 --- a/types/tiktok.go +++ b/types/tiktok.go @@ -14,49 +14,49 @@ type TikTokSearchByQueryResult struct { URL string `json:"url"` ID string `json:"id"` Desc string `json:"desc"` - CreateTime string `json:"create_time"` - ScheduleTime int64 `json:"schedule_time"` + CreateTime string `json:"createTime"` + ScheduleTime int64 `json:"scheduleTime"` Video TikTokVideo `json:"video"` Author string `json:"author"` Music TikTokMusic `json:"music"` Challenges []any `json:"challenges"` // we don't have examples of this data yet... Stats TikTokStats `json:"stats"` - IsActivityItem bool `json:"is_activity_item"` - DuetInfo TikTokDuetInfo `json:"duet_info"` - WarnInfo []any `json:"warn_info"` // we don't have examples of this data yet... - OriginalItem bool `json:"original_item"` - OfficalItem bool `json:"offical_item"` - TextExtra []TikTokTextExtra `json:"text_extra"` + IsActivityItem bool `json:"isActivityItem"` + DuetInfo TikTokDuetInfo `json:"duetInfo"` + WarnInfo []any `json:"warnInfo"` // we don't have examples of this data yet... + OriginalItem bool `json:"originalItem"` + OfficialItem bool `json:"officalItem"` + TextExtra []TikTokTextExtra `json:"textExtra"` Secret bool `json:"secret"` - ForFriend bool `json:"for_friend"` + ForFriend bool `json:"forFriend"` Digged bool `json:"digged"` - ItemCommentStatus int `json:"item_comment_status"` - ShowNotPass bool `json:"show_not_pass"` + ItemCommentStatus int `json:"itemCommentStatus"` + ShowNotPass bool `json:"showNotPass"` VL1 bool `json:"vl1"` - TakeDown int `json:"take_down"` - ItemMute bool `json:"item_mute"` - EffectStickers []any `json:"effect_stickers"` // we don't have examples of this data yet... - AuthorStats TikTokAuthorStats `json:"author_stats"` - PrivateItem bool `json:"private_item"` - DuetEnabled bool `json:"duet_enabled"` - StitchEnabled bool `json:"stitch_enabled"` - StickersOnItem []any `json:"stickers_on_item"` // we don't have examples of this data yet... - IsAd bool `json:"is_ad"` - ShareEnabled bool `json:"share_enabled"` + TakeDown int `json:"takeDown"` + ItemMute bool `json:"itemMute"` + EffectStickers []any `json:"effectStickers"` // we don't have examples of this data yet... + AuthorStats TikTokAuthorStats `json:"authorStats"` + PrivateItem bool `json:"privateItem"` + DuetEnabled bool `json:"duetEnabled"` + StitchEnabled bool `json:"stitchEnabled"` + StickersOnItem []any `json:"stickersOnItem"` // we don't have examples of this data yet... + IsAd bool `json:"isAd"` + ShareEnabled bool `json:"shareEnabled"` Comments []any `json:"comments"` // we don't have examples of this data yet... 
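// Why the blanket snake_case-to-camelCase tag rename in this struct matters,
// as a minimal sketch (the payload value and struct names are illustrative):
// encoding/json matches an incoming key against the tag name, exactly or
// case-insensitively, so a snake_case tag never matches the actor's camelCase
// key and the field is silently left at its zero value.
//
//	type beforeFix struct {
//		CreateTime string `json:"create_time"`
//	}
//	type afterFix struct {
//		CreateTime string `json:"createTime"`
//	}
//
//	payload := []byte(`{"createTime":"1700000000"}`)
//	var b beforeFix
//	_ = json.Unmarshal(payload, &b) // b.CreateTime == "" (key mismatch)
//	var a afterFix
//	_ = json.Unmarshal(payload, &a) // a.CreateTime == "1700000000"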
- DuetDisplay int `json:"duet_display"` - StitchDisplay int `json:"stitch_display"` - IndexEnabled bool `json:"index_enabled"` - DiversificationLabels []string `json:"diversification_labels"` - AdAuthorization bool `json:"ad_authorization"` - AdLabelVersion int `json:"ad_label_version"` - LocationCreated string `json:"location_created"` + DuetDisplay int `json:"duetDisplay"` + StitchDisplay int `json:"stitchDisplay"` + IndexEnabled bool `json:"indexEnabled"` + DiversificationLabels []string `json:"diversificationLabels"` + AdAuthorization bool `json:"adAuthorization"` + AdLabelVersion int `json:"adLabelVersion"` + LocationCreated string `json:"locationCreated"` Nickname string `json:"nickname"` - AuthorID string `json:"author_id"` - AuthorSecID string `json:"author_sec_id"` - AvatarThumb string `json:"avatar_thumb"` - DownloadSetting int `json:"download_setting"` - AuthorPrivate bool `json:"author_private"` + AuthorID string `json:"authorId"` + AuthorSecID string `json:"authorSecId"` + AvatarThumb string `json:"avatarThumb"` + DownloadSetting int `json:"downloadSetting"` + AuthorPrivate bool `json:"authorPrivate"` } type TikTokSearchByTrending struct { @@ -82,18 +82,18 @@ type TikTokVideo struct { PlayAddr string `json:"play_addr"` DownloadAddr string `json:"download_addr"` ShareCover []string `json:"share_cover"` - ReflowCover string `json:"reflow_cover"` + ReflowCover string `json:"reflowCover"` Bitrate int `json:"bitrate"` - EncodedType string `json:"encoded_type"` + EncodedType string `json:"encodedType"` Format string `json:"format"` - VideoQuality string `json:"video_quality"` - EncodeUserTag string `json:"encode_user_tag"` - CodecType string `json:"codec_type"` + VideoQuality string `json:"videoQuality"` + EncodeUserTag string `json:"encodeUserTag"` + CodecType string `json:"codecType"` Definition string `json:"definition"` - SubtitleInfos []any `json:"subtitle_infos"` // we don't have examples of this data yet... - ZoomCover TikTokZoomCover `json:"zoom_cover"` - VolumeInfo TikTokVolumeInfo `json:"volume_info"` - BitrateInfo []TikTokBitrateInfo `json:"bitrate_info"` + SubtitleInfos []any `json:"subtitleInfos"` // we don't have examples of this data yet... 
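// One nuance for the PascalCase tags in the structs just below (Loudness,
// GearName, and friends): decoding already worked with the old lowercase tags,
// because encoding/json falls back to a case-insensitive key match. The rename
// matters when these structs are marshalled again, since Marshal always emits
// the exact tag. A minimal sketch with a hypothetical struct:
//
//	type volume struct {
//		Loudness float64 `json:"loudness"` // old tag
//	}
//
//	var v volume
//	_ = json.Unmarshal([]byte(`{"Loudness":-11.5}`), &v) // decodes: case-insensitive fallback
//	out, _ := json.Marshal(v)
//	fmt.Println(string(out)) // {"loudness":-11.5}, no longer the shape the actor emits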
+ ZoomCover TikTokZoomCover `json:"zoomCover"` + VolumeInfo TikTokVolumeInfo `json:"volumeInfo"` + BitrateInfo []TikTokBitrateInfo `json:"bitrateInfo"` } type TikTokZoomCover struct { @@ -104,71 +104,71 @@ type TikTokZoomCover struct { } type TikTokVolumeInfo struct { - Loudness float64 `json:"loudness"` - Peak float64 `json:"peak"` + Loudness float64 `json:"Loudness"` + Peak float64 `json:"Peak"` } type TikTokBitrateInfo struct { - GearName string `json:"gear_name"` + GearName string `json:"GearName"` Bitrate int `json:"bitrate"` - QualityType int `json:"quality_type"` - PlayAddr TikTokPlayAddr `json:"play_addr"` - CodecType string `json:"codec_type"` + QualityType int `json:"QualityType"` + PlayAddr TikTokPlayAddr `json:"PlayAddr"` + CodecType string `json:"CodecType"` } type TikTokPlayAddr struct { - Uri string `json:"uri"` - UrlList []string `json:"url_list"` - DataSize string `json:"data_size"` - UrlKey string `json:"url_key"` - FileHash string `json:"file_hash"` - FileCs string `json:"file_cs"` + Uri string `json:"Uri"` + UrlList []string `json:"UrlList"` + DataSize string `json:"DataSize"` + UrlKey string `json:"UrlKey"` + FileHash string `json:"FileHash"` + FileCs string `json:"FileCs"` } type TikTokMusic struct { ID string `json:"id"` Title string `json:"title"` - PlayURL string `json:"play_url"` - CoverLarge string `json:"cover_large"` - CoverMedium string `json:"cover_medium"` - CoverThumb string `json:"cover_thumb"` - AuthorName string `json:"author_name"` + PlayURL string `json:"playUrl"` + CoverLarge string `json:"coverLarge"` + CoverMedium string `json:"coverMedium"` + CoverThumb string `json:"coverThumb"` + AuthorName string `json:"authorName"` Original bool `json:"original"` Duration int `json:"duration"` Album string `json:"album"` - ScheduleSearchTime int64 `json:"schedule_search_time"` + ScheduleSearchTime int64 `json:"scheduleSearchTime"` } type TikTokStats struct { - DiggCount int64 `json:"digg_count"` - ShareCount int64 `json:"share_count"` - CommentCount int64 `json:"comment_count"` - PlayCount int64 `json:"play_count"` + DiggCount int64 `json:"diggCount"` + ShareCount int64 `json:"shareCount"` + CommentCount int64 `json:"commentCount"` + PlayCount int64 `json:"playCount"` } type TikTokDuetInfo struct { - DuetFromID string `json:"duet_from_id"` + DuetFromID string `json:"duetFromId"` } type TikTokTextExtra struct { - AwemeID string `json:"aweme_id"` + AwemeID string `json:"awemeId"` Start int `json:"start"` End int `json:"end"` - HashtagID string `json:"hashtag_id"` - HashtagName string `json:"hashtag_name"` + HashtagID string `json:"hashtagId"` + HashtagName string `json:"hashtagName"` Type int `json:"type"` - SubType int `json:"sub_type"` - UserID string `json:"user_id"` - IsCommerce bool `json:"is_commerce"` - UserUniqueID string `json:"user_unique_id"` - SecUID string `json:"sec_uid"` + SubType int `json:"subType"` + UserID string `json:"userId"` + IsCommerce bool `json:"isCommerce"` + UserUniqueID string `json:"userUniqueId"` + SecUID string `json:"secUid"` } type TikTokAuthorStats struct { - FollowerCount int64 `json:"follower_count"` - FollowingCount int64 `json:"following_count"` + FollowerCount int64 `json:"followerCount"` + FollowingCount int64 `json:"followingCount"` Heart int64 `json:"heart"` - HeartCount int64 `json:"heart_count"` - VideoCount int64 `json:"video_count"` - DiggCount int64 `json:"digg_count"` + HeartCount int64 `json:"heartCount"` + VideoCount int64 `json:"videoCount"` + DiggCount int64 `json:"diggCount"` } From 
2b084a9ce06142fe870ff2e96dc839e2bc247582 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 26 Aug 2025 19:19:38 +0200 Subject: [PATCH 084/136] fix: constants --- args/tiktok.go | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/args/tiktok.go b/args/tiktok.go index 36b1b3c..196a967 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -10,6 +10,19 @@ import ( teetypes "github.com/masa-finance/tee-types/types" ) +// Period constants for TikTok trending search +const ( + periodWeek string = "7" + periodMonth string = "30" +) + +const ( + sortTrending string = "vv" + sortLike string = "like" + sortComment string = "comment" + sortRepost string = "repost" +) + // TikTokTranscriptionArguments defines args for TikTok transcriptions type TikTokTranscriptionArguments struct { VideoURL string `json:"video_url"` @@ -189,7 +202,7 @@ type TikTokSearchByTrendingArguments struct { CountryCode string `json:"country_code,omitempty"` SortBy string `json:"sort_by,omitempty"` MaxItems int `json:"max_items,omitempty"` - Period string `json:"period,omitempty"` // "7" or "30" + Period string `json:"period,omitempty"` } func (t *TikTokSearchByTrendingArguments) UnmarshalJSON(data []byte) error { @@ -203,31 +216,43 @@ func (t *TikTokSearchByTrendingArguments) UnmarshalJSON(data []byte) error { t.CountryCode = "US" } if t.SortBy == "" { - t.SortBy = "vv" + t.SortBy = sortTrending } if t.Period == "" { - t.Period = "7" + t.Period = periodWeek } return t.Validate() } func (t *TikTokSearchByTrendingArguments) Validate() error { + allowedSorts := map[string]struct{}{ + sortTrending: {}, sortLike: {}, sortComment: {}, sortRepost: {}, + } + + allowedPeriods := map[string]struct{}{ + periodWeek: {}, + periodMonth: {}, + } + allowedCountries := map[string]struct{}{ "AU": {}, "BR": {}, "CA": {}, "EG": {}, "FR": {}, "DE": {}, "ID": {}, "IL": {}, "IT": {}, "JP": {}, "MY": {}, "PH": {}, "RU": {}, "SA": {}, "SG": {}, "KR": {}, "ES": {}, "TW": {}, "TH": {}, "TR": {}, "AE": {}, "GB": {}, "US": {}, "VN": {}, } + if _, ok := allowedCountries[strings.ToUpper(t.CountryCode)]; !ok { return fmt.Errorf("invalid country_code '%s'", t.CountryCode) } - allowedSorts := map[string]struct{}{ - "vv": {}, "like": {}, "comment": {}, "repost": {}, - } if _, ok := allowedSorts[strings.ToLower(t.SortBy)]; !ok { return fmt.Errorf("invalid sort_by '%s'", t.SortBy) } - if t.Period != "7" && t.Period != "30" { - return fmt.Errorf("invalid period '%s' (allowed: '7','30')", t.Period) + if _, ok := allowedPeriods[t.Period]; !ok { + // Extract keys for error message + var validKeys []string + for key := range allowedPeriods { + validKeys = append(validKeys, key) + } + return fmt.Errorf("invalid period '%s' (allowed: %s)", t.Period, strings.Join(validKeys, ", ")) } if t.MaxItems < 0 { return fmt.Errorf("max_items must be non-negative, got: %d", t.MaxItems) From 275091f74a0ad8dec71394c0d974fcfc244f80bd Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 26 Aug 2025 19:21:26 +0200 Subject: [PATCH 085/136] chore: uint types instead of int for search by query args --- args/tiktok.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/args/tiktok.go b/args/tiktok.go index 196a967..f2ff45b 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -154,8 +154,8 @@ type TikTokSearchByQueryArguments struct { Search []string `json:"search,omitempty"` StartUrls []string `json:"start_urls,omitempty"` - MaxItems int `json:"max_items,omitempty"` - EndPage int `json:"end_page,omitempty"` + 
MaxItems uint `json:"max_items,omitempty"` + EndPage uint `json:"end_page,omitempty"` Proxy *TikTokApifyProxySetting `json:"proxy,omitempty"` } From 33214de81dd11744793c27a8ee0d7a8dcd39deb9 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 26 Aug 2025 19:25:08 +0200 Subject: [PATCH 086/136] chore: force use of proxy in tiktok --- args/tiktok.go | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/args/tiktok.go b/args/tiktok.go index f2ff45b..1abe754 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -143,20 +143,13 @@ func (t *TikTokTranscriptionArguments) validateLanguageCode() error { return nil } -// Proxy settings used by Apify input -type TikTokApifyProxySetting struct { - UseApifyProxy bool `json:"use_apify_proxy"` -} - // TikTokSearchByQueryArguments defines args for epctex/tiktok-search-scraper type TikTokSearchByQueryArguments struct { - QueryType string `json:"type"` - - Search []string `json:"search,omitempty"` - StartUrls []string `json:"start_urls,omitempty"` - MaxItems uint `json:"max_items,omitempty"` - EndPage uint `json:"end_page,omitempty"` - Proxy *TikTokApifyProxySetting `json:"proxy,omitempty"` + QueryType string `json:"type"` + Search []string `json:"search,omitempty"` + StartUrls []string `json:"start_urls,omitempty"` + MaxItems uint `json:"max_items,omitempty"` + EndPage uint `json:"end_page,omitempty"` } func (t *TikTokSearchByQueryArguments) UnmarshalJSON(data []byte) error { @@ -166,9 +159,6 @@ func (t *TikTokSearchByQueryArguments) UnmarshalJSON(data []byte) error { return fmt.Errorf("failed to unmarshal TikTok searchbyquery arguments: %w", err) } t.QueryType = strings.ToLower(t.QueryType) - if t.Proxy == nil { - t.Proxy = &TikTokApifyProxySetting{UseApifyProxy: true} - } return t.Validate() } From 0d4c4b7fd8610d7c1e64df1e9b1a0998c91bf726 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 26 Aug 2025 19:27:44 +0200 Subject: [PATCH 087/136] chore: adds tiktok unmarshall comment and simplifies argument validation --- args/tiktok.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/args/tiktok.go b/args/tiktok.go index 1abe754..f35784e 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -108,6 +108,7 @@ type TikTokArguments struct { } func (t *TikTokArguments) UnmarshalJSON(data []byte) error { + // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) type Alias TikTokArguments aux := &struct{ *Alias }{Alias: (*Alias)(t)} if err := json.Unmarshal(data, aux); err != nil { @@ -153,6 +154,7 @@ type TikTokSearchByQueryArguments struct { } func (t *TikTokSearchByQueryArguments) UnmarshalJSON(data []byte) error { + // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) 
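// A note on the int-to-uint switch from PATCH 085 above, sketched with a
// hypothetical struct: once a field is unsigned, encoding/json itself rejects
// negative input with an unmarshal-type error, and a `< 0` comparison on a
// uint can never be true, which is why the explicit negative checks are
// deleted a little further down in this patch rather than kept.
//
//	type limits struct {
//		MaxItems uint `json:"max_items"`
//	}
//
//	var l limits
//	err := json.Unmarshal([]byte(`{"max_items": -1}`), &l)
//	fmt.Println(err) // json: cannot unmarshal number -1 into Go struct field limits.max_items of type uint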
type Alias TikTokSearchByQueryArguments aux := &struct{ *Alias }{Alias: (*Alias)(t)} if err := json.Unmarshal(data, aux); err != nil { @@ -166,12 +168,6 @@ func (t *TikTokSearchByQueryArguments) Validate() error { if len(t.Search) == 0 && len(t.StartUrls) == 0 { return errors.New("either 'search' or 'start_urls' is required for searchbyquery") } - if t.MaxItems < 0 { - return fmt.Errorf("max_items must be non-negative, got: %d", t.MaxItems) - } - if t.EndPage < 0 { - return fmt.Errorf("end_page must be non-negative, got: %d", t.EndPage) - } return nil } @@ -196,6 +192,7 @@ type TikTokSearchByTrendingArguments struct { } func (t *TikTokSearchByTrendingArguments) UnmarshalJSON(data []byte) error { + // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) type Alias TikTokSearchByTrendingArguments aux := &struct{ *Alias }{Alias: (*Alias)(t)} if err := json.Unmarshal(data, aux); err != nil { From 753766dc9918a8cfb8573220cd560cf0f4427cee Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 26 Aug 2025 19:42:25 +0200 Subject: [PATCH 088/136] chore: refactor args --- args/args.go | 25 +++++++++++++++++++++++++ args/tiktok.go | 16 ---------------- args/unmarshaller.go | 4 ++-- 3 files changed, 27 insertions(+), 18 deletions(-) create mode 100644 args/args.go diff --git a/args/args.go b/args/args.go new file mode 100644 index 0000000..d928ca1 --- /dev/null +++ b/args/args.go @@ -0,0 +1,25 @@ +package args + +import ( + "encoding/json" + "fmt" + "strings" +) + +// QueryTypeArgument provides a minimal structure to extract the QueryType (json "type") +// This is used across different job types to determine the specific capability being requested +type QueryTypeArgument struct { + QueryType string `json:"type"` +} + +// UnmarshalJSON implements custom JSON unmarshaling with normalization +func (q *QueryTypeArgument) UnmarshalJSON(data []byte) error { + // Prevent infinite recursion + type Alias QueryTypeArgument + aux := &struct{ *Alias }{Alias: (*Alias)(q)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("failed to unmarshal QueryType arguments: %w", err) + } + q.QueryType = strings.ToLower(q.QueryType) + return nil +} diff --git a/args/tiktok.go b/args/tiktok.go index f35784e..0e015c5 100644 --- a/args/tiktok.go +++ b/args/tiktok.go @@ -102,22 +102,6 @@ func (t *TikTokTranscriptionArguments) GetLanguageCode() string { return t.Language } -// TikTokArguments provides a minimal structure to extract the QueryType (json "type") -type TikTokArguments struct { - QueryType string `json:"type"` -} - -func (t *TikTokArguments) UnmarshalJSON(data []byte) error { - // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) 
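// How the QueryTypeArgument added above is meant to be used: a two-phase
// decode, sketched here with an illustrative payload. The first pass reads
// only the "type" key to pick the concrete argument struct; the second pass
// decodes the same bytes into that struct. json.Unmarshal ignores unknown
// keys, so the sniffing pass is cheap and conflicts with nothing.
//
//	raw := []byte(`{"type":"SearchByQuery","search":["golang"]}`)
//	var qt QueryTypeArgument
//	if err := json.Unmarshal(raw, &qt); err != nil {
//		return nil, err
//	}
//	if qt.QueryType == "searchbyquery" { // already lower-cased by its UnmarshalJSON
//		a := &TikTokSearchByQueryArguments{}
//		return a, json.Unmarshal(raw, a) // full decode, which also validates
//	}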
- type Alias TikTokArguments - aux := &struct{ *Alias }{Alias: (*Alias)(t)} - if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) - } - t.QueryType = strings.ToLower(t.QueryType) - return nil -} - // ValidateForJobType validates TikTok arguments for a specific job type func (t *TikTokTranscriptionArguments) ValidateForJobType(jobType teetypes.JobType) error { if err := t.Validate(); err != nil { diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 20857f3..c1ce391 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -94,12 +94,12 @@ func unmarshalWebArguments(args map[string]any) (*WebSearchArguments, error) { func unmarshalTikTokArguments(args map[string]any) (JobArguments, error) { // Unmarshal minimally to read QueryType like we do for Twitter - minimal := &TikTokArguments{} + minimal := &QueryTypeArgument{} if err := unmarshalToStruct(args, minimal); err != nil { return nil, fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) } capability := types.Capability(strings.ToLower(minimal.QueryType)) - if capability == types.Capability("") { + if capability == types.CapEmpty { defaultCap, exists := types.JobDefaultCapabilityMap[types.TiktokJob] if !exists { return nil, fmt.Errorf("no default capability configured for job type: %s", types.TiktokJob) From 7d6f09a3acde78359b945638ad21e6e411fe7763 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 26 Aug 2025 19:50:16 +0200 Subject: [PATCH 089/136] chore: simplifies unmarshaller and reorganizes telemetry args --- args/telemetry.go | 16 ++++++++++++++++ args/unmarshaller.go | 43 ------------------------------------------- 2 files changed, 16 insertions(+), 43 deletions(-) create mode 100644 args/telemetry.go diff --git a/args/telemetry.go b/args/telemetry.go new file mode 100644 index 0000000..08fb741 --- /dev/null +++ b/args/telemetry.go @@ -0,0 +1,16 @@ +package args + +import ( + "github.com/masa-finance/tee-types/types" +) + +// TelemetryJobArguments for telemetry jobs (simple case) +type TelemetryJobArguments struct{} + +func (t *TelemetryJobArguments) Validate() error { + return nil +} + +func (t *TelemetryJobArguments) GetCapability() types.Capability { + return types.CapTelemetry +} diff --git a/args/unmarshaller.go b/args/unmarshaller.go index c1ce391..bd44121 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -219,47 +219,4 @@ func unmarshalToStruct(args map[string]any, target any) error { return nil } -// TelemetryJobArguments for telemetry jobs (simple case) -type TelemetryJobArguments struct{} -func (t *TelemetryJobArguments) Validate() error { - return nil -} - -func (t *TelemetryJobArguments) GetCapability() types.Capability { - return types.CapTelemetry -} - -// Type assertion helpers -func AsWebArguments(args JobArguments) (*WebSearchArguments, bool) { - webArgs, ok := args.(*WebSearchArguments) - return webArgs, ok -} - -func AsTwitterArguments(args JobArguments) (TwitterJobArguments, bool) { - twitterArgs, ok := args.(*TwitterSearchArguments) - if !ok { - return nil, false - } - return twitterArgs, true -} - -// Use specific helpers for TikTok argument types: -// - AsTikTokTranscriptionArguments -// - AsTikTokSearchByQueryArguments -// - AsTikTokSearchByTrendingArguments - -func AsTikTokTranscriptionArguments(args JobArguments) (*TikTokTranscriptionArguments, bool) { - v, ok := args.(*TikTokTranscriptionArguments) - return v, ok -} - -func AsTikTokSearchByQueryArguments(args JobArguments) 
(*TikTokSearchByQueryArguments, bool) { - v, ok := args.(*TikTokSearchByQueryArguments) - return v, ok -} - -func AsTikTokSearchByTrendingArguments(args JobArguments) (*TikTokSearchByTrendingArguments, bool) { - v, ok := args.(*TikTokSearchByTrendingArguments) - return v, ok -} From 1fdb7ee740e6fdd5bc84cbb35e7e83e04910e4eb Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 26 Aug 2025 19:50:36 +0200 Subject: [PATCH 090/136] chore: unmarshaller --- args/unmarshaller.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/args/unmarshaller.go b/args/unmarshaller.go index bd44121..ce6bb49 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -218,5 +218,3 @@ func unmarshalToStruct(args map[string]any, target any) error { return nil } - - From b42bf7bb17e09d48d6a109e9630563b35eb37c33 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 26 Aug 2025 19:53:07 +0200 Subject: [PATCH 091/136] chore: change default tiktok capability --- types/jobs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/types/jobs.go b/types/jobs.go index f1a4d5e..fe66292 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -159,7 +159,7 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TwitterApiJob: CapSearchByQuery, TwitterApifyJob: CapGetFollowers, WebJob: CapScraper, - TiktokJob: CapSearchByQuery, + TiktokJob: CapTranscription, RedditJob: CapScrapeUrls, TelemetryJob: CapTelemetry, } From 2060f38df5d10af739255311d0d20f083a50e957 Mon Sep 17 00:00:00 2001 From: mcamou Date: Wed, 27 Aug 2025 14:31:17 +0200 Subject: [PATCH 092/136] Allow Reddit post or comment URLs only --- args/reddit.go | 54 ++++++++++++++++++++------------------------- args/reddit_test.go | 40 ++++++++++++--------------------- 2 files changed, 38 insertions(+), 56 deletions(-) diff --git a/args/reddit.go b/args/reddit.go index 41bc17e..68e73de 100644 --- a/args/reddit.go +++ b/args/reddit.go @@ -39,20 +39,20 @@ var allowedHttpMethods = util.NewSet("GET", "POST", "PUT", "DELETE", "HEAD", "OP // RedditArguments defines args for Reddit scrapes // see https://apify.com/trudax/reddit-scraper type RedditArguments struct { - QueryType teetypes.RedditQueryType `json:"type"` - Queries []string `json:"queries"` - URLs []teetypes.RedditStartURL `json:"urls"` - Sort teetypes.RedditSortType `json:"sort"` - IncludeNSFW bool `json:"include_nsfw"` - SkipPosts bool `json:"skip_posts"` // Valid only for searchusers - After time.Time `json:"after"` // valid only for scrapeurls and searchposts - MaxItems uint `json:"max_items"` // Max number of items to scrape (total), default 10 - MaxResults uint `json:"max_results"` // Max number of results per page, default MaxItems - MaxPosts uint `json:"max_posts"` // Max number of posts per page, default 10 - MaxComments uint `json:"max_comments"` // Max number of comments per page, default 10 - MaxCommunities uint `json:"max_communities"` // Max number of communities per page, default 2 - MaxUsers uint `json:"max_users"` // Max number of users per page, default 2 - NextCursor string `json:"next_cursor"` + QueryType teetypes.RedditQueryType `json:"type"` + Queries []string `json:"queries"` + URLs []string `json:"urls"` + Sort teetypes.RedditSortType `json:"sort"` + IncludeNSFW bool `json:"include_nsfw"` + SkipPosts bool `json:"skip_posts"` // Valid only for searchusers + After time.Time `json:"after"` // valid only for scrapeurls and searchposts + MaxItems uint `json:"max_items"` // Max number of items to scrape (total), default 10 + MaxResults uint `json:"max_results"` // Max number of results per 
page, default MaxItems
+	MaxPosts       uint                     `json:"max_posts"`       // Max number of posts per page, default 10
+	MaxComments    uint                     `json:"max_comments"`    // Max number of comments per page, default 10
+	MaxCommunities uint                     `json:"max_communities"` // Max number of communities per page, default 2
+	MaxUsers       uint                     `json:"max_users"`       // Max number of users per page, default 2
+	NextCursor     string                   `json:"next_cursor"`
 }
 
 func (r *RedditArguments) UnmarshalJSON(data []byte) error {
@@ -99,15 +99,6 @@ func (r *RedditArguments) setDefaultValues() {
 
 	r.QueryType = teetypes.RedditQueryType(strings.ToLower(string(r.QueryType)))
 	r.Sort = teetypes.RedditSortType(strings.ToLower(string(r.Sort)))
-
-	if r.QueryType == teetypes.RedditScrapeUrls {
-		for i, q := range r.URLs {
-			r.URLs[i].Method = strings.ToUpper(q.Method)
-			if q.Method == "" {
-				r.URLs[i].Method = "GET"
-			}
-		}
-	}
 }
 
 func (r *RedditArguments) Validate() error {
@@ -137,16 +128,19 @@ func (r *RedditArguments) Validate() error {
 		errs = append(errs, ErrRedditQueriesNotAllowed)
 	}
 
-	for _, q := range r.URLs {
-		if !allowedHttpMethods.Contains(q.Method) {
-			errs = append(errs, fmt.Errorf("%s is not a valid HTTP method", q.Method))
-		}
-		u, err := url.Parse(q.URL)
+	for _, u := range r.URLs {
+		parsed, err := url.Parse(u)
 		if err != nil {
-			errs = append(errs, fmt.Errorf("%s is not a valid URL", q.URL))
+			errs = append(errs, fmt.Errorf("%s is not a valid URL", u))
 		} else {
-			if !strings.HasSuffix(strings.ToLower(u.Host), redditDomainSuffix) {
-				errs = append(errs, fmt.Errorf("invalid Reddit URL %s", q.URL))
+			if h := strings.ToLower(parsed.Host); h != redditDomainSuffix && !strings.HasSuffix(h, "."+redditDomainSuffix) {
+				errs = append(errs, fmt.Errorf("invalid Reddit URL %s", u))
+			}
+			if !strings.HasPrefix(parsed.Path, "/r/") {
+				errs = append(errs, fmt.Errorf("%s is not a Reddit post or comment URL (missing /r/)", u))
+			}
+			if !strings.Contains(parsed.Path, "/comments/") {
+				errs = append(errs, fmt.Errorf("%s is not a Reddit post or comment URL (missing /comments/)", u))
 			}
 		}
 	}
diff --git a/args/reddit_test.go b/args/reddit_test.go
index 4a6666a..251ff67 100644
--- a/args/reddit_test.go
+++ b/args/reddit_test.go
@@ -71,10 +71,8 @@ var _ = Describe("RedditArguments", func() {
 		It("should succeed with valid scrapeurls arguments", func() {
 			redditArgs := &args.RedditArguments{
 				QueryType: types.RedditScrapeUrls,
-				URLs: []types.RedditStartURL{
-					{URL: "https://www.reddit.com/r/golang/", Method: "GET"},
-				},
-				Sort: types.RedditSortNew,
+				URLs:      []string{"https://www.reddit.com/r/golang/comments/foo/bar"},
+				Sort:      types.RedditSortNew,
 			}
 			err := redditArgs.Validate()
 			Expect(err).ToNot(HaveOccurred())
@@ -133,10 +131,8 @@ var _ = Describe("RedditArguments", func() {
 			redditArgs := &args.RedditArguments{
 				QueryType: types.RedditScrapeUrls,
 				Queries:   []string{"test"},
-				URLs: []types.RedditStartURL{
-					{URL: "https://www.reddit.com/r/golang/", Method: "GET"},
-				},
-				Sort: types.RedditSortNew,
+				URLs:      []string{"https://www.reddit.com/r/golang/comments/foo/bar/"},
+				Sort:      types.RedditSortNew,
 			}
 			err := redditArgs.Validate()
 			Expect(err).To(MatchError(args.ErrRedditQueriesNotAllowed))
@@ -146,10 +142,8 @@ var _ = Describe("RedditArguments", func() {
 			redditArgs := &args.RedditArguments{
 				QueryType: types.RedditSearchPosts,
 				Queries:   []string{"test"},
-				URLs: []types.RedditStartURL{
-					{URL: "https://www.reddit.com/r/golang/", Method: "GET"},
-				},
-				Sort: types.RedditSortNew,
+				URLs:      []string{"https://www.reddit.com/r/golang/comments/foo/bar"},
+				Sort:      types.RedditSortNew,
 			}
 			err := redditArgs.Validate()
 			Expect(err).To(MatchError(args.ErrRedditUrlsNotAllowed))
@@ -158,10 +152,8 @@ var _ = Describe("RedditArguments",
func() { It("should fail with an invalid URL", func() { redditArgs := &args.RedditArguments{ QueryType: types.RedditScrapeUrls, - URLs: []types.RedditStartURL{ - {URL: "ht tp://invalid-url.com", Method: "GET"}, - }, - Sort: types.RedditSortNew, + URLs: []string{"ht tp://invalid-url.com"}, + Sort: types.RedditSortNew, } err := redditArgs.Validate() Expect(err).To(HaveOccurred()) @@ -171,27 +163,23 @@ var _ = Describe("RedditArguments", func() { It("should fail with an invalid domain", func() { redditArgs := &args.RedditArguments{ QueryType: types.RedditScrapeUrls, - URLs: []types.RedditStartURL{ - {URL: "https://www.google.com", Method: "GET"}, - }, - Sort: types.RedditSortNew, + URLs: []string{"https://www.google.com"}, + Sort: types.RedditSortNew, } err := redditArgs.Validate() Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("invalid Reddit URL")) }) - It("should fail with an invalid HTTP method", func() { + It("should fail if the URL is not a post or comment", func() { redditArgs := &args.RedditArguments{ QueryType: types.RedditScrapeUrls, - URLs: []types.RedditStartURL{ - {URL: "https://www.reddit.com/r/golang/", Method: "INVALID"}, - }, - Sort: types.RedditSortNew, + URLs: []string{"https://www.reddit.com/r/golang/"}, + Sort: types.RedditSortNew, } err := redditArgs.Validate() Expect(err).To(HaveOccurred()) - Expect(err.Error()).To(ContainSubstring("is not a valid HTTP method")) + Expect(err.Error()).To(ContainSubstring("not a Reddit post or comment URL")) }) }) }) From c0ceec1a743cea99599b35571d68ab07478232dc Mon Sep 17 00:00:00 2001 From: mcamou Date: Wed, 27 Aug 2025 14:35:26 +0200 Subject: [PATCH 093/136] Lint --- args/reddit.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/args/reddit.go b/args/reddit.go index 68e73de..c2ac4e5 100644 --- a/args/reddit.go +++ b/args/reddit.go @@ -8,7 +8,6 @@ import ( "strings" "time" - "github.com/masa-finance/tee-types/pkg/util" teetypes "github.com/masa-finance/tee-types/types" ) @@ -34,8 +33,6 @@ const ( const redditDomainSuffix = "reddit.com" -var allowedHttpMethods = util.NewSet("GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS") - // RedditArguments defines args for Reddit scrapes // see https://apify.com/trudax/reddit-scraper type RedditArguments struct { From bcf33772c0184d5dbfbcc3a7227ec70cee3e0f57 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 27 Aug 2025 18:44:07 +0200 Subject: [PATCH 094/136] fix: use aux instead of q --- args/args.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/args/args.go b/args/args.go index d928ca1..ab7abe4 100644 --- a/args/args.go +++ b/args/args.go @@ -20,6 +20,6 @@ func (q *QueryTypeArgument) UnmarshalJSON(data []byte) error { if err := json.Unmarshal(data, aux); err != nil { return fmt.Errorf("failed to unmarshal QueryType arguments: %w", err) } - q.QueryType = strings.ToLower(q.QueryType) + q.QueryType = strings.ToLower(aux.QueryType) return nil } From f5b142f53074b7a2716df7d27a7c4ce38db84cdd Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 4 Sep 2025 23:00:24 +0200 Subject: [PATCH 095/136] feat: adds web and llm types and arguments --- args/llm.go | 96 ++++++++++++++++++++++ args/llm_test.go | 164 ++++++++++++++++++++++++++++++++++++++ args/unmarshaller.go | 58 +++----------- args/unmarshaller_test.go | 19 ++++- args/web.go | 86 ++++++++++++-------- args/web_test.go | 158 ++++++++++++++++++++++++++++++++++++ types/jobs.go | 21 +++-- types/llm.go | 15 ++++ types/web.go | 54 +++++++++++++ 9 files changed, 583 insertions(+), 88 deletions(-) create 
mode 100644 args/llm.go
 create mode 100644 args/llm_test.go
 create mode 100644 args/web_test.go
 create mode 100644 types/llm.go
 create mode 100644 types/web.go

diff --git a/args/llm.go b/args/llm.go
new file mode 100644
index 0000000..297e154
--- /dev/null
+++ b/args/llm.go
@@ -0,0 +1,96 @@
+package args
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+
+	teetypes "github.com/masa-finance/tee-types/types"
+)
+
+var (
+	ErrLLMDatasetIdRequired = errors.New("dataset id is required")
+	ErrLLMPromptRequired    = errors.New("prompt is required")
+	ErrLLMMaxTokensNegative = errors.New("max tokens must be non-negative")
+)
+
+const (
+	llmDefaultMaxTokens       = 300
+	llmDefaultTemperature     = "0.1"
+	llmDefaultMultipleColumns = false
+	llmDefaultModel           = "gemini-1.5-flash-8b"
+)
+
+type LLMProcessorArguments struct {
+	QueryType   string `json:"type"`
+	DatasetId   string `json:"dataset_id"`
+	Prompt      string `json:"prompt"`
+	MaxTokens   int    `json:"max_tokens"`
+	Temperature string `json:"temperature"`
+}
+
+// UnmarshalJSON implements custom JSON unmarshaling with validation
+func (l *LLMProcessorArguments) UnmarshalJSON(data []byte) error {
+	// Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...)
+	type Alias LLMProcessorArguments
+	aux := &struct {
+		*Alias
+	}{
+		Alias: (*Alias)(l),
+	}
+
+	if err := json.Unmarshal(data, aux); err != nil {
+		return fmt.Errorf("failed to unmarshal LLM arguments: %w", err)
+	}
+
+	l.setDefaultValues()
+
+	return l.Validate()
+}
+
+func (l *LLMProcessorArguments) setDefaultValues() {
+	if l.MaxTokens == 0 {
+		l.MaxTokens = llmDefaultMaxTokens
+	}
+	if l.Temperature == "" {
+		l.Temperature = llmDefaultTemperature
+	}
+}
+
+func (l *LLMProcessorArguments) Validate() error {
+	if l.DatasetId == "" {
+		return ErrLLMDatasetIdRequired
+	}
+	if l.Prompt == "" {
+		return ErrLLMPromptRequired
+	}
+	if l.MaxTokens < 0 {
+		return fmt.Errorf("%w: got %v", ErrLLMMaxTokensNegative, l.MaxTokens)
+	}
+	return nil
+}
+
+func (l *LLMProcessorArguments) ValidateForJobType(jobType teetypes.JobType) error {
+	if err := l.Validate(); err != nil {
+		return err
+	}
+
+	// Validate QueryType against job-specific capabilities
+	return jobType.ValidateCapability(l.GetCapability())
+}
+
+// GetCapability returns the capability for web operations (always scraper)
+func (l *LLMProcessorArguments) GetCapability() teetypes.Capability {
+	return teetypes.CapDatasetProcessor
+}
+
+func (l LLMProcessorArguments) ToLLMProcessorRequest() teetypes.LLMProcessorRequest {
+	return teetypes.LLMProcessorRequest{
+		InputDatasetId:  l.DatasetId,
+		Prompt:          l.Prompt,
+		MaxTokens:       l.MaxTokens,
+		Temperature:     l.Temperature,
+		MultipleColumns: llmDefaultMultipleColumns,
+		Model:           llmDefaultModel,
+	}
+}
diff --git a/args/llm_test.go b/args/llm_test.go
new file mode 100644
index 0000000..39ab449
--- /dev/null
+++ b/args/llm_test.go
@@ -0,0 +1,164 @@
+package args_test
+
+import (
+	"encoding/json"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. 
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/args" + "github.com/masa-finance/tee-types/types" +) + +var _ = Describe("LLMProcessorArguments", func() { + Describe("Marshalling and unmarshalling", func() { + It("should set default values", func() { + llmArgs := args.LLMProcessorArguments{ + QueryType: "datasetprocessor", + DatasetId: "ds1", + Prompt: "summarize: ${markdown}", + } + jsonData, err := json.Marshal(llmArgs) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &llmArgs) + Expect(err).ToNot(HaveOccurred()) + Expect(llmArgs.MaxTokens).To(Equal(300)) + Expect(llmArgs.Temperature).To(Equal("0.1")) + }) + + It("should override default values", func() { + llmArgs := args.LLMProcessorArguments{ + QueryType: "datasetprocessor", + DatasetId: "ds1", + Prompt: "summarize: ${markdown}", + MaxTokens: 123, + Temperature: "0.7", + } + jsonData, err := json.Marshal(llmArgs) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &llmArgs) + Expect(err).ToNot(HaveOccurred()) + Expect(llmArgs.MaxTokens).To(Equal(123)) + Expect(llmArgs.Temperature).To(Equal("0.7")) + }) + + It("should fail unmarshal when dataset_id is missing", func() { + var llmArgs args.LLMProcessorArguments + jsonData := []byte(`{"type":"datasetprocessor","prompt":"p"}`) + err := json.Unmarshal(jsonData, &llmArgs) + Expect(errors.Is(err, args.ErrLLMDatasetIdRequired)).To(BeTrue()) + }) + + It("should fail unmarshal when prompt is missing", func() { + var llmArgs args.LLMProcessorArguments + jsonData := []byte(`{"type":"datasetprocessor","dataset_id":"ds1"}`) + err := json.Unmarshal(jsonData, &llmArgs) + Expect(errors.Is(err, args.ErrLLMPromptRequired)).To(BeTrue()) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid arguments", func() { + llmArgs := &args.LLMProcessorArguments{ + QueryType: "datasetprocessor", + DatasetId: "ds1", + Prompt: "p", + MaxTokens: 10, + Temperature: "0.2", + } + err := llmArgs.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail when dataset_id is missing", func() { + llmArgs := &args.LLMProcessorArguments{ + QueryType: "datasetprocessor", + Prompt: "p", + MaxTokens: 10, + Temperature: "0.2", + } + err := llmArgs.Validate() + Expect(errors.Is(err, args.ErrLLMDatasetIdRequired)).To(BeTrue()) + }) + + It("should fail when prompt is missing", func() { + llmArgs := &args.LLMProcessorArguments{ + QueryType: "datasetprocessor", + DatasetId: "ds1", + MaxTokens: 10, + Temperature: "0.2", + } + err := llmArgs.Validate() + Expect(errors.Is(err, args.ErrLLMPromptRequired)).To(BeTrue()) + }) + + It("should fail when max tokens is negative", func() { + llmArgs := &args.LLMProcessorArguments{ + QueryType: "datasetprocessor", + DatasetId: "ds1", + Prompt: "p", + MaxTokens: -1, + Temperature: "0.2", + } + err := llmArgs.Validate() + Expect(errors.Is(err, args.ErrLLMMaxTokensNegative)).To(BeTrue()) + Expect(err.Error()).To(ContainSubstring("got -1")) + }) + }) + + Describe("Job capability", func() { + It("should return the datasetprocessor capability", func() { + llmArgs := &args.LLMProcessorArguments{} + Expect(llmArgs.GetCapability()).To(Equal(types.CapDatasetProcessor)) + }) + + It("should validate capability for LLMJob", func() { + llmArgs := &args.LLMProcessorArguments{ + QueryType: "datasetprocessor", + DatasetId: "ds1", + Prompt: "p", + MaxTokens: 1, + Temperature: "0.1", + } + err := llmArgs.ValidateForJobType(types.LLMJob) + Expect(err).ToNot(HaveOccurred()) + }) + }) + + 
Describe("ToLLMProcessorRequest", func() { + It("should map fields and defaults correctly", func() { + llmArgs := args.LLMProcessorArguments{ + QueryType: "datasetprocessor", + DatasetId: "ds1", + Prompt: "p", + MaxTokens: 0, // default applied in To* + Temperature: "", + } + req := llmArgs.ToLLMProcessorRequest() + Expect(req.InputDatasetId).To(Equal("ds1")) + Expect(req.Prompt).To(Equal("p")) + Expect(req.MaxTokens).To(Equal(0)) + Expect(req.Temperature).To(Equal("")) + Expect(req.MultipleColumns).To(BeFalse()) + Expect(req.Model).To(Equal("gemini-1.5-flash-8b")) + }) + + It("should map fields correctly when set", func() { + llmArgs := args.LLMProcessorArguments{ + QueryType: "datasetprocessor", + DatasetId: "ds1", + Prompt: "p", + MaxTokens: 42, + Temperature: "0.7", + } + req := llmArgs.ToLLMProcessorRequest() + Expect(req.InputDatasetId).To(Equal("ds1")) + Expect(req.Prompt).To(Equal("p")) + Expect(req.MaxTokens).To(Equal(42)) + Expect(req.Temperature).To(Equal("0.7")) + Expect(req.MultipleColumns).To(BeFalse()) + Expect(req.Model).To(Equal("gemini-1.5-flash-8b")) + }) + }) +}) diff --git a/args/unmarshaller.go b/args/unmarshaller.go index ce6bb49..69013a2 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -10,52 +10,9 @@ import ( // JobArguments defines the interface that all job arguments must implement type JobArguments interface { - Validate() error GetCapability() types.Capability } -// TwitterJobArguments extends JobArguments for Twitter-specific methods -type TwitterJobArguments interface { - JobArguments - ValidateForJobType(jobType types.JobType) error - IsSingleTweetOperation() bool - IsMultipleTweetOperation() bool - IsSingleProfileOperation() bool - IsMultipleProfileOperation() bool - IsSingleSpaceOperation() bool - IsTrendsOperation() bool -} - -// WebJobArguments extends JobArguments for Web-specific methods -type WebJobArguments interface { - JobArguments - ValidateForJobType(jobType types.JobType) error - IsDeepScrape() bool - HasSelector() bool - GetEffectiveMaxDepth() int -} - -// TikTokJobArguments extends JobArguments for TikTok-specific methods -type TikTokJobArguments interface { - JobArguments - ValidateForJobType(jobType types.JobType) error - HasLanguagePreference() bool - GetVideoURL() string - GetLanguageCode() string -} - -// LinkedInJobArguments extends JobArguments for LinkedIn-specific methods -type LinkedInJobArguments interface { - JobArguments - ValidateForJobType(jobType types.JobType) error -} - -// RedditJobArguments extends JobArguments for Reddit-specific methods -type RedditJobArguments interface { - JobArguments - ValidateForJobType(jobType types.JobType) error -} - // UnmarshalJobArguments unmarshals job arguments from a generic map into the appropriate typed struct // This works with both tee-indexer and tee-worker JobArguments types func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArguments, error) { @@ -63,6 +20,9 @@ func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgum case types.WebJob: return unmarshalWebArguments(args) + case types.LLMJob: + return unmarshalLLMArguments(args) + case types.TiktokJob: return unmarshalTikTokArguments(args) @@ -84,14 +44,22 @@ func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgum } // Helper functions for unmarshaling specific argument types -func unmarshalWebArguments(args map[string]any) (*WebSearchArguments, error) { - webArgs := &WebSearchArguments{} +func unmarshalWebArguments(args map[string]any) 
(*WebArguments, error) { + webArgs := &WebArguments{} if err := unmarshalToStruct(args, webArgs); err != nil { return nil, fmt.Errorf("failed to unmarshal web job arguments: %w", err) } return webArgs, nil } +func unmarshalLLMArguments(args map[string]any) (*LLMProcessorArguments, error) { + llmArgs := &LLMProcessorArguments{} + if err := unmarshalToStruct(args, llmArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal LLM job arguments: %w", err) + } + return llmArgs, nil +} + func unmarshalTikTokArguments(args map[string]any) (JobArguments, error) { // Unmarshal minimally to read QueryType like we do for Twitter minimal := &QueryTypeArgument{} diff --git a/args/unmarshaller_test.go b/args/unmarshaller_test.go index 04e784f..03f14c4 100644 --- a/args/unmarshaller_test.go +++ b/args/unmarshaller_test.go @@ -14,19 +14,32 @@ var _ = Describe("Unmarshaller", func() { It("should unmarshal the arguments correctly", func() { argsMap := map[string]any{ "url": "https://example.com", - "selector": "h1", "max_depth": 2, } jobArgs, err := args.UnmarshalJobArguments(types.WebJob, argsMap) Expect(err).ToNot(HaveOccurred()) - webArgs, ok := jobArgs.(*args.WebSearchArguments) + webArgs, ok := jobArgs.(*args.WebArguments) Expect(ok).To(BeTrue()) Expect(webArgs.URL).To(Equal("https://example.com")) - Expect(webArgs.Selector).To(Equal("h1")) Expect(webArgs.MaxDepth).To(Equal(2)) }) }) + Context("with a LLMJob", func() { + It("should unmarshal the arguments correctly", func() { + argsMap := map[string]any{ + "dataset_id": "123", + "prompt": "summarize the content of this webpage: ${markdown}", + } + jobArgs, err := args.UnmarshalJobArguments(types.LLMJob, argsMap) + Expect(err).ToNot(HaveOccurred()) + llmArgs, ok := jobArgs.(*args.LLMProcessorArguments) + Expect(ok).To(BeTrue()) + Expect(llmArgs.DatasetId).To(Equal("123")) + Expect(llmArgs.Prompt).To(Equal("summarize the content of this webpage: ${markdown}")) + }) + }) + Context("with a TiktokJob", func() { It("should unmarshal the arguments correctly", func() { argsMap := map[string]any{ diff --git a/args/web.go b/args/web.go index 33a466d..9f6589b 100644 --- a/args/web.go +++ b/args/web.go @@ -2,24 +2,39 @@ package args import ( "encoding/json" + "errors" "fmt" "net/url" - "github.com/masa-finance/tee-types/pkg/util" teetypes "github.com/masa-finance/tee-types/types" ) -type WebSearchArguments struct { - URL string `json:"url"` - Selector string `json:"selector"` - Depth int `json:"depth"` - MaxDepth int `json:"max_depth"` +var ( + ErrWebURLRequired = errors.New("url is required") + ErrWebURLInvalid = errors.New("invalid URL format") + ErrWebURLSchemeMissing = errors.New("url must include a scheme (http:// or https://)") + ErrWebMaxDepth = errors.New("max depth must be non-negative") + ErrWebMaxPages = errors.New("max pages must be at least 1") +) + +const ( + webDefaultMaxPages = 1 + webDefaultMethod = "GET" + webDefaultRespectRobotsTxtFile = false + webDefaultSaveMarkdown = true +) + +type WebArguments struct { + QueryType teetypes.WebQueryType `json:"type"` + URL string `json:"url"` + MaxDepth int `json:"max_depth"` + MaxPages int `json:"max_pages"` } // UnmarshalJSON implements custom JSON unmarshaling with validation -func (w *WebSearchArguments) UnmarshalJSON(data []byte) error { +func (w *WebArguments) UnmarshalJSON(data []byte) error { // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) 
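// The sentinel errors declared above pair with %w wrapping so callers can test
// the failure class while the message still carries the offending value. A
// minimal sketch of the intended caller side (the input here is illustrative):
//
//	var w WebArguments
//	err := json.Unmarshal([]byte(`{"type":"scraper","url":"https://x.dev","max_depth":-1}`), &w)
//	fmt.Println(errors.Is(err, ErrWebMaxDepth)) // true, matches through the wrap
//	fmt.Println(err)                            // max depth must be non-negative: got -1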
- type Alias WebSearchArguments + type Alias WebArguments aux := &struct { *Alias }{ @@ -27,46 +42,50 @@ func (w *WebSearchArguments) UnmarshalJSON(data []byte) error { } if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal Web arguments: %w", err) + return fmt.Errorf("failed to unmarshal web arguments: %w", err) } + w.setDefaultValues() + return w.Validate() } +func (w *WebArguments) setDefaultValues() { + if w.MaxPages == 0 { + w.MaxPages = webDefaultMaxPages + } +} + // Validate validates the Web arguments -func (w *WebSearchArguments) Validate() error { +func (w *WebArguments) Validate() error { if w.URL == "" { - return fmt.Errorf("url is required") + return ErrWebURLRequired } // Validate URL format parsedURL, err := url.Parse(w.URL) if err != nil { - return fmt.Errorf("invalid URL format: %w", err) + return fmt.Errorf("%w: %v", ErrWebURLInvalid, err) } // Ensure URL has a scheme if parsedURL.Scheme == "" { - return fmt.Errorf("URL must include a scheme (http:// or https://)") + return ErrWebURLSchemeMissing } if w.MaxDepth < 0 { - return fmt.Errorf("max_depth must be non-negative, got: %d", w.MaxDepth) + return fmt.Errorf("%w: got %v", ErrWebMaxDepth, w.MaxDepth) } - if w.Depth < 0 { - return fmt.Errorf("depth must be non-negative, got: %d", w.Depth) - } - - if w.Depth > w.MaxDepth && w.MaxDepth > 0 { - return fmt.Errorf("depth (%d) cannot exceed max_depth (%d)", w.Depth, w.MaxDepth) + if w.MaxPages < 1 { + return fmt.Errorf("%w: got %v", ErrWebMaxPages, w.MaxPages) } return nil } // ValidateForJobType validates Web arguments for a specific job type -func (w *WebSearchArguments) ValidateForJobType(jobType teetypes.JobType) error { +func (w *WebArguments) ValidateForJobType(jobType teetypes.JobType) error { if err := w.Validate(); err != nil { return err } @@ -76,21 +95,18 @@ func (w *WebSearchArguments) ValidateForJobType(jobType teetypes.JobType) error } // GetCapability returns the capability for web operations (always scraper) -func (w *WebSearchArguments) GetCapability() teetypes.Capability { +func (w *WebArguments) GetCapability() teetypes.Capability { return teetypes.CapScraper } -// IsDeepScrape returns true if this is a deep scraping operation -func (w *WebSearchArguments) IsDeepScrape() bool { - return w.MaxDepth > 1 || w.Depth > 0 -} - -// HasSelector returns true if a CSS selector is specified -func (w *WebSearchArguments) HasSelector() bool { - return w.Selector != "" -} - -// GetEffectiveMaxDepth returns the effective maximum depth for scraping -func (w *WebSearchArguments) GetEffectiveMaxDepth() int { - return util.Max(w.MaxDepth, 1) +func (w WebArguments) ToWebScraperRequest() teetypes.WebScraperRequest { + return teetypes.WebScraperRequest{ + StartUrls: []teetypes.WebStartURL{ + {URL: w.URL, Method: webDefaultMethod}, + }, + MaxCrawlDepth: w.MaxDepth, + MaxCrawlPages: w.MaxPages, + RespectRobotsTxtFile: webDefaultRespectRobotsTxtFile, + SaveMarkdown: webDefaultSaveMarkdown, + } } diff --git a/args/web_test.go b/args/web_test.go new file mode 100644 index 0000000..77e771f --- /dev/null +++ b/args/web_test.go @@ -0,0 +1,158 @@ +package args_test + +import ( + "encoding/json" + "errors" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/args" + "github.com/masa-finance/tee-types/types" +) + +var _ = Describe("WebArguments", func() { + Describe("Marshalling and unmarshalling", func() { + It("should set default values", func() { + webArgs := args.WebArguments{ + QueryType: types.WebScraper, + URL: "https://example.com", + MaxDepth: 0, + MaxPages: 0, + } + jsonData, err := json.Marshal(webArgs) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &webArgs) + Expect(err).ToNot(HaveOccurred()) + Expect(webArgs.MaxPages).To(Equal(1)) + }) + + It("should override default values", func() { + webArgs := args.WebArguments{ + QueryType: types.WebScraper, + URL: "https://example.com", + MaxDepth: 2, + MaxPages: 5, + } + jsonData, err := json.Marshal(webArgs) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &webArgs) + Expect(err).ToNot(HaveOccurred()) + Expect(webArgs.MaxPages).To(Equal(5)) + }) + + It("should fail unmarshal when url is missing", func() { + var webArgs args.WebArguments + jsonData := []byte(`{"type":"scraper","max_depth":1,"max_pages":1}`) + err := json.Unmarshal(jsonData, &webArgs) + Expect(errors.Is(err, args.ErrWebURLRequired)).To(BeTrue()) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid arguments", func() { + webArgs := &args.WebArguments{ + QueryType: types.WebScraper, + URL: "https://example.com", + MaxDepth: 2, + MaxPages: 3, + } + err := webArgs.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail when url is missing", func() { + webArgs := &args.WebArguments{ + QueryType: types.WebScraper, + MaxDepth: 0, + MaxPages: 1, + } + err := webArgs.Validate() + Expect(errors.Is(err, args.ErrWebURLRequired)).To(BeTrue()) + }) + + It("should fail with an invalid URL format", func() { + webArgs := &args.WebArguments{ + QueryType: types.WebScraper, + URL: "http:// invalid.com", + MaxDepth: 0, + MaxPages: 1, + } + err := webArgs.Validate() + Expect(errors.Is(err, args.ErrWebURLInvalid)).To(BeTrue()) + Expect(err.Error()).To(ContainSubstring("invalid URL format")) + }) + + It("should fail when scheme is missing", func() { + webArgs := &args.WebArguments{ + QueryType: types.WebScraper, + URL: "example.com", + MaxDepth: 0, + MaxPages: 1, + } + err := webArgs.Validate() + Expect(errors.Is(err, args.ErrWebURLSchemeMissing)).To(BeTrue()) + }) + + It("should fail when max depth is negative", func() { + webArgs := &args.WebArguments{ + QueryType: types.WebScraper, + URL: "https://example.com", + MaxDepth: -1, + MaxPages: 1, + } + err := webArgs.Validate() + Expect(errors.Is(err, args.ErrWebMaxDepth)).To(BeTrue()) + Expect(err.Error()).To(ContainSubstring("got -1")) + }) + + It("should fail when max pages is less than 1", func() { + webArgs := &args.WebArguments{ + QueryType: types.WebScraper, + URL: "https://example.com", + MaxDepth: 0, + MaxPages: 0, + } + err := webArgs.Validate() + Expect(errors.Is(err, args.ErrWebMaxPages)).To(BeTrue()) + Expect(err.Error()).To(ContainSubstring("got 0")) + }) + }) + + Describe("Job capability", func() { + It("should return the scraper capability", func() { + webArgs := &args.WebArguments{} + Expect(webArgs.GetCapability()).To(Equal(types.CapScraper)) + }) + + It("should validate capability for WebJob", func() { + webArgs := &args.WebArguments{ + QueryType: types.WebScraper, + URL: "https://example.com", + MaxDepth: 1, + MaxPages: 1, + } + err := webArgs.ValidateForJobType(types.WebJob) + Expect(err).ToNot(HaveOccurred()) + }) + }) 
+ + Describe("ToWebScraperRequest", func() { + It("should map fields correctly", func() { + webArgs := args.WebArguments{ + QueryType: types.WebScraper, + URL: "https://example.com", + MaxDepth: 2, + MaxPages: 3, + } + req := webArgs.ToWebScraperRequest() + Expect(req.StartUrls).To(HaveLen(1)) + Expect(req.StartUrls[0].URL).To(Equal("https://example.com")) + Expect(req.StartUrls[0].Method).To(Equal("GET")) + Expect(req.MaxCrawlDepth).To(Equal(2)) + Expect(req.MaxCrawlPages).To(Equal(3)) + Expect(req.RespectRobotsTxtFile).To(BeFalse()) + Expect(req.SaveMarkdown).To(BeTrue()) + }) + }) +}) diff --git a/types/jobs.go b/types/jobs.go index fe66292..8619cd0 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -24,7 +24,7 @@ func (j JobType) ValidateCapability(capability Capability) error { } if !slices.Contains(validCaps, capability) { - return fmt.Errorf("capability '%s' is not valid for job type '%s'. Valid capabilities: %v", + return fmt.Errorf("capability '%s' is not valid for job type '%s'. valid capabilities: %v", capability, j, validCaps) } @@ -51,6 +51,7 @@ const ( TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify LinkedInJob JobType = "linkedin" // LinkedIn scraping, keeping for unmarshalling logic RedditJob JobType = "reddit" // Reddit scraping with Apify + LLMJob JobType = "llm" // LLM processing ) // Capability constants - typed to prevent typos and enable discoverability @@ -74,26 +75,26 @@ const ( CapGetFollowing Capability = "getfollowing" CapGetFollowers Capability = "getfollowers" CapGetSpace Capability = "getspace" - CapGetProfile Capability = "getprofile" // LinkedIn get profile capability + CapGetProfile Capability = "getprofile" // Reddit capabilities CapScrapeUrls Capability = "scrapeurls" CapSearchPosts Capability = "searchposts" CapSearchUsers Capability = "searchusers" CapSearchCommunities Capability = "searchcommunities" + // LLM capabilities + CapDatasetProcessor Capability = "datasetprocessor" CapEmpty Capability = "" ) // Capability group constants for easy reuse var ( - AlwaysAvailableWebCaps = []Capability{CapScraper, CapEmpty} AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry, CapEmpty} AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapEmpty} AlwaysAvailableLinkedInCaps = []Capability{CapSearchByQuery, CapGetProfile, CapEmpty} // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ - WebJob: AlwaysAvailableWebCaps, TelemetryJob: AlwaysAvailableTelemetryCaps, TiktokJob: AlwaysAvailableTiktokCaps, } @@ -118,6 +119,12 @@ var ( // RedditCaps are all the Reddit capabilities (only available with Apify) RedditCaps = []Capability{CapScrapeUrls, CapSearchPosts, CapSearchUsers, CapSearchCommunities} + + // WebCaps are all the Web capabilities (only available with Apify) + WebCaps = []Capability{CapScraper, CapEmpty} + + // LLMCaps are all the LLM capabilities (only available with Apify) + LLMCaps = []Capability{CapDatasetProcessor, CapEmpty} ) // JobCapabilityMap defines which capabilities are valid for each job type @@ -137,7 +144,10 @@ var JobCapabilityMap = map[JobType][]Capability{ TwitterApifyJob: TwitterApifyCaps, // Web job capabilities - WebJob: AlwaysAvailableWebCaps, + WebJob: WebCaps, + + // LLM job capabilities + LLMJob: LLMCaps, // TikTok job capabilities TiktokJob: combineCapabilities( @@ -159,6 +169,7 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TwitterApiJob: CapSearchByQuery, TwitterApifyJob: 
CapGetFollowers, WebJob: CapScraper, + LLMJob: CapDatasetProcessor, TiktokJob: CapTranscription, RedditJob: CapScrapeUrls, TelemetryJob: CapTelemetry, diff --git a/types/llm.go b/types/llm.go new file mode 100644 index 0000000..fb67693 --- /dev/null +++ b/types/llm.go @@ -0,0 +1,15 @@ +package types + +type LLMProcessorRequest struct { + InputDatasetId string `json:"inputDatasetId"` + LLMProviderApiKey string `json:"llmProviderApiKey"` // encrypted api key by miner + Model string `json:"model"` + MultipleColumns bool `json:"multipleColumns"` + Prompt string `json:"prompt"` // example: summarize the content of this webpage: ${markdown} + Temperature string `json:"temperature"` + MaxTokens int `json:"maxTokens"` +} + +type LLMProcessorResult struct { + LLMResponse string `json:"llmresponse"` +} diff --git a/types/web.go b/types/web.go new file mode 100644 index 0000000..85f0d4d --- /dev/null +++ b/types/web.go @@ -0,0 +1,54 @@ +package types + +import ( + "time" +) + +// WebStartURL represents a single start URL configuration for web scraping +type WebStartURL struct { + URL string `json:"url"` + Method string `json:"method"` +} + +type WebQueryType string + +const ( + WebScraper WebQueryType = "scraper" +) + +// WebScraperRequest represents the customizable configuration for web scraping operations +type WebScraperRequest struct { + StartUrls []WebStartURL `json:"startUrls"` + MaxCrawlDepth int `json:"maxCrawlDepth"` + MaxCrawlPages int `json:"maxCrawlPages"` + RespectRobotsTxtFile bool `json:"respectRobotsTxtFile"` + SaveMarkdown bool `json:"saveMarkdown"` +} + +// WebCrawlInfo contains information about the crawling process +type WebCrawlInfo struct { + LoadedURL string `json:"loadedUrl"` + LoadedTime time.Time `json:"loadedTime"` + ReferrerURL string `json:"referrerUrl"` + Depth int `json:"depth"` + HTTPStatusCode int `json:"httpStatusCode"` +} + +// WebMetadata contains metadata extracted from the scraped page +type WebMetadata struct { + CanonicalURL string `json:"canonicalUrl"` + Title string `json:"title"` + Description *string `json:"description"` + Author *string `json:"author"` + Keywords *string `json:"keywords"` + LanguageCode *string `json:"languageCode"` +} + +// WebScraperResult represents the complete result from web scraping a single page +type WebScraperResult struct { + URL string `json:"url"` + Crawl WebCrawlInfo `json:"crawl"` + Metadata WebMetadata `json:"metadata"` + Text string `json:"text"` + Markdown string `json:"markdown"` +} From 79aa76bbab06e18b892c14d3e3b940da06f04979 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Fri, 5 Sep 2025 20:42:13 +0200 Subject: [PATCH 096/136] chore: cleanup comments --- args/llm.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/args/llm.go b/args/llm.go index 297e154..fe67e07 100644 --- a/args/llm.go +++ b/args/llm.go @@ -79,7 +79,7 @@ func (l *LLMProcessorArguments) ValidateForJobType(jobType teetypes.JobType) err return jobType.ValidateCapability(l.GetCapability()) } -// GetCapability returns the capability for web operations (always scraper) +// GetCapability returns the capability for llm operations (always datasetprocessor currently) func (l *LLMProcessorArguments) GetCapability() teetypes.Capability { return teetypes.CapDatasetProcessor } @@ -90,7 +90,7 @@ func (l LLMProcessorArguments) ToLLMProcessorRequest() teetypes.LLMProcessorRequ Prompt: l.Prompt, MaxTokens: l.MaxTokens, Temperature: l.Temperature, - MultipleColumns: llmDefaultMultipleColumns, - Model: llmDefaultModel, + MultipleColumns: 
llmDefaultMultipleColumns, // overrides default in actor API + Model: llmDefaultModel, // overrides default in actor API } } From 5d97ecf581673469b5c9736255be9596dac1a113 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 20:54:11 +0200 Subject: [PATCH 097/136] fix: export LLM defaults --- args/llm.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/args/llm.go b/args/llm.go index fe67e07..93d2aa6 100644 --- a/args/llm.go +++ b/args/llm.go @@ -15,10 +15,10 @@ var ( ) const ( - llmDefaultMaxTokens = 300 - llmDefaultTemperature = "0.1" - llmDefaultMultipleColumns = false - llmDefaultModel = "gemini-1.5-flash-8b" + LLMDefaultMaxTokens = 300 + LLMDefaultTemperature = "0.1" + LLMDefaultMultipleColumns = false + LLMDefaultModel = "gemini-1.5-flash-8b" ) type LLMProcessorArguments struct { @@ -50,10 +50,10 @@ func (l *LLMProcessorArguments) UnmarshalJSON(data []byte) error { func (l *LLMProcessorArguments) setDefaultValues() { if l.MaxTokens == 0 { - l.MaxTokens = llmDefaultMaxTokens + l.MaxTokens = LLMDefaultMaxTokens } if l.Temperature == "" { - l.Temperature = llmDefaultTemperature + l.Temperature = LLMDefaultTemperature } } @@ -85,12 +85,13 @@ func (l *LLMProcessorArguments) GetCapability() teetypes.Capability { } func (l LLMProcessorArguments) ToLLMProcessorRequest() teetypes.LLMProcessorRequest { + return teetypes.LLMProcessorRequest{ InputDatasetId: l.DatasetId, Prompt: l.Prompt, MaxTokens: l.MaxTokens, Temperature: l.Temperature, - MultipleColumns: llmDefaultMultipleColumns, // overrides default in actor API - Model: llmDefaultModel, // overrides default in actor API + MultipleColumns: LLMDefaultMultipleColumns, // overrides default in actor API + Model: LLMDefaultModel, // overrides default in actor API } } From 70e19b68717ceccb593fb41d325858ea44038808 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 20:54:44 +0200 Subject: [PATCH 098/136] fix: export web defaults --- args/web.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/args/web.go b/args/web.go index 9f6589b..561aa59 100644 --- a/args/web.go +++ b/args/web.go @@ -18,10 +18,10 @@ var ( ) const ( - webDefaultMaxPages = 1 - webDefaultMethod = "GET" - webDefaultRespectRobotsTxtFile = false - webDefaultSaveMarkdown = true + WebDefaultMaxPages = 1 + WebDefaultMethod = "GET" + WebDefaultRespectRobotsTxtFile = false + WebDefaultSaveMarkdown = true ) type WebArguments struct { @@ -52,7 +52,7 @@ func (w *WebArguments) UnmarshalJSON(data []byte) error { func (w *WebArguments) setDefaultValues() { if w.MaxPages == 0 { - w.MaxPages = webDefaultMaxPages + w.MaxPages = WebDefaultMaxPages } } @@ -102,11 +102,11 @@ func (w *WebArguments) GetCapability() teetypes.Capability { func (w WebArguments) ToWebScraperRequest() teetypes.WebScraperRequest { return teetypes.WebScraperRequest{ StartUrls: []teetypes.WebStartURL{ - {URL: w.URL, Method: webDefaultMethod}, + {URL: w.URL, Method: WebDefaultMethod}, }, MaxCrawlDepth: w.MaxDepth, MaxCrawlPages: w.MaxPages, - RespectRobotsTxtFile: webDefaultRespectRobotsTxtFile, - SaveMarkdown: webDefaultSaveMarkdown, + RespectRobotsTxtFile: WebDefaultRespectRobotsTxtFile, + SaveMarkdown: WebDefaultSaveMarkdown, } } From 898e896b17a41fc68a00aaedf9950e0414cbb1e7 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 21:49:49 +0200 Subject: [PATCH 099/136] fix: remove llm as a capability - it is not --- args/llm.go | 15 --------------- args/llm_test.go | 28 ---------------------------- args/unmarshaller.go | 
3 --- args/unmarshaller_test.go | 15 --------------- types/jobs.go | 10 ---------- 5 files changed, 71 deletions(-) diff --git a/args/llm.go b/args/llm.go index 93d2aa6..387d92f 100644 --- a/args/llm.go +++ b/args/llm.go @@ -22,7 +22,6 @@ const ( ) type LLMProcessorArguments struct { - QueryType string `json:"type"` DatasetId string `json:"dataset_id"` Prompt string `json:"prompt"` MaxTokens int `json:"max_tokens"` @@ -70,20 +69,6 @@ func (l *LLMProcessorArguments) Validate() error { return nil } -func (l *LLMProcessorArguments) ValidateForJobType(jobType teetypes.JobType) error { - if err := l.Validate(); err != nil { - return err - } - - // Validate QueryType against job-specific capabilities - return jobType.ValidateCapability(l.GetCapability()) -} - -// GetCapability returns the capability for llm operations (always datasetprocessor currently) -func (l *LLMProcessorArguments) GetCapability() teetypes.Capability { - return teetypes.CapDatasetProcessor -} - func (l LLMProcessorArguments) ToLLMProcessorRequest() teetypes.LLMProcessorRequest { return teetypes.LLMProcessorRequest{ diff --git a/args/llm_test.go b/args/llm_test.go index 39ab449..3884ebf 100644 --- a/args/llm_test.go +++ b/args/llm_test.go @@ -8,14 +8,12 @@ import ( . "github.com/onsi/gomega" "github.com/masa-finance/tee-types/args" - "github.com/masa-finance/tee-types/types" ) var _ = Describe("LLMProcessorArguments", func() { Describe("Marshalling and unmarshalling", func() { It("should set default values", func() { llmArgs := args.LLMProcessorArguments{ - QueryType: "datasetprocessor", DatasetId: "ds1", Prompt: "summarize: ${markdown}", } @@ -29,7 +27,6 @@ var _ = Describe("LLMProcessorArguments", func() { It("should override default values", func() { llmArgs := args.LLMProcessorArguments{ - QueryType: "datasetprocessor", DatasetId: "ds1", Prompt: "summarize: ${markdown}", MaxTokens: 123, @@ -61,7 +58,6 @@ var _ = Describe("LLMProcessorArguments", func() { Describe("Validation", func() { It("should succeed with valid arguments", func() { llmArgs := &args.LLMProcessorArguments{ - QueryType: "datasetprocessor", DatasetId: "ds1", Prompt: "p", MaxTokens: 10, @@ -73,7 +69,6 @@ var _ = Describe("LLMProcessorArguments", func() { It("should fail when dataset_id is missing", func() { llmArgs := &args.LLMProcessorArguments{ - QueryType: "datasetprocessor", Prompt: "p", MaxTokens: 10, Temperature: "0.2", @@ -84,7 +79,6 @@ var _ = Describe("LLMProcessorArguments", func() { It("should fail when prompt is missing", func() { llmArgs := &args.LLMProcessorArguments{ - QueryType: "datasetprocessor", DatasetId: "ds1", MaxTokens: 10, Temperature: "0.2", @@ -95,7 +89,6 @@ var _ = Describe("LLMProcessorArguments", func() { It("should fail when max tokens is negative", func() { llmArgs := &args.LLMProcessorArguments{ - QueryType: "datasetprocessor", DatasetId: "ds1", Prompt: "p", MaxTokens: -1, @@ -107,29 +100,9 @@ var _ = Describe("LLMProcessorArguments", func() { }) }) - Describe("Job capability", func() { - It("should return the datasetprocessor capability", func() { - llmArgs := &args.LLMProcessorArguments{} - Expect(llmArgs.GetCapability()).To(Equal(types.CapDatasetProcessor)) - }) - - It("should validate capability for LLMJob", func() { - llmArgs := &args.LLMProcessorArguments{ - QueryType: "datasetprocessor", - DatasetId: "ds1", - Prompt: "p", - MaxTokens: 1, - Temperature: "0.1", - } - err := llmArgs.ValidateForJobType(types.LLMJob) - Expect(err).ToNot(HaveOccurred()) - }) - }) - Describe("ToLLMProcessorRequest", func() { It("should map 
fields and defaults correctly", func() { llmArgs := args.LLMProcessorArguments{ - QueryType: "datasetprocessor", DatasetId: "ds1", Prompt: "p", MaxTokens: 0, // default applied in To* @@ -146,7 +119,6 @@ var _ = Describe("LLMProcessorArguments", func() { It("should map fields correctly when set", func() { llmArgs := args.LLMProcessorArguments{ - QueryType: "datasetprocessor", DatasetId: "ds1", Prompt: "p", MaxTokens: 42, diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 69013a2..f0cbe1f 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -20,9 +20,6 @@ func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgum case types.WebJob: return unmarshalWebArguments(args) - case types.LLMJob: - return unmarshalLLMArguments(args) - case types.TiktokJob: return unmarshalTikTokArguments(args) diff --git a/args/unmarshaller_test.go b/args/unmarshaller_test.go index 03f14c4..4231cbd 100644 --- a/args/unmarshaller_test.go +++ b/args/unmarshaller_test.go @@ -25,21 +25,6 @@ var _ = Describe("Unmarshaller", func() { }) }) - Context("with a LLMJob", func() { - It("should unmarshal the arguments correctly", func() { - argsMap := map[string]any{ - "dataset_id": "123", - "prompt": "summarize the content of this webpage: ${markdown}", - } - jobArgs, err := args.UnmarshalJobArguments(types.LLMJob, argsMap) - Expect(err).ToNot(HaveOccurred()) - llmArgs, ok := jobArgs.(*args.LLMProcessorArguments) - Expect(ok).To(BeTrue()) - Expect(llmArgs.DatasetId).To(Equal("123")) - Expect(llmArgs.Prompt).To(Equal("summarize the content of this webpage: ${markdown}")) - }) - }) - Context("with a TiktokJob", func() { It("should unmarshal the arguments correctly", func() { argsMap := map[string]any{ diff --git a/types/jobs.go b/types/jobs.go index 8619cd0..1a7d46b 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -51,7 +51,6 @@ const ( TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify LinkedInJob JobType = "linkedin" // LinkedIn scraping, keeping for unmarshalling logic RedditJob JobType = "reddit" // Reddit scraping with Apify - LLMJob JobType = "llm" // LLM processing ) // Capability constants - typed to prevent typos and enable discoverability @@ -81,8 +80,6 @@ const ( CapSearchPosts Capability = "searchposts" CapSearchUsers Capability = "searchusers" CapSearchCommunities Capability = "searchcommunities" - // LLM capabilities - CapDatasetProcessor Capability = "datasetprocessor" CapEmpty Capability = "" ) @@ -122,9 +119,6 @@ var ( // WebCaps are all the Web capabilities (only available with Apify) WebCaps = []Capability{CapScraper, CapEmpty} - - // LLMCaps are all the LLM capabilities (only available with Apify) - LLMCaps = []Capability{CapDatasetProcessor, CapEmpty} ) // JobCapabilityMap defines which capabilities are valid for each job type @@ -146,9 +140,6 @@ var JobCapabilityMap = map[JobType][]Capability{ // Web job capabilities WebJob: WebCaps, - // LLM job capabilities - LLMJob: LLMCaps, - // TikTok job capabilities TiktokJob: combineCapabilities( AlwaysAvailableTiktokCaps, @@ -169,7 +160,6 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TwitterApiJob: CapSearchByQuery, TwitterApifyJob: CapGetFollowers, WebJob: CapScraper, - LLMJob: CapDatasetProcessor, TiktokJob: CapTranscription, RedditJob: CapScrapeUrls, TelemetryJob: CapTelemetry, From 1a8ae3d979e86df3e37c04b9755feb5f1e2362ee Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 22:18:03 +0200 Subject: [PATCH 100/136] fix: remove unused function --- args/unmarshaller.go | 8 
-------- 1 file changed, 8 deletions(-) diff --git a/args/unmarshaller.go b/args/unmarshaller.go index f0cbe1f..1d3c26d 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -49,14 +49,6 @@ func unmarshalWebArguments(args map[string]any) (*WebArguments, error) { return webArgs, nil } -func unmarshalLLMArguments(args map[string]any) (*LLMProcessorArguments, error) { - llmArgs := &LLMProcessorArguments{} - if err := unmarshalToStruct(args, llmArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal LLM job arguments: %w", err) - } - return llmArgs, nil -} - func unmarshalTikTokArguments(args map[string]any) (JobArguments, error) { // Unmarshal minimally to read QueryType like we do for Twitter minimal := &QueryTypeArgument{} From 4de7bc7ab1918619ff1c2c668c2032643910eafb Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Tue, 9 Sep 2025 22:30:48 +0200 Subject: [PATCH 101/136] fix: add llm response to web --- types/web.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/types/web.go b/types/web.go index 85f0d4d..dda1cea 100644 --- a/types/web.go +++ b/types/web.go @@ -46,9 +46,10 @@ type WebMetadata struct { // WebScraperResult represents the complete result from web scraping a single page type WebScraperResult struct { - URL string `json:"url"` - Crawl WebCrawlInfo `json:"crawl"` - Metadata WebMetadata `json:"metadata"` - Text string `json:"text"` - Markdown string `json:"markdown"` + URL string `json:"url"` + Crawl WebCrawlInfo `json:"crawl"` + Metadata WebMetadata `json:"metadata"` + Text string `json:"text"` + Markdown string `json:"markdown"` + LLMResponse string `json:"llmresponse,omitempty"` // populated by LLM processor } From 3452fdc6c5bd4c3a2649111031b44b64bdf42011 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 11 Sep 2025 18:27:02 +0200 Subject: [PATCH 102/136] fix: copilot suggestions --- args/llm.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/args/llm.go b/args/llm.go index 387d92f..1816094 100644 --- a/args/llm.go +++ b/args/llm.go @@ -39,7 +39,7 @@ func (l *LLMProcessorArguments) UnmarshalJSON(data []byte) error { } if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal web arguments: %w", err) + return fmt.Errorf("failed to unmarshal llm arguments: %w", err) } l.setDefaultValues() @@ -70,7 +70,6 @@ func (l *LLMProcessorArguments) Validate() error { } func (l LLMProcessorArguments) ToLLMProcessorRequest() teetypes.LLMProcessorRequest { - return teetypes.LLMProcessorRequest{ InputDatasetId: l.DatasetId, Prompt: l.Prompt, From 550b3ceee4809f452cb48fa9f3b98fa0ec0ee2a4 Mon Sep 17 00:00:00 2001 From: Grant Foster Date: Wed, 17 Sep 2025 10:09:10 -0700 Subject: [PATCH 103/136] fix: add items to llm args (#25) * fix: add max pages to llm args * chore: fix test * fix: rename to items * chore: fix test * fix: rename vars * chore: relevant llm args to uint instead of int * chore: temperature as a float * chore: fix test * chore: fix llm test * fix: llm test * fix: temperature test --- args/llm.go | 32 ++++++++++++++-------------- args/llm_test.go | 54 ++++++++++++++---------------------------------- types/llm.go | 6 +++--- 3 files changed, 35 insertions(+), 57 deletions(-) diff --git a/args/llm.go b/args/llm.go index 1816094..2f5c8c6 100644 --- a/args/llm.go +++ b/args/llm.go @@ -4,6 +4,7 @@ import ( "encoding/json" "errors" "fmt" + "strconv" teetypes "github.com/masa-finance/tee-types/types" ) @@ -11,21 +12,22 @@ import ( var ( ErrLLMDatasetIdRequired = errors.New("dataset id 
is required") ErrLLMPromptRequired = errors.New("prompt is required") - ErrLLMMaxTokensNegative = errors.New("max tokens must be non-negative") ) const ( - LLMDefaultMaxTokens = 300 - LLMDefaultTemperature = "0.1" - LLMDefaultMultipleColumns = false - LLMDefaultModel = "gemini-1.5-flash-8b" + LLMDefaultMaxTokens uint = 300 + LLMDefaultTemperature float64 = 0.1 + LLMDefaultMultipleColumns bool = false + LLMDefaultModel string = "gemini-1.5-flash-8b" + LLMDefaultItems uint = 1 ) type LLMProcessorArguments struct { - DatasetId string `json:"dataset_id"` - Prompt string `json:"prompt"` - MaxTokens int `json:"max_tokens"` - Temperature string `json:"temperature"` + DatasetId string `json:"dataset_id"` + Prompt string `json:"prompt"` + MaxTokens uint `json:"max_tokens"` + Temperature float64 `json:"temperature"` + Items uint `json:"items"` } // UnmarshalJSON implements custom JSON unmarshaling with validation @@ -48,11 +50,14 @@ func (l *LLMProcessorArguments) UnmarshalJSON(data []byte) error { } func (l *LLMProcessorArguments) setDefaultValues() { + if l.Temperature == 0 { + l.Temperature = LLMDefaultTemperature + } if l.MaxTokens == 0 { l.MaxTokens = LLMDefaultMaxTokens } - if l.Temperature == "" { - l.Temperature = LLMDefaultTemperature + if l.Items == 0 { + l.Items = LLMDefaultItems } } @@ -63,9 +68,6 @@ func (l *LLMProcessorArguments) Validate() error { if l.Prompt == "" { return ErrLLMPromptRequired } - if l.MaxTokens < 0 { - return fmt.Errorf("%w: got %v", ErrLLMMaxTokensNegative, l.MaxTokens) - } return nil } @@ -74,7 +76,7 @@ func (l LLMProcessorArguments) ToLLMProcessorRequest() teetypes.LLMProcessorRequ InputDatasetId: l.DatasetId, Prompt: l.Prompt, MaxTokens: l.MaxTokens, - Temperature: l.Temperature, + Temperature: strconv.FormatFloat(l.Temperature, 'f', -1, 64), MultipleColumns: LLMDefaultMultipleColumns, // overrides default in actor API Model: LLMDefaultModel, // overrides default in actor API } diff --git a/args/llm_test.go b/args/llm_test.go index 3884ebf..aa35128 100644 --- a/args/llm_test.go +++ b/args/llm_test.go @@ -21,8 +21,9 @@ var _ = Describe("LLMProcessorArguments", func() { Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &llmArgs) Expect(err).ToNot(HaveOccurred()) - Expect(llmArgs.MaxTokens).To(Equal(300)) - Expect(llmArgs.Temperature).To(Equal("0.1")) + Expect(llmArgs.Temperature).To(Equal(0.1)) + Expect(llmArgs.MaxTokens).To(Equal(uint(300))) + Expect(llmArgs.Items).To(Equal(uint(1))) }) It("should override default values", func() { @@ -30,14 +31,16 @@ var _ = Describe("LLMProcessorArguments", func() { DatasetId: "ds1", Prompt: "summarize: ${markdown}", MaxTokens: 123, - Temperature: "0.7", + Temperature: 0.7, + Items: 3, } jsonData, err := json.Marshal(llmArgs) Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &llmArgs) Expect(err).ToNot(HaveOccurred()) - Expect(llmArgs.MaxTokens).To(Equal(123)) - Expect(llmArgs.Temperature).To(Equal("0.7")) + Expect(llmArgs.Temperature).To(Equal(0.7)) + Expect(llmArgs.MaxTokens).To(Equal(uint(123))) + Expect(llmArgs.Items).To(Equal(uint(3))) }) It("should fail unmarshal when dataset_id is missing", func() { @@ -61,7 +64,8 @@ var _ = Describe("LLMProcessorArguments", func() { DatasetId: "ds1", Prompt: "p", MaxTokens: 10, - Temperature: "0.2", + Temperature: 0.2, + Items: 1, } err := llmArgs.Validate() Expect(err).ToNot(HaveOccurred()) @@ -71,7 +75,7 @@ var _ = Describe("LLMProcessorArguments", func() { llmArgs := &args.LLMProcessorArguments{ Prompt: "p", MaxTokens: 10, - Temperature: 
"0.2", + Temperature: 0.2, } err := llmArgs.Validate() Expect(errors.Is(err, args.ErrLLMDatasetIdRequired)).To(BeTrue()) @@ -81,53 +85,25 @@ var _ = Describe("LLMProcessorArguments", func() { llmArgs := &args.LLMProcessorArguments{ DatasetId: "ds1", MaxTokens: 10, - Temperature: "0.2", + Temperature: 0.2, } err := llmArgs.Validate() Expect(errors.Is(err, args.ErrLLMPromptRequired)).To(BeTrue()) }) - - It("should fail when max tokens is negative", func() { - llmArgs := &args.LLMProcessorArguments{ - DatasetId: "ds1", - Prompt: "p", - MaxTokens: -1, - Temperature: "0.2", - } - err := llmArgs.Validate() - Expect(errors.Is(err, args.ErrLLMMaxTokensNegative)).To(BeTrue()) - Expect(err.Error()).To(ContainSubstring("got -1")) - }) }) Describe("ToLLMProcessorRequest", func() { - It("should map fields and defaults correctly", func() { - llmArgs := args.LLMProcessorArguments{ - DatasetId: "ds1", - Prompt: "p", - MaxTokens: 0, // default applied in To* - Temperature: "", - } - req := llmArgs.ToLLMProcessorRequest() - Expect(req.InputDatasetId).To(Equal("ds1")) - Expect(req.Prompt).To(Equal("p")) - Expect(req.MaxTokens).To(Equal(0)) - Expect(req.Temperature).To(Equal("")) - Expect(req.MultipleColumns).To(BeFalse()) - Expect(req.Model).To(Equal("gemini-1.5-flash-8b")) - }) - - It("should map fields correctly when set", func() { + It("should map request fields to actor request fields", func() { llmArgs := args.LLMProcessorArguments{ DatasetId: "ds1", Prompt: "p", MaxTokens: 42, - Temperature: "0.7", + Temperature: 0.7, } req := llmArgs.ToLLMProcessorRequest() Expect(req.InputDatasetId).To(Equal("ds1")) Expect(req.Prompt).To(Equal("p")) - Expect(req.MaxTokens).To(Equal(42)) + Expect(req.MaxTokens).To(Equal(uint(42))) Expect(req.Temperature).To(Equal("0.7")) Expect(req.MultipleColumns).To(BeFalse()) Expect(req.Model).To(Equal("gemini-1.5-flash-8b")) diff --git a/types/llm.go b/types/llm.go index fb67693..ca99075 100644 --- a/types/llm.go +++ b/types/llm.go @@ -5,9 +5,9 @@ type LLMProcessorRequest struct { LLMProviderApiKey string `json:"llmProviderApiKey"` // encrypted api key by miner Model string `json:"model"` MultipleColumns bool `json:"multipleColumns"` - Prompt string `json:"prompt"` // example: summarize the content of this webpage: ${markdown} - Temperature string `json:"temperature"` - MaxTokens int `json:"maxTokens"` + Prompt string `json:"prompt"` // example: summarize the content of this webpage: ${markdown} + Temperature string `json:"temperature"` // the actor expects a string + MaxTokens uint `json:"maxTokens"` } type LLMProcessorResult struct { From deaef62468fb3f334e7b305f08155263dd76581b Mon Sep 17 00:00:00 2001 From: Grant Foster Date: Tue, 23 Sep 2025 08:26:46 -0700 Subject: [PATCH 104/136] fix: twitter-guard-and-job-definitions (#26) * chore: add job arguments and job struct to types to prep for acceptance tests not needing ego imports * chore: add count guard on twitter args * chore: adds guards * chore: adds var for max results * fix: error message * fix: allow unmarshalling on job types * chore: adds unmarshal to job args --- args/twitter.go | 24 ++++++++++++++++++++---- types/jobs.go | 23 +++++++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/args/twitter.go b/args/twitter.go index 18c6773..abee6d4 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -2,12 +2,24 @@ package args import ( "encoding/json" + "errors" "fmt" "strings" teetypes "github.com/masa-finance/tee-types/types" ) +var ( + ErrTwitterCountNegative = errors.New("count must be 
non-negative") + ErrTwitterCountTooLarge = errors.New("count must be less than or equal to 1000") + ErrTwitterMaxResultsTooLarge = errors.New("max_results must be less than or equal to 1000") + ErrTwitterMaxResultsNegative = errors.New("max_results must be non-negative") +) + +const ( + TwitterMaxResults = 1000 +) + // TwitterSearchArguments defines args for Twitter searches type TwitterSearchArguments struct { QueryType string `json:"type"` // Optional, type of search @@ -42,13 +54,17 @@ func (t *TwitterSearchArguments) UnmarshalJSON(data []byte) error { // Validate validates the Twitter arguments (general validation) func (t *TwitterSearchArguments) Validate() error { // note, query is not required for all capabilities - if t.Count < 0 { - return fmt.Errorf("count must be non-negative, got: %d", t.Count) + return fmt.Errorf("%w, got: %d", ErrTwitterCountNegative, t.Count) + } + if t.Count > TwitterMaxResults { + return fmt.Errorf("%w, got: %d", ErrTwitterCountTooLarge, t.Count) } - if t.MaxResults < 0 { - return fmt.Errorf("max_results must be non-negative, got: %d", t.MaxResults) + return fmt.Errorf("%w, got: %d", ErrTwitterMaxResultsNegative, t.MaxResults) + } + if t.MaxResults > TwitterMaxResults { + return fmt.Errorf("%w, got: %d", ErrTwitterMaxResultsTooLarge, t.MaxResults) } return nil diff --git a/types/jobs.go b/types/jobs.go index 1a7d46b..753a970 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -1,13 +1,36 @@ package types import ( + "encoding/json" "fmt" "slices" + "time" "github.com/masa-finance/tee-types/pkg/util" ) type JobType string + +type JobArguments map[string]interface{} + +func (j JobArguments) Unmarshal(i interface{}) error { + d, err := json.Marshal(j) + if err != nil { + return err + } + return json.Unmarshal(d, i) +} + +type Job struct { + Type JobType `json:"type"` + Arguments JobArguments `json:"arguments"` + UUID string `json:"-"` + Nonce string `json:"quote"` + WorkerID string `json:"worker_id"` + TargetWorker string `json:"target_worker"` + Timeout time.Duration `json:"timeout"` +} + type Capability string type WorkerCapabilities map[JobType][]Capability From 791b00385d271185cb5adb1d5f42f29a10e39004 Mon Sep 17 00:00:00 2001 From: Grant Foster Date: Tue, 30 Sep 2025 09:00:39 -0700 Subject: [PATCH 105/136] fix: default model (#27) * fix: default model * feat: support many models * fix: be more idiomatic * chore: add dynamic key * chore: use util set * chore: fix llm test * chore: add key test --- args/llm.go | 30 +++++++++++++++++++++--------- args/llm_test.go | 6 ++++-- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/args/llm.go b/args/llm.go index 2f5c8c6..d3e4ac8 100644 --- a/args/llm.go +++ b/args/llm.go @@ -6,6 +6,7 @@ import ( "fmt" "strconv" + "github.com/masa-finance/tee-types/pkg/util" teetypes "github.com/masa-finance/tee-types/types" ) @@ -18,10 +19,13 @@ const ( LLMDefaultMaxTokens uint = 300 LLMDefaultTemperature float64 = 0.1 LLMDefaultMultipleColumns bool = false - LLMDefaultModel string = "gemini-1.5-flash-8b" + LLMDefaultGeminiModel string = "gemini-1.5-flash-8b" + LLMDefaultClaudeModel string = "claude-3-5-haiku-latest" LLMDefaultItems uint = 1 ) +var SupportedModels = util.NewSet(LLMDefaultGeminiModel, LLMDefaultClaudeModel) + type LLMProcessorArguments struct { DatasetId string `json:"dataset_id"` Prompt string `json:"prompt"` @@ -71,13 +75,21 @@ func (l *LLMProcessorArguments) Validate() error { return nil } -func (l LLMProcessorArguments) ToLLMProcessorRequest() teetypes.LLMProcessorRequest { - return 
teetypes.LLMProcessorRequest{ - InputDatasetId: l.DatasetId, - Prompt: l.Prompt, - MaxTokens: l.MaxTokens, - Temperature: strconv.FormatFloat(l.Temperature, 'f', -1, 64), - MultipleColumns: LLMDefaultMultipleColumns, // overrides default in actor API - Model: LLMDefaultModel, // overrides default in actor API +func (l LLMProcessorArguments) ToLLMProcessorRequest(model string, key string) (teetypes.LLMProcessorRequest, error) { + if !SupportedModels.Contains(model) { + return teetypes.LLMProcessorRequest{}, fmt.Errorf("model %s is not supported", model) + } + if key == "" { + return teetypes.LLMProcessorRequest{}, fmt.Errorf("key is required") } + + return teetypes.LLMProcessorRequest{ + InputDatasetId: l.DatasetId, + LLMProviderApiKey: key, + Prompt: l.Prompt, + MaxTokens: l.MaxTokens, + Temperature: strconv.FormatFloat(l.Temperature, 'f', -1, 64), + MultipleColumns: LLMDefaultMultipleColumns, // overrides default in actor API + Model: model, // overrides default in actor API + }, nil } diff --git a/args/llm_test.go b/args/llm_test.go index aa35128..a9b02c2 100644 --- a/args/llm_test.go +++ b/args/llm_test.go @@ -100,13 +100,15 @@ var _ = Describe("LLMProcessorArguments", func() { MaxTokens: 42, Temperature: 0.7, } - req := llmArgs.ToLLMProcessorRequest() + req, err := llmArgs.ToLLMProcessorRequest(args.LLMDefaultGeminiModel, "api-key") + Expect(err).ToNot(HaveOccurred()) Expect(req.InputDatasetId).To(Equal("ds1")) Expect(req.Prompt).To(Equal("p")) Expect(req.MaxTokens).To(Equal(uint(42))) Expect(req.Temperature).To(Equal("0.7")) Expect(req.MultipleColumns).To(BeFalse()) - Expect(req.Model).To(Equal("gemini-1.5-flash-8b")) + Expect(req.Model).To(Equal(args.LLMDefaultGeminiModel)) + Expect(req.LLMProviderApiKey).To(Equal("api-key")) }) }) }) From 9a8730b0f3721846df408538427727b3c750197e Mon Sep 17 00:00:00 2001 From: Grant Foster Date: Thu, 9 Oct 2025 14:36:22 -0700 Subject: [PATCH 106/136] feat: linkedin (#30) * feat: linkedin * fix: validation tests * chore: fix unmarshalling tests * chore: adds basic linkedin type tests * chore: cleanup type tests for linkedin * chore: cleanup errors * Update args/linkedin/profile/profile.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update types/linkedin/industries/industries.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update types/linkedin/industries/industries.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update types/linkedin/industries/industries.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update types/linkedin/industries/industries.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update types/linkedin/industries/industries.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update types/linkedin/industries/industries.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update types/linkedin/industries/industries.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update types/linkedin/industries/industries.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update types/linkedin/profile/profile.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * chore: fix spacing * chore: omit empty * chore: fix missing omit * fix: profile marshalling * chore: update umarshalling for mode * chore: update profile * chore: fixes test * fix: max * fix: omitempty * chore: cleanup --------- Co-authored-by: Copilot 
<175728472+Copilot@users.noreply.github.com> --- .cursor/rules/tee-types_update_plan.mdc | 48 -- .gitignore | 2 + args/args.go | 10 +- args/linkedin.go | 97 --- args/linkedin/linkedin.go | 7 + args/linkedin/profile/profile.go | 132 ++++ args/linkedin/profile/profile_suite_test.go | 13 + args/linkedin/profile/profile_test.go | 222 ++++++ args/twitter.go | 18 +- args/unmarshaller.go | 42 +- args/unmarshaller_test.go | 2 +- types/jobs.go | 8 +- types/linkedin.go | 52 -- types/linkedin/experiences/experiences.go | 41 ++ types/linkedin/functions/functions.go | 121 ++++ types/linkedin/industries/industries.go | 717 ++++++++++++++++++++ types/linkedin/linkedin.go | 25 + types/linkedin/linkedin_suite_test.go | 13 + types/linkedin/linkedin_test.go | 135 ++++ types/linkedin/profile/profile.go | 261 +++++++ types/linkedin/seniorities/seniorities.go | 61 ++ types/types.go | 7 + 22 files changed, 1802 insertions(+), 232 deletions(-) delete mode 100644 .cursor/rules/tee-types_update_plan.mdc delete mode 100644 args/linkedin.go create mode 100644 args/linkedin/linkedin.go create mode 100644 args/linkedin/profile/profile.go create mode 100644 args/linkedin/profile/profile_suite_test.go create mode 100644 args/linkedin/profile/profile_test.go delete mode 100644 types/linkedin.go create mode 100644 types/linkedin/experiences/experiences.go create mode 100644 types/linkedin/functions/functions.go create mode 100644 types/linkedin/industries/industries.go create mode 100644 types/linkedin/linkedin.go create mode 100644 types/linkedin/linkedin_suite_test.go create mode 100644 types/linkedin/linkedin_test.go create mode 100644 types/linkedin/profile/profile.go create mode 100644 types/linkedin/seniorities/seniorities.go create mode 100644 types/types.go diff --git a/.cursor/rules/tee-types_update_plan.mdc b/.cursor/rules/tee-types_update_plan.mdc deleted file mode 100644 index 741eea6..0000000 --- a/.cursor/rules/tee-types_update_plan.mdc +++ /dev/null @@ -1,48 +0,0 @@ ---- -description: -globs: -alwaysApply: false ---- -# tee-types: LinkedIn Data Structures Extension - -## Overview -This plan details the required changes for the `github.com/masa-finance/tee-types` repository. These changes are a prerequisite for integrating the new LinkedIn profile fetching functionality into the `tee-worker`. The goal is to extend the existing data structures to support both profile search and full profile fetching jobs. - -## ⚠️ CRITICAL REQUIREMENTS -- **BACKWARD COMPATIBILITY**: The changes must not break existing `tee-worker` functionality that relies on `searchbyquery`. -- **CONSISTENCY**: The new data structures should align with the output of the `linkedin-scraper` SDK (`v1.0.0`). -- **CLARITY**: Use clear and descriptive naming for new structs and fields. - -## Implementation Steps - -### Phase 1: Argument Structure Update - -#### Step 1.1: Extend and Rename Job Arguments -**Objective**: Create a unified argument struct that supports both search and profile fetching. -**Files**: `args/linkedin.go` -**Action**: -- Rename the existing `LinkedInSearchArguments` struct to `LinkedInArguments`. This provides a more generic name for future extensions. -- Add a new field `PublicIdentifier string `json:"public_identifier,omitempty"` to the renamed `LinkedInArguments` struct. This will be used to specify the target profile for fetching. -**Verification**: The new `LinkedInArguments` struct contains fields for both search (`Query`, `MaxResults`, etc.) and profile fetching (`PublicIdentifier`). 
-**Commit**: `feat(args): extend and rename linkedin arguments for profile fetching` - -### Phase 2: Result Structure Extension - -#### Step 2.1: Define Comprehensive Profile Result -**Objective**: Create a new struct to hold the rich data from a full profile fetch. -**Files**: `types/linkedin.go` -**Action**: -- Create a new struct `LinkedInFullProfileResult`. -- This struct should include fields for all the data provided by the scraper's `GetProfile` method, such as: - - `PublicIdentifier`, `URN`, `FullName`, `Headline`, `Location`, `Summary` - - Slices for `[]Experience`, `[]Education`, `[]Skill` - - `ProfilePictureURL` -- Define helper structs for `Experience`, `Education`, and `Skill` with relevant fields (e.g., `Title`, `CompanyName` for experience; `SchoolName`, `DegreeName` for education). -**Verification**: The `LinkedInFullProfileResult` and its nested structs are defined and compile correctly. The structure matches the expected output from the `linkedin-scraper`. -**Commit**: `feat(types): add LinkedInFullProfileResult for detailed profiles` - -## Success Criteria -- ✅ `args/linkedin.go` contains the updated `LinkedInArguments` struct. -- ✅ `types/linkedin.go` contains the new `LinkedInFullProfileResult` and its associated substructures. -- ✅ The changes are non-breaking for code that uses the old `LinkedInSearchArguments` (after a name update). -- ✅ The new structures are ready to be consumed by the `tee-worker`. diff --git a/.gitignore b/.gitignore index f5f7bd6..cffb9a7 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ go.work # LLM-related files .aider* GEMINI.md + +/examples/*json \ No newline at end of file diff --git a/args/args.go b/args/args.go index ab7abe4..f090037 100644 --- a/args/args.go +++ b/args/args.go @@ -3,13 +3,17 @@ package args import ( "encoding/json" "fmt" - "strings" + + "github.com/masa-finance/tee-types/args/linkedin" + teetypes "github.com/masa-finance/tee-types/types" ) +type LinkedInProfileArguments = linkedin.ProfileArguments + // QueryTypeArgument provides a minimal structure to extract the QueryType (json "type") // This is used across different job types to determine the specific capability being requested type QueryTypeArgument struct { - QueryType string `json:"type"` + QueryType teetypes.Capability `json:"type"` } // UnmarshalJSON implements custom JSON unmarshaling with normalization @@ -20,6 +24,6 @@ func (q *QueryTypeArgument) UnmarshalJSON(data []byte) error { if err := json.Unmarshal(data, aux); err != nil { return fmt.Errorf("failed to unmarshal QueryType arguments: %w", err) } - q.QueryType = strings.ToLower(aux.QueryType) + q.QueryType = aux.QueryType return nil } diff --git a/args/linkedin.go b/args/linkedin.go deleted file mode 100644 index dc3ba93..0000000 --- a/args/linkedin.go +++ /dev/null @@ -1,97 +0,0 @@ -package args - -import ( - "encoding/json" - "fmt" - "strings" - - "github.com/masa-finance/tee-types/pkg/util" - teetypes "github.com/masa-finance/tee-types/types" -) - -// LinkedInArguments defines args for LinkedIn operations -type LinkedInArguments struct { - QueryType string `json:"type"` // "searchbyquery", "getprofile" - Query string `json:"query"` // Keywords for search or username for profile - PublicIdentifier string `json:"public_identifier,omitempty"` - NetworkFilters []string `json:"network_filters,omitempty"` // ["F", "S", "O"] - First, Second, Other (default: all) - MaxResults int `json:"max_results"` // Maximum number of results to return - Start int `json:"start"` // Pagination start offset -} - -// 
UnmarshalJSON implements custom JSON unmarshaling with validation -func (l *LinkedInArguments) UnmarshalJSON(data []byte) error { - // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) - type Alias LinkedInArguments - aux := &struct { - *Alias - }{ - Alias: (*Alias)(l), - } - - if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal LinkedIn arguments: %w", err) - } - - // Normalize QueryType to lowercase - l.QueryType = strings.ToLower(l.QueryType) - - return l.Validate() -} - -// Validate validates the LinkedIn arguments (general validation) -func (l *LinkedInArguments) Validate() error { - // Note: QueryType is not required for all capabilities, similar to Twitter pattern - // Query is also not required for all capabilities - - if l.MaxResults < 0 { - return fmt.Errorf("max_results must be non-negative, got: %d", l.MaxResults) - } - - if l.Start < 0 { - return fmt.Errorf("start must be non-negative, got: %d", l.Start) - } - - return nil -} - -// ValidateForJobType validates LinkedIn arguments for a specific job type -func (l *LinkedInArguments) ValidateForJobType(jobType teetypes.JobType) error { - if err := l.Validate(); err != nil { - return err - } - - // Validate QueryType against job-specific capabilities - return jobType.ValidateCapability(teetypes.Capability(l.QueryType)) -} - -// GetCapability returns the QueryType as a typed Capability -func (l *LinkedInArguments) GetCapability() teetypes.Capability { - return teetypes.Capability(l.QueryType) -} - -// IsSearchOperation returns true if this is a search operation -func (l *LinkedInArguments) IsSearchOperation() bool { - capability := l.GetCapability() - return capability == teetypes.CapSearchByQuery -} - -// IsProfileOperation returns true if this is a profile operation -func (l *LinkedInArguments) IsProfileOperation() bool { - capability := l.GetCapability() - return capability == teetypes.CapGetProfile -} - -// HasNetworkFilters returns true if network filters are specified -func (l *LinkedInArguments) HasNetworkFilters() bool { - return len(l.NetworkFilters) > 0 -} - -// GetEffectiveMaxResults returns the effective maximum results, defaulting to a reasonable limit -func (l *LinkedInArguments) GetEffectiveMaxResults() int { - return util.Max(l.MaxResults, 10) -} - -// LinkedInSearchArguments is an alias for LinkedInArguments for backward compatibility. -// Deprecated: use LinkedInArguments instead. 
-type LinkedInSearchArguments = LinkedInArguments
diff --git a/args/linkedin/linkedin.go b/args/linkedin/linkedin.go
new file mode 100644
index 0000000..92ed3e7
--- /dev/null
+++ b/args/linkedin/linkedin.go
@@ -0,0 +1,7 @@
+package linkedin
+
+import (
+	"github.com/masa-finance/tee-types/args/linkedin/profile"
+)
+
+type ProfileArguments = profile.Arguments
diff --git a/args/linkedin/profile/profile.go b/args/linkedin/profile/profile.go
new file mode 100644
index 0000000..399caa0
--- /dev/null
+++ b/args/linkedin/profile/profile.go
@@ -0,0 +1,132 @@
+package profile
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+
+	teetypes "github.com/masa-finance/tee-types/types"
+	"github.com/masa-finance/tee-types/types/linkedin/experiences"
+	"github.com/masa-finance/tee-types/types/linkedin/functions"
+	"github.com/masa-finance/tee-types/types/linkedin/industries"
+	"github.com/masa-finance/tee-types/types/linkedin/profile"
+	"github.com/masa-finance/tee-types/types/linkedin/seniorities"
+)
+
+var (
+	ErrScraperModeNotSupported = errors.New("scraper mode not supported")
+	ErrMaxItemsTooLarge        = errors.New("max items must be less than or equal to 1000")
+	ErrExperienceNotSupported  = errors.New("years of experience not supported")
+	ErrSeniorityNotSupported   = errors.New("seniority level not supported")
+	ErrFunctionNotSupported    = errors.New("function not supported")
+	ErrIndustryNotSupported    = errors.New("industry not supported")
+)
+
+const (
+	DefaultMaxItems    = 10
+	DefaultScraperMode = profile.ScraperModeShort
+	MaxItems           = 1000 // 2500 on the actor, but we will run over 1MB memory limit on responses
+)
+
+// Arguments defines args for LinkedIn profile operations
+type Arguments struct {
+	QueryType             teetypes.Capability `json:"type"`
+	ScraperMode           profile.ScraperMode `json:"profileScraperMode"`
+	Query                 string              `json:"searchQuery"`
+	MaxItems              uint                `json:"maxItems"`
+	Locations             []string            `json:"locations,omitempty"`
+	CurrentCompanies      []string            `json:"currentCompanies,omitempty"`
+	PastCompanies         []string            `json:"pastCompanies,omitempty"`
+	CurrentJobTitles      []string            `json:"currentJobTitles,omitempty"`
+	PastJobTitles         []string            `json:"pastJobTitles,omitempty"`
+	Schools               []string            `json:"schools,omitempty"`
+	YearsOfExperience     []experiences.Id    `json:"yearsOfExperienceIds,omitempty"`
+	YearsAtCurrentCompany []experiences.Id    `json:"yearsAtCurrentCompanyIds,omitempty"`
+	SeniorityLevels       []seniorities.Id    `json:"seniorityLevelIds,omitempty"`
+	Functions             []functions.Id      `json:"functionIds,omitempty"`
+	Industries            []industries.Id     `json:"industryIds,omitempty"`
+	FirstNames            []string            `json:"firstNames,omitempty"`
+	LastNames             []string            `json:"lastNames,omitempty"`
+	RecentlyChangedJobs   bool                `json:"recentlyChangedJobs,omitempty"`
+	StartPage             uint                `json:"startPage,omitempty"`
+}
+
+func (a *Arguments) UnmarshalJSON(data []byte) error {
+	type Alias Arguments
+	aux := &struct {
+		*Alias
+	}{
+		Alias: (*Alias)(a),
+	}
+
+	if err := json.Unmarshal(data, aux); err != nil {
+		return fmt.Errorf("failed to unmarshal LinkedIn profile arguments: %w", err)
+	}
+
+	a.setDefaultValues()
+
+	return a.Validate()
+}
+
+func (a *Arguments) setDefaultValues() {
+	if a.MaxItems == 0 {
+		a.MaxItems = DefaultMaxItems
+	}
+	if a.ScraperMode == "" {
+		a.ScraperMode = DefaultScraperMode
+	}
+}
+
+func (a *Arguments) Validate() error {
+	var errs []error
+
+	if a.MaxItems > MaxItems {
+		errs = append(errs, ErrMaxItemsTooLarge)
+	}
+	if !profile.AllScraperModes.Contains(a.ScraperMode) {
+		errs = append(errs, ErrScraperModeNotSupported)
+	}
+	for _,
yoe := range a.YearsOfExperience { + if !experiences.All.Contains(yoe) { + errs = append(errs, fmt.Errorf("%w: %v", ErrExperienceNotSupported, yoe)) + } + } + for _, yac := range a.YearsAtCurrentCompany { + if !experiences.All.Contains(yac) { + errs = append(errs, fmt.Errorf("%w: %v", ErrExperienceNotSupported, yac)) + } + } + for _, sl := range a.SeniorityLevels { + if !seniorities.All.Contains(sl) { + errs = append(errs, fmt.Errorf("%w: %v", ErrSeniorityNotSupported, sl)) + } + } + for _, f := range a.Functions { + if !functions.All.Contains(f) { + errs = append(errs, fmt.Errorf("%w: %v", ErrFunctionNotSupported, f)) + } + } + for _, i := range a.Industries { + if !industries.All.Contains(i) { + errs = append(errs, fmt.Errorf("%w: %v", ErrIndustryNotSupported, i)) + } + } + + if len(errs) > 0 { + return errors.Join(errs...) + } + + return nil +} + +func (a *Arguments) GetCapability() teetypes.Capability { + return a.QueryType +} + +func (a *Arguments) ValidateForJobType(jobType teetypes.JobType) error { + if err := a.Validate(); err != nil { + return err + } + + return jobType.ValidateCapability(a.QueryType) +} diff --git a/args/linkedin/profile/profile_suite_test.go b/args/linkedin/profile/profile_suite_test.go new file mode 100644 index 0000000..713e96d --- /dev/null +++ b/args/linkedin/profile/profile_suite_test.go @@ -0,0 +1,13 @@ +package profile_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestArgs(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Args Suite") +} diff --git a/args/linkedin/profile/profile_test.go b/args/linkedin/profile/profile_test.go new file mode 100644 index 0000000..6c9e5de --- /dev/null +++ b/args/linkedin/profile/profile_test.go @@ -0,0 +1,222 @@ +package profile_test + +import ( + "encoding/json" + "errors" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/args" + "github.com/masa-finance/tee-types/args/linkedin/profile" + "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-types/types/linkedin/experiences" + "github.com/masa-finance/tee-types/types/linkedin/functions" + "github.com/masa-finance/tee-types/types/linkedin/industries" + profiletypes "github.com/masa-finance/tee-types/types/linkedin/profile" + "github.com/masa-finance/tee-types/types/linkedin/seniorities" +) + +var _ = Describe("LinkedIn Profile Arguments", func() { + Describe("Marshalling and unmarshalling", func() { + It("should set default values", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + } + jsonData, err := json.Marshal(args) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.MaxItems).To(Equal(uint(10))) + Expect(args.ScraperMode).To(Equal(profiletypes.ScraperModeShort)) + }) + + It("should override default values", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + MaxItems: 50, + ScraperMode: profiletypes.ScraperModeFull, + } + jsonData, err := json.Marshal(args) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.MaxItems).To(Equal(uint(50))) + Expect(args.ScraperMode).To(Equal(profiletypes.ScraperModeFull)) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid arguments", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: profiletypes.ScraperModeShort, + MaxItems: 10, + YearsOfExperience: []experiences.Id{experiences.ThreeToFiveYears}, + SeniorityLevels: []seniorities.Id{seniorities.Senior}, + Functions: []functions.Id{functions.Engineering}, + Industries: []industries.Id{industries.SoftwareDevelopment}, + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail with max items too large", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: profiletypes.ScraperModeShort, + MaxItems: 1500, + } + err := args.Validate() + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, profile.ErrMaxItemsTooLarge)).To(BeTrue()) + }) + + It("should fail with invalid scraper mode", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: "InvalidMode", + MaxItems: 10, + } + err := args.Validate() + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, profile.ErrScraperModeNotSupported)).To(BeTrue()) + }) + + It("should fail with invalid years of experience", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: profiletypes.ScraperModeShort, + MaxItems: 10, + YearsOfExperience: []experiences.Id{"invalid"}, + } + err := args.Validate() + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, profile.ErrExperienceNotSupported)).To(BeTrue()) + + }) + + It("should fail with invalid years at current company", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: profiletypes.ScraperModeShort, + MaxItems: 10, + YearsAtCurrentCompany: 
[]experiences.Id{"invalid"}, + } + err := args.Validate() + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, profile.ErrExperienceNotSupported)).To(BeTrue()) + + }) + + It("should fail with invalid seniority level", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: profiletypes.ScraperModeShort, + MaxItems: 10, + SeniorityLevels: []seniorities.Id{"invalid"}, + } + err := args.Validate() + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, profile.ErrSeniorityNotSupported)).To(BeTrue()) + }) + + It("should fail with invalid function", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: profiletypes.ScraperModeShort, + MaxItems: 10, + Functions: []functions.Id{"invalid"}, + } + err := args.Validate() + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, profile.ErrFunctionNotSupported)).To(BeTrue()) + + }) + + It("should fail with invalid industry", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: profiletypes.ScraperModeShort, + MaxItems: 10, + Industries: []industries.Id{"invalid"}, + } + err := args.Validate() + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, profile.ErrIndustryNotSupported)).To(BeTrue()) + + }) + + It("should handle multiple validation errors", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: "InvalidMode", + MaxItems: 1500, + YearsOfExperience: []experiences.Id{"invalid"}, + SeniorityLevels: []seniorities.Id{"invalid"}, + } + err := args.Validate() + Expect(err).To(HaveOccurred()) + // Should contain multiple error messages + Expect(errors.Is(err, profile.ErrMaxItemsTooLarge)).To(BeTrue()) + Expect(errors.Is(err, profile.ErrScraperModeNotSupported)).To(BeTrue()) + Expect(errors.Is(err, profile.ErrExperienceNotSupported)).To(BeTrue()) + Expect(errors.Is(err, profile.ErrSeniorityNotSupported)).To(BeTrue()) + }) + }) + + Describe("GetCapability", func() { + It("should return the query type", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + } + Expect(args.GetCapability()).To(Equal(types.CapSearchByProfile)) + }) + }) + + Describe("ValidateForJobType", func() { + It("should succeed with valid job type and capability", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: profiletypes.ScraperModeShort, + MaxItems: 10, + } + err := args.ValidateForJobType(types.LinkedInJob) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail with invalid job type", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByQuery, + Query: "software engineer", + ScraperMode: profiletypes.ScraperModeShort, + MaxItems: 10, + } + err := args.ValidateForJobType(types.LinkedInJob) + Expect(err).To(HaveOccurred()) + }) + + It("should fail if base validation fails", func() { + args := args.LinkedInProfileArguments{ + QueryType: types.CapSearchByProfile, + Query: "software engineer", + ScraperMode: "InvalidMode", + MaxItems: 10, + } + err := args.ValidateForJobType(types.LinkedInJob) + Expect(err).To(HaveOccurred()) + }) + }) +}) diff --git a/args/twitter.go b/args/twitter.go index abee6d4..6c08f65 100644 --- a/args/twitter.go +++ b/args/twitter.go @@ -4,7 +4,6 @@ import ( "encoding/json" "errors" "fmt" 
- "strings" teetypes "github.com/masa-finance/tee-types/types" ) @@ -22,13 +21,13 @@ const ( // TwitterSearchArguments defines args for Twitter searches type TwitterSearchArguments struct { - QueryType string `json:"type"` // Optional, type of search - Query string `json:"query"` // Username or search query - Count int `json:"count"` - StartTime string `json:"start_time"` // Optional ISO timestamp - EndTime string `json:"end_time"` // Optional ISO timestamp - MaxResults int `json:"max_results"` // Optional, max number of results - NextCursor string `json:"next_cursor"` + QueryType teetypes.Capability `json:"type"` // Optional, type of search + Query string `json:"query"` // Username or search query + Count int `json:"count"` + StartTime string `json:"start_time"` // Optional ISO timestamp + EndTime string `json:"end_time"` // Optional ISO timestamp + MaxResults int `json:"max_results"` // Optional, max number of results + NextCursor string `json:"next_cursor"` } // UnmarshalJSON implements custom JSON unmarshaling with validation @@ -45,9 +44,6 @@ func (t *TwitterSearchArguments) UnmarshalJSON(data []byte) error { return fmt.Errorf("failed to unmarshal Twitter arguments: %w", err) } - // Normalize QueryType to lowercase - t.QueryType = strings.ToLower(t.QueryType) - return t.Validate() } diff --git a/args/unmarshaller.go b/args/unmarshaller.go index 1d3c26d..9057885 100644 --- a/args/unmarshaller.go +++ b/args/unmarshaller.go @@ -3,7 +3,6 @@ package args import ( "encoding/json" "fmt" - "strings" "github.com/masa-finance/tee-types/types" ) @@ -55,16 +54,15 @@ func unmarshalTikTokArguments(args map[string]any) (JobArguments, error) { if err := unmarshalToStruct(args, minimal); err != nil { return nil, fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) } - capability := types.Capability(strings.ToLower(minimal.QueryType)) - if capability == types.CapEmpty { + if minimal.QueryType == types.CapEmpty { defaultCap, exists := types.JobDefaultCapabilityMap[types.TiktokJob] if !exists { return nil, fmt.Errorf("no default capability configured for job type: %s", types.TiktokJob) } - capability = defaultCap + minimal.QueryType = defaultCap } - switch capability { + switch minimal.QueryType { case types.CapSearchByQuery: searchArgs := &TikTokSearchByQueryArguments{} if err := unmarshalToStruct(args, searchArgs); err != nil { @@ -93,7 +91,7 @@ func unmarshalTikTokArguments(args map[string]any) (JobArguments, error) { } return transcriptionArgs, nil default: - return nil, fmt.Errorf("unknown tiktok type: %s", capability) + return nil, fmt.Errorf("unknown tiktok type: %s", minimal.QueryType) } } @@ -106,7 +104,7 @@ func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*Twi // If no QueryType is specified, use the default capability for this job type if twitterArgs.QueryType == "" { if defaultCap, exists := types.JobDefaultCapabilityMap[jobType]; exists { - twitterArgs.QueryType = string(defaultCap) + twitterArgs.QueryType = defaultCap } } @@ -118,25 +116,31 @@ func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*Twi return twitterArgs, nil } -func unmarshalLinkedInArguments(jobType types.JobType, args map[string]any) (*LinkedInArguments, error) { - linkedInArgs := &LinkedInArguments{} - if err := unmarshalToStruct(args, linkedInArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal LinkedIn job arguments: %w", err) +func unmarshalLinkedInArguments(jobType types.JobType, args map[string]any) (JobArguments, error) { + minimal := 
&QueryTypeArgument{} + if err := unmarshalToStruct(args, minimal); err != nil { + return nil, fmt.Errorf("failed to unmarshal LinkedIn arguments: %w", err) } - // If no QueryType is specified, use the default capability for this job type - if linkedInArgs.QueryType == "" { + if minimal.QueryType == types.CapEmpty { if defaultCap, exists := types.JobDefaultCapabilityMap[jobType]; exists { - linkedInArgs.QueryType = string(defaultCap) + minimal.QueryType = defaultCap } } - // Perform job-type-specific validation for LinkedIn - if err := linkedInArgs.ValidateForJobType(jobType); err != nil { - return nil, fmt.Errorf("linkedin job validation failed: %w", err) + switch minimal.QueryType { + case types.CapSearchByProfile: + linkedInArgs := &LinkedInProfileArguments{} + if err := unmarshalToStruct(args, linkedInArgs); err != nil { + return nil, fmt.Errorf("failed to unmarshal LinkedIn job arguments: %w", err) + } + if err := linkedInArgs.ValidateForJobType(jobType); err != nil { + return nil, fmt.Errorf("linkedin job validation failed: %w", err) + } + return linkedInArgs, nil + default: + return nil, fmt.Errorf("unknown linkedin type: %s", minimal.QueryType) } - - return linkedInArgs, nil } func unmarshalRedditArguments(jobType types.JobType, args map[string]any) (*RedditArguments, error) { diff --git a/args/unmarshaller_test.go b/args/unmarshaller_test.go index 4231cbd..03baaca 100644 --- a/args/unmarshaller_test.go +++ b/args/unmarshaller_test.go @@ -51,7 +51,7 @@ var _ = Describe("Unmarshaller", func() { Expect(err).ToNot(HaveOccurred()) twitterArgs, ok := jobArgs.(*args.TwitterSearchArguments) Expect(ok).To(BeTrue()) - Expect(twitterArgs.QueryType).To(Equal("searchbyquery")) + Expect(twitterArgs.QueryType).To(Equal(types.CapSearchByQuery)) Expect(twitterArgs.Query).To(Equal("golang")) Expect(twitterArgs.Count).To(Equal(10)) }) diff --git a/types/jobs.go b/types/jobs.go index 753a970..628cb46 100644 --- a/types/jobs.go +++ b/types/jobs.go @@ -111,7 +111,6 @@ const ( var ( AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry, CapEmpty} AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapEmpty} - AlwaysAvailableLinkedInCaps = []Capability{CapSearchByQuery, CapGetProfile, CapEmpty} // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ @@ -142,6 +141,9 @@ var ( // WebCaps are all the Web capabilities (only available with Apify) WebCaps = []Capability{CapScraper, CapEmpty} + + // LinkedInCaps are all the LinkedIn capabilities (only available with Apify) + LinkedInCaps = []Capability{CapSearchByProfile} ) // JobCapabilityMap defines which capabilities are valid for each job type @@ -163,6 +165,9 @@ var JobCapabilityMap = map[JobType][]Capability{ // Web job capabilities WebJob: WebCaps, + // LinkedIn job capabilities + LinkedInJob: LinkedInCaps, + // TikTok job capabilities TiktokJob: combineCapabilities( AlwaysAvailableTiktokCaps, @@ -186,4 +191,5 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TiktokJob: CapTranscription, RedditJob: CapScrapeUrls, TelemetryJob: CapTelemetry, + LinkedInJob: CapSearchByProfile, } diff --git a/types/linkedin.go b/types/linkedin.go deleted file mode 100644 index b5c050a..0000000 --- a/types/linkedin.go +++ /dev/null @@ -1,52 +0,0 @@ -// Package types provides shared types between tee-worker and tee-indexer -package types - -// LinkedInProfileResult defines the structure of a LinkedIn profile search result -type LinkedInProfileResult 
struct {
-	PublicIdentifier string `json:"public_identifier"` // Username/slug in profile URL
-	URN              string `json:"urn"`               // LinkedIn's unique resource name
-	FullName         string `json:"full_name"`         // Person's full name
-	Headline         string `json:"headline"`          // Professional headline/title
-	Location         string `json:"location"`          // Geographic location
-	ProfileURL       string `json:"profile_url"`       // Full LinkedIn profile URL
-	Degree           string `json:"degree,omitempty"`  // Connection degree (1st, 2nd, etc.)
-}
-
-// Experience defines the structure for a single entry in a user's work experience
-type Experience struct {
-	Title       string `json:"title"`
-	CompanyName string `json:"company_name"`
-	Location    string `json:"location,omitempty"`
-	StartDate   string `json:"start_date,omitempty"`
-	EndDate     string `json:"end_date,omitempty"`
-	Description string `json:"description,omitempty"`
-}
-
-// Education defines the structure for a single entry in a user's education history
-type Education struct {
-	SchoolName   string `json:"school_name"`
-	DegreeName   string `json:"degree_name,omitempty"`
-	FieldOfStudy string `json:"field_of_study,omitempty"`
-	StartDate    string `json:"start_date,omitempty"`
-	EndDate      string `json:"end_date,omitempty"`
-	Description  string `json:"description,omitempty"`
-}
-
-// Skill defines the structure for a single skill entry
-type Skill struct {
-	Name string `json:"name"`
-}
-
-// LinkedInFullProfileResult defines the structure for a detailed LinkedIn profile
-type LinkedInFullProfileResult struct {
-	PublicIdentifier  string       `json:"public_identifier"`
-	URN               string       `json:"urn"`
-	FullName          string       `json:"full_name"`
-	Headline          string       `json:"headline"`
-	Location          string       `json:"location"`
-	Summary           string       `json:"summary,omitempty"`
-	ProfilePictureURL string       `json:"profile_picture_url,omitempty"`
-	Experiences       []Experience `json:"experiences,omitempty"`
-	Education         []Education  `json:"education,omitempty"`
-	Skills            []Skill      `json:"skills,omitempty"`
-}
diff --git a/types/linkedin/experiences/experiences.go b/types/linkedin/experiences/experiences.go
new file mode 100644
index 0000000..e714b58
--- /dev/null
+++ b/types/linkedin/experiences/experiences.go
@@ -0,0 +1,41 @@
+package experiences
+
+import "github.com/masa-finance/tee-types/pkg/util"
+
+// Id represents a LinkedIn experience level identifier
+type Id string
+
+// Experience level constants
+const (
+	LessThanAYear    Id = "1"
+	OneToTwoYears    Id = "2"
+	ThreeToFiveYears Id = "3"
+	SixToTenYears    Id = "4"
+	MoreThanTenYears Id = "5"
+)
+
+var All = util.NewSet(
+	LessThanAYear,
+	OneToTwoYears,
+	ThreeToFiveYears,
+	SixToTenYears,
+	MoreThanTenYears,
+)
+
+type ExperiencesConfig struct {
+	All              util.Set[Id]
+	LessThanAYear    Id
+	OneToTwoYears    Id
+	ThreeToFiveYears Id
+	SixToTenYears    Id
+	MoreThanTenYears Id
+}
+
+var Experiences = ExperiencesConfig{
+	All:              *All,
+	LessThanAYear:    LessThanAYear,
+	OneToTwoYears:    OneToTwoYears,
+	ThreeToFiveYears: ThreeToFiveYears,
+	SixToTenYears:    SixToTenYears,
+	MoreThanTenYears: MoreThanTenYears,
+}
diff --git a/types/linkedin/functions/functions.go b/types/linkedin/functions/functions.go
new file mode 100644
index 0000000..cd08f33
--- /dev/null
+++ b/types/linkedin/functions/functions.go
@@ -0,0 +1,121 @@
+package functions
+
+import "github.com/masa-finance/tee-types/pkg/util"
+
+// Id represents a LinkedIn function identifier
+type Id string
+
+// Function constants
+const (
+	Accounting                 Id = "1"
+	Administrative             Id = "2"
+	ArtsAndDesign              Id = "3"
+	BusinessDevelopment        Id = "4"
+	CommunityAndSocialServices Id = 
"5" + Consulting Id = "6" + Education Id = "7" + Engineering Id = "8" + Entrepreneurship Id = "9" + Finance Id = "10" + HealthcareServices Id = "11" + HumanResources Id = "12" + InformationTechnology Id = "13" + Legal Id = "14" + Marketing Id = "15" + MediaAndCommunication Id = "16" + MilitaryAndProtectiveServices Id = "17" + Operations Id = "18" + ProductManagement Id = "19" + ProgramAndProjectManagement Id = "20" + Purchasing Id = "21" + QualityAssurance Id = "22" + RealEstate Id = "23" + Research Id = "24" + Sales Id = "25" +) + +var All = util.NewSet( + Accounting, + Administrative, + ArtsAndDesign, + BusinessDevelopment, + CommunityAndSocialServices, + Consulting, + Education, + Engineering, + Entrepreneurship, + Finance, + HealthcareServices, + HumanResources, + InformationTechnology, + Legal, + Marketing, + MediaAndCommunication, + MilitaryAndProtectiveServices, + Operations, + ProductManagement, + ProgramAndProjectManagement, + Purchasing, + QualityAssurance, + RealEstate, + Research, + Sales, +) + +type FunctionsConfig struct { + All util.Set[Id] + Accounting Id + Administrative Id + ArtsAndDesign Id + BusinessDevelopment Id + CommunityAndSocialServices Id + Consulting Id + Education Id + Engineering Id + Entrepreneurship Id + Finance Id + HealthcareServices Id + HumanResources Id + InformationTechnology Id + Legal Id + Marketing Id + MediaAndCommunication Id + MilitaryAndProtectiveServices Id + Operations Id + ProductManagement Id + ProgramAndProjectManagement Id + Purchasing Id + QualityAssurance Id + RealEstate Id + Research Id + Sales Id +} + +var Functions = FunctionsConfig{ + All: *All, + Accounting: Accounting, + Administrative: Administrative, + ArtsAndDesign: ArtsAndDesign, + BusinessDevelopment: BusinessDevelopment, + CommunityAndSocialServices: CommunityAndSocialServices, + Consulting: Consulting, + Education: Education, + Engineering: Engineering, + Entrepreneurship: Entrepreneurship, + Finance: Finance, + HealthcareServices: HealthcareServices, + HumanResources: HumanResources, + InformationTechnology: InformationTechnology, + Legal: Legal, + Marketing: Marketing, + MediaAndCommunication: MediaAndCommunication, + MilitaryAndProtectiveServices: MilitaryAndProtectiveServices, + Operations: Operations, + ProductManagement: ProductManagement, + ProgramAndProjectManagement: ProgramAndProjectManagement, + Purchasing: Purchasing, + QualityAssurance: QualityAssurance, + RealEstate: RealEstate, + Research: Research, + Sales: Sales, +} diff --git a/types/linkedin/industries/industries.go b/types/linkedin/industries/industries.go new file mode 100644 index 0000000..772ad9d --- /dev/null +++ b/types/linkedin/industries/industries.go @@ -0,0 +1,717 @@ +package industries + +import "github.com/masa-finance/tee-types/pkg/util" + +// Id represents a LinkedIn industry identifier +type Id string + +// Industry constants +const ( + // Technology & Software + SoftwareDevelopment Id = "4" + ComputerHardwareManufacturing Id = "3" + ComputerNetworkingProducts Id = "5" + ItServicesAndItConsulting Id = "96" + ComputerAndNetworkSecurity Id = "118" + Telecommunications Id = "8" + WirelessServices Id = "119" + TechnologyInformationAndInternet Id = "6" + DataInfrastructureAndAnalytics Id = "2458" + InformationServices Id = "84" + InternetPublishing Id = "3132" + SocialNetworkingPlatforms Id = "3127" + ComputerGames Id = "109" + MobileGamingApps Id = "3131" + BlockchainServices Id = "3134" + BusinessIntelligencePlatforms Id = "3128" + + // Financial Services + FinancialServices Id = "43" + Banking 
Id = "41" + Insurance Id = "42" + InvestmentBanking Id = "45" + CapitalMarkets Id = "129" + VentureCapitalAndPrivateEquityPrincipals Id = "106" + SecuritiesAndCommodityExchanges Id = "1713" + FundsAndTrusts Id = "1742" + + // Healthcare & Medical + Hospitals Id = "2081" + MedicalPractices Id = "13" + MedicalEquipmentManufacturing Id = "17" + PublicHealth Id = "2358" + VeterinaryServices Id = "16" + BiotechnologyResearch Id = "12" + + // Manufacturing + Manufacturing Id = "25" + ComputersAndElectronicsManufacturing Id = "24" + SemiconductorManufacturing Id = "7" + MachineryManufacturing Id = "55" + IndustrialMachineryManufacturing Id = "135" + FoodAndBeverageManufacturing Id = "23" + TextileManufacturing Id = "60" + MotorVehicleManufacturing Id = "53" + MotorVehiclePartsManufacturing Id = "1042" + AviationAndAerospaceComponentManufacturing Id = "52" + DefenseAndSpaceManufacturing Id = "1" + PlasticsManufacturing Id = "117" + RubberProductsManufacturing Id = "763" + PaperAndForestProductManufacturing Id = "61" + WoodProductManufacturing Id = "784" + FurnitureAndHomeFurnishingsManufacturing Id = "26" + SportingGoodsManufacturing Id = "20" + PrintingServices Id = "83" + + // Retail & Consumer Goods + Retail Id = "27" + RetailGroceries Id = "22" + OnlineAndMailOrderRetail Id = "1445" + RetailApparelAndFashion Id = "19" + RetailAppliancesElectricalAndElectronicEquipment Id = "1319" + RetailBooksAndPrintedNews Id = "1409" + RetailBuildingMaterialsAndGardenEquipment Id = "1324" + RetailFurnitureAndHomeFurnishings Id = "1309" + RetailHealthAndPersonalCareProducts Id = "1359" + RetailLuxuryGoodsAndJewelry Id = "143" + RetailMotorVehicles Id = "1292" + RetailOfficeEquipment Id = "138" + RetailOfficeSuppliesAndGifts Id = "1424" + + // Professional Services + ProfessionalServices Id = "1810" + Accounting Id = "47" + LegalServices Id = "10" + LawPractice Id = "9" + BusinessConsultingAndServices Id = "11" + StrategicManagementServices Id = "102" + HumanResourcesServices Id = "137" + MarketingServices Id = "1862" + AdvertisingServices Id = "80" + PublicRelationsAndCommunicationsServices Id = "98" + MarketResearch Id = "97" + ArchitectureAndPlanning Id = "50" + DesignServices Id = "99" + GraphicDesign Id = "140" + InteriorDesign Id = "3126" + EngineeringServices Id = "3242" + EnvironmentalServices Id = "86" + ResearchServices Id = "70" + ThinkTanks Id = "130" + Photography Id = "136" + TranslationAndLocalization Id = "108" + WritingAndEditing Id = "103" + + // Education + Education Id = "1999" + HigherEducation Id = "68" + ProfessionalTrainingAndCoaching Id = "105" + SportsAndRecreationInstruction Id = "2027" + + // Transportation & Logistics + TransportationLogisticsSupplyChainAndStorage Id = "116" + AirlinesAndAviation Id = "94" + FreightAndPackageTransportation Id = "87" + MaritimeTransportation Id = "95" + RailTransportation Id = "1481" + TruckTransportation Id = "92" + WarehousingAndStorage Id = "93" + PostalServices Id = "1573" + + // Energy & Utilities + Utilities Id = "59" + ElectricPowerGeneration Id = "383" + RenewableEnergyPowerGeneration Id = "3240" + OilAndGas Id = "57" + Mining Id = "56" + OilGasAndMining Id = "332" + + // Media & Entertainment + TechnologyInformationAndMedia Id = "1594" + BroadcastMediaProductionAndDistribution Id = "36" + RadioAndTelevisionBroadcasting Id = "1633" + MoviesVideosAndSound Id = "35" + MediaProduction Id = "126" + SoundRecording Id = "1623" + BookAndPeriodicalPublishing Id = "82" + NewspaperPublishing Id = "81" + PeriodicalPublishing Id = "1600" + 
EntertainmentProviders Id = "28" + ArtistsAndWriters Id = "38" + Musicians Id = "115" + + // Construction & Real Estate + Construction Id = "48" + CivilEngineering Id = "51" + RealEstate Id = "44" + RealEstateAgentsAndBrokers Id = "1770" + + // Hospitality & Services + Hospitality Id = "31" + HotelsAndMotels Id = "2194" + Restaurants Id = "32" + FoodAndBeverageServices Id = "34" + TravelArrangements Id = "30" + EventsServices Id = "110" + WellnessAndFitnessServices Id = "124" + ConsumerServices Id = "91" + + // Government & Non-Profit + ArmedForces Id = "71" + GovernmentRelationsServices Id = "148" + NonProfitOrganizations Id = "100" + CivicAndSocialOrganizations Id = "90" + PoliticalOrganizations Id = "107" + ProfessionalOrganizations Id = "1911" + Fundraising Id = "101" + + // Wholesale & Distribution + Wholesale Id = "133" + WholesaleImportAndExport Id = "134" + WholesaleComputerEquipment Id = "1157" + WholesaleFoodAndBeverage Id = "1231" + WholesaleBuildingMaterials Id = "49" + WholesaleMachinery Id = "1187" + WholesaleMotorVehiclesAndParts Id = "1128" + + // Other Services + StaffingAndRecruiting Id = "104" + ExecutiveSearchServices Id = "1923" + OfficeAdministration Id = "1916" + SecurityAndInvestigations Id = "121" + EquipmentRentalServices Id = "1779" + Libraries Id = "85" +) + +var All = util.NewSet( + // Technology & Software + SoftwareDevelopment, + ComputerHardwareManufacturing, + ComputerNetworkingProducts, + ItServicesAndItConsulting, + ComputerAndNetworkSecurity, + Telecommunications, + WirelessServices, + TechnologyInformationAndInternet, + DataInfrastructureAndAnalytics, + InformationServices, + InternetPublishing, + SocialNetworkingPlatforms, + ComputerGames, + MobileGamingApps, + BlockchainServices, + BusinessIntelligencePlatforms, + + // Financial Services + FinancialServices, + Banking, + Insurance, + InvestmentBanking, + CapitalMarkets, + VentureCapitalAndPrivateEquityPrincipals, + SecuritiesAndCommodityExchanges, + FundsAndTrusts, + + // Healthcare & Medical + Hospitals, + MedicalPractices, + MedicalEquipmentManufacturing, + PublicHealth, + VeterinaryServices, + BiotechnologyResearch, + + // Manufacturing + Manufacturing, + ComputersAndElectronicsManufacturing, + SemiconductorManufacturing, + MachineryManufacturing, + IndustrialMachineryManufacturing, + FoodAndBeverageManufacturing, + TextileManufacturing, + MotorVehicleManufacturing, + MotorVehiclePartsManufacturing, + AviationAndAerospaceComponentManufacturing, + DefenseAndSpaceManufacturing, + PlasticsManufacturing, + RubberProductsManufacturing, + PaperAndForestProductManufacturing, + WoodProductManufacturing, + FurnitureAndHomeFurnishingsManufacturing, + SportingGoodsManufacturing, + PrintingServices, + + // Retail & Consumer Goods + Retail, + RetailGroceries, + OnlineAndMailOrderRetail, + RetailApparelAndFashion, + RetailAppliancesElectricalAndElectronicEquipment, + RetailBooksAndPrintedNews, + RetailBuildingMaterialsAndGardenEquipment, + RetailFurnitureAndHomeFurnishings, + RetailHealthAndPersonalCareProducts, + RetailLuxuryGoodsAndJewelry, + RetailMotorVehicles, + RetailOfficeEquipment, + RetailOfficeSuppliesAndGifts, + + // Professional Services + ProfessionalServices, + Accounting, + LegalServices, + LawPractice, + BusinessConsultingAndServices, + StrategicManagementServices, + HumanResourcesServices, + MarketingServices, + AdvertisingServices, + PublicRelationsAndCommunicationsServices, + MarketResearch, + ArchitectureAndPlanning, + DesignServices, + GraphicDesign, + InteriorDesign, + EngineeringServices, 
+ EnvironmentalServices, + ResearchServices, + ThinkTanks, + Photography, + TranslationAndLocalization, + WritingAndEditing, + + // Education + Education, + HigherEducation, + ProfessionalTrainingAndCoaching, + SportsAndRecreationInstruction, + + // Transportation & Logistics + TransportationLogisticsSupplyChainAndStorage, + AirlinesAndAviation, + FreightAndPackageTransportation, + MaritimeTransportation, + RailTransportation, + TruckTransportation, + WarehousingAndStorage, + PostalServices, + + // Energy & Utilities + Utilities, + ElectricPowerGeneration, + RenewableEnergyPowerGeneration, + OilAndGas, + Mining, + OilGasAndMining, + + // Media & Entertainment + TechnologyInformationAndMedia, + BroadcastMediaProductionAndDistribution, + RadioAndTelevisionBroadcasting, + MoviesVideosAndSound, + MediaProduction, + SoundRecording, + BookAndPeriodicalPublishing, + NewspaperPublishing, + PeriodicalPublishing, + EntertainmentProviders, + ArtistsAndWriters, + Musicians, + + // Construction & Real Estate + Construction, + CivilEngineering, + RealEstate, + RealEstateAgentsAndBrokers, + + // Hospitality & Services + Hospitality, + HotelsAndMotels, + Restaurants, + FoodAndBeverageServices, + TravelArrangements, + EventsServices, + WellnessAndFitnessServices, + ConsumerServices, + + // Government & Non-Profit + ArmedForces, + GovernmentRelationsServices, + NonProfitOrganizations, + CivicAndSocialOrganizations, + PoliticalOrganizations, + ProfessionalOrganizations, + Fundraising, + + // Wholesale & Distribution + Wholesale, + WholesaleImportAndExport, + WholesaleComputerEquipment, + WholesaleFoodAndBeverage, + WholesaleBuildingMaterials, + WholesaleMachinery, + WholesaleMotorVehiclesAndParts, + + // Other Services + StaffingAndRecruiting, + ExecutiveSearchServices, + OfficeAdministration, + SecurityAndInvestigations, + EquipmentRentalServices, + Libraries, +) + +type IndustriesConfig struct { + All util.Set[Id] + // Technology & Software + SoftwareDevelopment Id + ComputerHardwareManufacturing Id + ComputerNetworkingProducts Id + ItServicesAndItConsulting Id + ComputerAndNetworkSecurity Id + Telecommunications Id + WirelessServices Id + TechnologyInformationAndInternet Id + DataInfrastructureAndAnalytics Id + InformationServices Id + InternetPublishing Id + SocialNetworkingPlatforms Id + ComputerGames Id + MobileGamingApps Id + BlockchainServices Id + BusinessIntelligencePlatforms Id + + // Financial Services + FinancialServices Id + Banking Id + Insurance Id + InvestmentBanking Id + CapitalMarkets Id + VentureCapitalAndPrivateEquityPrincipals Id + SecuritiesAndCommodityExchanges Id + FundsAndTrusts Id + + // Healthcare & Medical + Hospitals Id + MedicalPractices Id + MedicalEquipmentManufacturing Id + PublicHealth Id + VeterinaryServices Id + BiotechnologyResearch Id + + // Manufacturing + Manufacturing Id + ComputersAndElectronicsManufacturing Id + SemiconductorManufacturing Id + MachineryManufacturing Id + IndustrialMachineryManufacturing Id + FoodAndBeverageManufacturing Id + TextileManufacturing Id + MotorVehicleManufacturing Id + MotorVehiclePartsManufacturing Id + AviationAndAerospaceComponentManufacturing Id + DefenseAndSpaceManufacturing Id + PlasticsManufacturing Id + RubberProductsManufacturing Id + PaperAndForestProductManufacturing Id + WoodProductManufacturing Id + FurnitureAndHomeFurnishingsManufacturing Id + SportingGoodsManufacturing Id + PrintingServices Id + + // Retail & Consumer Goods + Retail Id + RetailGroceries Id + OnlineAndMailOrderRetail Id + RetailApparelAndFashion Id + 
RetailAppliancesElectricalAndElectronicEquipment Id + RetailBooksAndPrintedNews Id + RetailBuildingMaterialsAndGardenEquipment Id + RetailFurnitureAndHomeFurnishings Id + RetailHealthAndPersonalCareProducts Id + RetailLuxuryGoodsAndJewelry Id + RetailMotorVehicles Id + RetailOfficeEquipment Id + RetailOfficeSuppliesAndGifts Id + + // Professional Services + ProfessionalServices Id + Accounting Id + LegalServices Id + LawPractice Id + BusinessConsultingAndServices Id + StrategicManagementServices Id + HumanResourcesServices Id + MarketingServices Id + AdvertisingServices Id + PublicRelationsAndCommunicationsServices Id + MarketResearch Id + ArchitectureAndPlanning Id + DesignServices Id + GraphicDesign Id + InteriorDesign Id + EngineeringServices Id + EnvironmentalServices Id + ResearchServices Id + ThinkTanks Id + Photography Id + TranslationAndLocalization Id + WritingAndEditing Id + + // Education + Education Id + HigherEducation Id + ProfessionalTrainingAndCoaching Id + SportsAndRecreationInstruction Id + + // Transportation & Logistics + TransportationLogisticsSupplyChainAndStorage Id + AirlinesAndAviation Id + FreightAndPackageTransportation Id + MaritimeTransportation Id + RailTransportation Id + TruckTransportation Id + WarehousingAndStorage Id + PostalServices Id + + // Energy & Utilities + Utilities Id + ElectricPowerGeneration Id + RenewableEnergyPowerGeneration Id + OilAndGas Id + Mining Id + OilGasAndMining Id + + // Media & Entertainment + TechnologyInformationAndMedia Id + BroadcastMediaProductionAndDistribution Id + RadioAndTelevisionBroadcasting Id + MoviesVideosAndSound Id + MediaProduction Id + SoundRecording Id + BookAndPeriodicalPublishing Id + NewspaperPublishing Id + PeriodicalPublishing Id + EntertainmentProviders Id + ArtistsAndWriters Id + Musicians Id + + // Construction & Real Estate + Construction Id + CivilEngineering Id + RealEstate Id + RealEstateAgentsAndBrokers Id + + // Hospitality & Services + Hospitality Id + HotelsAndMotels Id + Restaurants Id + FoodAndBeverageServices Id + TravelArrangements Id + EventsServices Id + WellnessAndFitnessServices Id + ConsumerServices Id + + // Government & Non-Profit + ArmedForces Id + GovernmentRelationsServices Id + NonProfitOrganizations Id + CivicAndSocialOrganizations Id + PoliticalOrganizations Id + ProfessionalOrganizations Id + Fundraising Id + + // Wholesale & Distribution + Wholesale Id + WholesaleImportAndExport Id + WholesaleComputerEquipment Id + WholesaleFoodAndBeverage Id + WholesaleBuildingMaterials Id + WholesaleMachinery Id + WholesaleMotorVehiclesAndParts Id + + // Other Services + StaffingAndRecruiting Id + ExecutiveSearchServices Id + OfficeAdministration Id + SecurityAndInvestigations Id + EquipmentRentalServices Id + Libraries Id +} + +var Industries = IndustriesConfig{ + All: *All, + // Technology & Software + SoftwareDevelopment: SoftwareDevelopment, + ComputerHardwareManufacturing: ComputerHardwareManufacturing, + ComputerNetworkingProducts: ComputerNetworkingProducts, + ItServicesAndItConsulting: ItServicesAndItConsulting, + ComputerAndNetworkSecurity: ComputerAndNetworkSecurity, + Telecommunications: Telecommunications, + WirelessServices: WirelessServices, + TechnologyInformationAndInternet: TechnologyInformationAndInternet, + DataInfrastructureAndAnalytics: DataInfrastructureAndAnalytics, + InformationServices: InformationServices, + InternetPublishing: InternetPublishing, + SocialNetworkingPlatforms: SocialNetworkingPlatforms, + ComputerGames: ComputerGames, + MobileGamingApps: 
MobileGamingApps, + BlockchainServices: BlockchainServices, + BusinessIntelligencePlatforms: BusinessIntelligencePlatforms, + + // Financial Services + FinancialServices: FinancialServices, + Banking: Banking, + Insurance: Insurance, + InvestmentBanking: InvestmentBanking, + CapitalMarkets: CapitalMarkets, + VentureCapitalAndPrivateEquityPrincipals: VentureCapitalAndPrivateEquityPrincipals, + SecuritiesAndCommodityExchanges: SecuritiesAndCommodityExchanges, + FundsAndTrusts: FundsAndTrusts, + + // Healthcare & Medical + Hospitals: Hospitals, + MedicalPractices: MedicalPractices, + MedicalEquipmentManufacturing: MedicalEquipmentManufacturing, + PublicHealth: PublicHealth, + VeterinaryServices: VeterinaryServices, + BiotechnologyResearch: BiotechnologyResearch, + + // Manufacturing + Manufacturing: Manufacturing, + ComputersAndElectronicsManufacturing: ComputersAndElectronicsManufacturing, + SemiconductorManufacturing: SemiconductorManufacturing, + MachineryManufacturing: MachineryManufacturing, + IndustrialMachineryManufacturing: IndustrialMachineryManufacturing, + FoodAndBeverageManufacturing: FoodAndBeverageManufacturing, + TextileManufacturing: TextileManufacturing, + MotorVehicleManufacturing: MotorVehicleManufacturing, + MotorVehiclePartsManufacturing: MotorVehiclePartsManufacturing, + AviationAndAerospaceComponentManufacturing: AviationAndAerospaceComponentManufacturing, + DefenseAndSpaceManufacturing: DefenseAndSpaceManufacturing, + PlasticsManufacturing: PlasticsManufacturing, + RubberProductsManufacturing: RubberProductsManufacturing, + PaperAndForestProductManufacturing: PaperAndForestProductManufacturing, + WoodProductManufacturing: WoodProductManufacturing, + FurnitureAndHomeFurnishingsManufacturing: FurnitureAndHomeFurnishingsManufacturing, + SportingGoodsManufacturing: SportingGoodsManufacturing, + PrintingServices: PrintingServices, + + // Retail & Consumer Goods + Retail: Retail, + RetailGroceries: RetailGroceries, + OnlineAndMailOrderRetail: OnlineAndMailOrderRetail, + RetailApparelAndFashion: RetailApparelAndFashion, + RetailAppliancesElectricalAndElectronicEquipment: RetailAppliancesElectricalAndElectronicEquipment, + RetailBooksAndPrintedNews: RetailBooksAndPrintedNews, + RetailBuildingMaterialsAndGardenEquipment: RetailBuildingMaterialsAndGardenEquipment, + RetailFurnitureAndHomeFurnishings: RetailFurnitureAndHomeFurnishings, + RetailHealthAndPersonalCareProducts: RetailHealthAndPersonalCareProducts, + RetailLuxuryGoodsAndJewelry: RetailLuxuryGoodsAndJewelry, + RetailMotorVehicles: RetailMotorVehicles, + RetailOfficeEquipment: RetailOfficeEquipment, + RetailOfficeSuppliesAndGifts: RetailOfficeSuppliesAndGifts, + + // Professional Services + ProfessionalServices: ProfessionalServices, + Accounting: Accounting, + LegalServices: LegalServices, + LawPractice: LawPractice, + BusinessConsultingAndServices: BusinessConsultingAndServices, + StrategicManagementServices: StrategicManagementServices, + HumanResourcesServices: HumanResourcesServices, + MarketingServices: MarketingServices, + AdvertisingServices: AdvertisingServices, + PublicRelationsAndCommunicationsServices: PublicRelationsAndCommunicationsServices, + MarketResearch: MarketResearch, + ArchitectureAndPlanning: ArchitectureAndPlanning, + DesignServices: DesignServices, + GraphicDesign: GraphicDesign, + InteriorDesign: InteriorDesign, + EngineeringServices: EngineeringServices, + EnvironmentalServices: EnvironmentalServices, + ResearchServices: ResearchServices, + ThinkTanks: ThinkTanks, + Photography: Photography, + 
TranslationAndLocalization: TranslationAndLocalization, + WritingAndEditing: WritingAndEditing, + + // Education + Education: Education, + HigherEducation: HigherEducation, + ProfessionalTrainingAndCoaching: ProfessionalTrainingAndCoaching, + SportsAndRecreationInstruction: SportsAndRecreationInstruction, + + // Transportation & Logistics + TransportationLogisticsSupplyChainAndStorage: TransportationLogisticsSupplyChainAndStorage, + AirlinesAndAviation: AirlinesAndAviation, + FreightAndPackageTransportation: FreightAndPackageTransportation, + MaritimeTransportation: MaritimeTransportation, + RailTransportation: RailTransportation, + TruckTransportation: TruckTransportation, + WarehousingAndStorage: WarehousingAndStorage, + PostalServices: PostalServices, + + // Energy & Utilities + Utilities: Utilities, + ElectricPowerGeneration: ElectricPowerGeneration, + RenewableEnergyPowerGeneration: RenewableEnergyPowerGeneration, + OilAndGas: OilAndGas, + Mining: Mining, + OilGasAndMining: OilGasAndMining, + + // Media & Entertainment + TechnologyInformationAndMedia: TechnologyInformationAndMedia, + BroadcastMediaProductionAndDistribution: BroadcastMediaProductionAndDistribution, + RadioAndTelevisionBroadcasting: RadioAndTelevisionBroadcasting, + MoviesVideosAndSound: MoviesVideosAndSound, + MediaProduction: MediaProduction, + SoundRecording: SoundRecording, + BookAndPeriodicalPublishing: BookAndPeriodicalPublishing, + NewspaperPublishing: NewspaperPublishing, + PeriodicalPublishing: PeriodicalPublishing, + EntertainmentProviders: EntertainmentProviders, + ArtistsAndWriters: ArtistsAndWriters, + Musicians: Musicians, + + // Construction & Real Estate + Construction: Construction, + CivilEngineering: CivilEngineering, + RealEstate: RealEstate, + RealEstateAgentsAndBrokers: RealEstateAgentsAndBrokers, + + // Hospitality & Services + Hospitality: Hospitality, + HotelsAndMotels: HotelsAndMotels, + Restaurants: Restaurants, + FoodAndBeverageServices: FoodAndBeverageServices, + TravelArrangements: TravelArrangements, + EventsServices: EventsServices, + WellnessAndFitnessServices: WellnessAndFitnessServices, + ConsumerServices: ConsumerServices, + + // Government & Non-Profit + ArmedForces: ArmedForces, + GovernmentRelationsServices: GovernmentRelationsServices, + NonProfitOrganizations: NonProfitOrganizations, + CivicAndSocialOrganizations: CivicAndSocialOrganizations, + PoliticalOrganizations: PoliticalOrganizations, + ProfessionalOrganizations: ProfessionalOrganizations, + Fundraising: Fundraising, + + // Wholesale & Distribution + Wholesale: Wholesale, + WholesaleImportAndExport: WholesaleImportAndExport, + WholesaleComputerEquipment: WholesaleComputerEquipment, + WholesaleFoodAndBeverage: WholesaleFoodAndBeverage, + WholesaleBuildingMaterials: WholesaleBuildingMaterials, + WholesaleMachinery: WholesaleMachinery, + WholesaleMotorVehiclesAndParts: WholesaleMotorVehiclesAndParts, + + // Other Services + StaffingAndRecruiting: StaffingAndRecruiting, + ExecutiveSearchServices: ExecutiveSearchServices, + OfficeAdministration: OfficeAdministration, + SecurityAndInvestigations: SecurityAndInvestigations, + EquipmentRentalServices: EquipmentRentalServices, + Libraries: Libraries, +} diff --git a/types/linkedin/linkedin.go b/types/linkedin/linkedin.go new file mode 100644 index 0000000..80e0ff8 --- /dev/null +++ b/types/linkedin/linkedin.go @@ -0,0 +1,25 @@ +package linkedin + +import ( + "github.com/masa-finance/tee-types/types/linkedin/experiences" + "github.com/masa-finance/tee-types/types/linkedin/functions" 
+ "github.com/masa-finance/tee-types/types/linkedin/industries" + "github.com/masa-finance/tee-types/types/linkedin/profile" + "github.com/masa-finance/tee-types/types/linkedin/seniorities" +) + +type LinkedInConfig struct { + Experiences *experiences.ExperiencesConfig + Seniorities *seniorities.SenioritiesConfig + Functions *functions.FunctionsConfig + Industries *industries.IndustriesConfig +} + +var LinkedIn = LinkedInConfig{ + Experiences: &experiences.Experiences, + Seniorities: &seniorities.Seniorities, + Functions: &functions.Functions, + Industries: &industries.Industries, +} + +type Profile = *profile.Profile diff --git a/types/linkedin/linkedin_suite_test.go b/types/linkedin/linkedin_suite_test.go new file mode 100644 index 0000000..8f46e3d --- /dev/null +++ b/types/linkedin/linkedin_suite_test.go @@ -0,0 +1,13 @@ +package linkedin_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestTypes(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Types Suite") +} diff --git a/types/linkedin/linkedin_test.go b/types/linkedin/linkedin_test.go new file mode 100644 index 0000000..3ce35ae --- /dev/null +++ b/types/linkedin/linkedin_test.go @@ -0,0 +1,135 @@ +package linkedin_test + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-types/types/linkedin/experiences" + "github.com/masa-finance/tee-types/types/linkedin/functions" + "github.com/masa-finance/tee-types/types/linkedin/industries" + "github.com/masa-finance/tee-types/types/linkedin/seniorities" +) + +var _ = Describe("LinkedIn Types", func() { + Describe("LinkedIn Package", func() { + It("should have all required fields", func() { + linkedin := types.LinkedIn + + Expect(linkedin.Seniorities).ToNot(BeNil()) + Expect(linkedin.Experiences).ToNot(BeNil()) + Expect(linkedin.Functions).ToNot(BeNil()) + Expect(linkedin.Industries).ToNot(BeNil()) + }) + }) + + Describe("Seniorities", func() { + It("should have all seniority levels", func() { + s := types.LinkedIn.Seniorities + + Expect(s.InTraining).To(Equal(seniorities.InTraining)) + Expect(s.EntryLevel).To(Equal(seniorities.EntryLevel)) + Expect(s.Senior).To(Equal(seniorities.Senior)) + Expect(s.Strategic).To(Equal(seniorities.Strategic)) + Expect(s.EntryLevelManager).To(Equal(seniorities.EntryLevelManager)) + Expect(s.ExperiencedManager).To(Equal(seniorities.ExperiencedManager)) + Expect(s.Director).To(Equal(seniorities.Director)) + Expect(s.VicePresident).To(Equal(seniorities.VicePresident)) + Expect(s.CXO).To(Equal(seniorities.CXO)) + Expect(s.Partner).To(Equal(seniorities.Partner)) + }) + + It("should have All set containing all seniorities", func() { + all := types.LinkedIn.Seniorities.All + + Expect(all.Contains(seniorities.InTraining)).To(BeTrue()) + Expect(all.Contains(seniorities.EntryLevel)).To(BeTrue()) + Expect(all.Contains(seniorities.Senior)).To(BeTrue()) + Expect(all.Contains(seniorities.Strategic)).To(BeTrue()) + Expect(all.Contains(seniorities.EntryLevelManager)).To(BeTrue()) + Expect(all.Contains(seniorities.ExperiencedManager)).To(BeTrue()) + Expect(all.Contains(seniorities.Director)).To(BeTrue()) + Expect(all.Contains(seniorities.VicePresident)).To(BeTrue()) + Expect(all.Contains(seniorities.CXO)).To(BeTrue()) + Expect(all.Contains(seniorities.Partner)).To(BeTrue()) + + Expect(all.Length()).To(Equal(10)) + }) + }) + + Describe("Experiences", func() { + It("should have all experience levels", func() { + e := 
types.LinkedIn.Experiences + + Expect(e.LessThanAYear).To(Equal(experiences.LessThanAYear)) + Expect(e.OneToTwoYears).To(Equal(experiences.OneToTwoYears)) + Expect(e.ThreeToFiveYears).To(Equal(experiences.ThreeToFiveYears)) + Expect(e.SixToTenYears).To(Equal(experiences.SixToTenYears)) + Expect(e.MoreThanTenYears).To(Equal(experiences.MoreThanTenYears)) + }) + + It("should have All set containing all experiences", func() { + all := types.LinkedIn.Experiences.All + + Expect(all.Contains(experiences.LessThanAYear)).To(BeTrue()) + Expect(all.Contains(experiences.OneToTwoYears)).To(BeTrue()) + Expect(all.Contains(experiences.ThreeToFiveYears)).To(BeTrue()) + Expect(all.Contains(experiences.SixToTenYears)).To(BeTrue()) + Expect(all.Contains(experiences.MoreThanTenYears)).To(BeTrue()) + + Expect(all.Length()).To(Equal(5)) + }) + }) + + Describe("Functions", func() { + It("should have all function types", func() { + f := types.LinkedIn.Functions + + Expect(f.Accounting).To(Equal(functions.Accounting)) + Expect(f.Engineering).To(Equal(functions.Engineering)) + Expect(f.Marketing).To(Equal(functions.Marketing)) + Expect(f.Sales).To(Equal(functions.Sales)) + Expect(f.HumanResources).To(Equal(functions.HumanResources)) + }) + + It("should have All set containing all functions", func() { + all := types.LinkedIn.Functions.All + + Expect(all.Contains(functions.Accounting)).To(BeTrue()) + Expect(all.Contains(functions.Engineering)).To(BeTrue()) + Expect(all.Contains(functions.Marketing)).To(BeTrue()) + Expect(all.Contains(functions.Sales)).To(BeTrue()) + Expect(all.Contains(functions.HumanResources)).To(BeTrue()) + Expect(all.Contains(functions.InformationTechnology)).To(BeTrue()) + Expect(all.Contains(functions.Finance)).To(BeTrue()) + + Expect(all.Length()).To(Equal(25)) + }) + }) + + Describe("Industries", func() { + It("should have all industry types", func() { + i := types.LinkedIn.Industries + + Expect(i.SoftwareDevelopment).To(Equal(industries.SoftwareDevelopment)) + Expect(i.FinancialServices).To(Equal(industries.FinancialServices)) + Expect(i.Manufacturing).To(Equal(industries.Manufacturing)) + Expect(i.Retail).To(Equal(industries.Retail)) + Expect(i.Education).To(Equal(industries.Education)) + }) + + It("should have All set containing all industries", func() { + all := types.LinkedIn.Industries.All + + Expect(all.Contains(industries.SoftwareDevelopment)).To(BeTrue()) + Expect(all.Contains(industries.FinancialServices)).To(BeTrue()) + Expect(all.Contains(industries.Manufacturing)).To(BeTrue()) + Expect(all.Contains(industries.Retail)).To(BeTrue()) + Expect(all.Contains(industries.Education)).To(BeTrue()) + Expect(all.Contains(industries.Hospitals)).To(BeTrue()) + Expect(all.Contains(industries.ProfessionalServices)).To(BeTrue()) + + Expect(all.Length()).To(BeNumerically(">=", 100)) // Should have many industries + }) + }) +}) diff --git a/types/linkedin/profile/profile.go b/types/linkedin/profile/profile.go new file mode 100644 index 0000000..e74439e --- /dev/null +++ b/types/linkedin/profile/profile.go @@ -0,0 +1,261 @@ +package profile + +import ( + "time" + + "github.com/masa-finance/tee-types/pkg/util" +) + +type ScraperMode string + +const ( + ScraperModeShort ScraperMode = "Short" + ScraperModeFull ScraperMode = "Full" + ScraperModeFullEmail ScraperMode = "Full + email search" +) + +var AllScraperModes = util.NewSet(ScraperModeShort, ScraperModeFull, ScraperModeFullEmail) + +// Profile represents a complete profile response +type Profile struct { + ID string `json:"id"` + PublicIdentifier 
string `json:"publicIdentifier,omitempty"` + URL string `json:"linkedinUrl"` + FirstName string `json:"firstName"` + LastName string `json:"lastName"` + Headline *string `json:"headline,omitempty"` + About *string `json:"about,omitempty"` + Summary *string `json:"summary,omitempty"` + OpenToWork bool `json:"openToWork,omitempty"` + OpenProfile bool `json:"openProfile,omitempty"` + Hiring bool `json:"hiring,omitempty"` + Photo *string `json:"photo,omitempty"` + PictureUrl *string `json:"pictureUrl,omitempty"` + Premium bool `json:"premium,omitempty"` + Influencer bool `json:"influencer,omitempty"` + Location Location `json:"location,omitempty"` + Verified bool `json:"verified,omitempty"` + RegisteredAt time.Time `json:"registeredAt,omitempty"` + TopSkills *string `json:"topSkills,omitempty"` + ConnectionsCount int `json:"connectionsCount,omitempty"` + FollowerCount int `json:"followerCount,omitempty"` + ComposeOptionType *string `json:"composeOptionType,omitempty"` + + // Full mode + CurrentPosition []CurrentPosition `json:"currentPosition,omitempty"` + + // Short mode + CurrentPositions []ShortCurrentPosition `json:"currentPositions,omitempty"` + + Experience []Experience `json:"experience,omitempty"` + Education []Education `json:"education,omitempty"` + Certifications []Certification `json:"certifications,omitempty"` + Projects []Project `json:"projects,omitempty"` + Volunteering []Volunteering `json:"volunteering,omitempty"` + ReceivedRecommendations []Recommendation `json:"receivedRecommendations,omitempty"` + Skills []Skill `json:"skills,omitempty"` + Courses []Course `json:"courses,omitempty"` + Publications []Publication `json:"publications,omitempty"` + Patents []Patent `json:"patents,omitempty"` + HonorsAndAwards []HonorAndAward `json:"honorsAndAwards,omitempty"` + Languages []Language `json:"languages,omitempty"` + Featured any `json:"featured,omitempty"` + MoreProfiles []MoreProfile `json:"moreProfiles,omitempty"` + + // Email mode + Emails []string `json:"emails,omitempty"` + CompanyWebsites []CompanyWebsite `json:"companyWebsites,omitempty"` +} + +// Location represents the location information +type Location struct { + Text string `json:"linkedinText"` + CountryCode string `json:"countryCode,omitempty"` + Parsed ParsedLocation `json:"parsed,omitempty"` +} + +// ParsedLocation represents the parsed location details +type ParsedLocation struct { + Text string `json:"text,omitempty"` + CountryCode string `json:"countryCode,omitempty"` + RegionCode *string `json:"regionCode,omitempty"` + Country string `json:"country,omitempty"` + CountryFull string `json:"countryFull,omitempty"` + State string `json:"state,omitempty"` + City string `json:"city,omitempty"` +} + +// CurrentPosition represents current position information +type CurrentPosition struct { + CompanyID *string `json:"companyId,omitempty"` + CompanyLinkedinUrl *string `json:"companyLinkedinUrl,omitempty"` + CompanyName string `json:"companyName"` + DateRange *DatePeriod `json:"dateRange,omitempty"` +} + +// Experience represents work experience +type Experience struct { + Position string `json:"position"` + Location *string `json:"location,omitempty"` + EmploymentType *string `json:"employmentType,omitempty"` + WorkplaceType *string `json:"workplaceType,omitempty"` + CompanyName string `json:"companyName"` + CompanyURL *string `json:"companyUrl,omitempty"` + CompanyID *string `json:"companyId,omitempty"` + CompanyUniversalName *string `json:"companyUniversalName,omitempty"` + Duration string `json:"duration"` + 
Description *string `json:"description,omitempty"` + Skills []string `json:"skills,omitempty"` + StartDate DateRange `json:"startDate"` + EndDate DateRange `json:"endDate"` +} + +// DateRange represents a date range with month, year, and text +type DateRange struct { + Month *string `json:"month,omitempty"` + Year *int `json:"year,omitempty"` + Text string `json:"text"` +} + +// Education represents educational background +type Education struct { + SchoolName string `json:"schoolName,omitempty"` + SchoolURL string `json:"schoolUrl,omitempty"` + Degree string `json:"degree,omitempty"` + FieldOfStudy *string `json:"fieldOfStudy,omitempty"` + Skills []string `json:"skills,omitempty"` + StartDate DateRange `json:"startDate,omitempty"` + EndDate DateRange `json:"endDate,omitempty"` + Period string `json:"period,omitempty"` +} + +// Certification represents a certification +type Certification struct { + Title string `json:"title,omitempty"` + IssuedAt string `json:"issuedAt,omitempty"` + IssuedBy string `json:"issuedBy,omitempty"` + IssuedByLink string `json:"issuedByLink,omitempty"` +} + +// Project represents a project +type Project struct { + Title string `json:"title,omitempty"` + Description string `json:"description,omitempty"` + Duration string `json:"duration,omitempty"` + StartDate DateRange `json:"startDate,omitempty"` + EndDate DateRange `json:"endDate,omitempty"` +} + +// Volunteering represents volunteer experience +type Volunteering struct { + Role string `json:"role,omitempty"` + Duration string `json:"duration,omitempty"` + StartDate *DateRange `json:"startDate,omitempty"` + EndDate *DateRange `json:"endDate,omitempty"` + OrganizationName string `json:"organizationName,omitempty"` + OrganizationURL *string `json:"organizationUrl,omitempty"` + Cause string `json:"cause,omitempty"` +} + +// Skill represents a skill with optional positions and endorsements +type Skill struct { + Name string `json:"name,omitempty"` + Positions []string `json:"positions,omitempty"` + Endorsements string `json:"endorsements,omitempty"` +} + +// Course represents a course +type Course struct { + Title string `json:"title,omitempty"` + AssociatedWith string `json:"associatedWith,omitempty"` + AssociatedWithLink string `json:"associatedWithLink,omitempty"` +} + +// Publication represents a publication +type Publication struct { + Title string `json:"title,omitempty"` + PublishedAt string `json:"publishedAt,omitempty"` + Link string `json:"link,omitempty"` +} + +// HonorAndAward represents an honor or award +type HonorAndAward struct { + Title string `json:"title,omitempty"` + IssuedBy string `json:"issuedBy,omitempty"` + IssuedAt string `json:"issuedAt,omitempty"` + Description string `json:"description,omitempty"` + AssociatedWith string `json:"associatedWith,omitempty"` + AssociatedWithLink string `json:"associatedWithLink,omitempty"` +} + +// Language represents a language with proficiency level +type Language struct { + Name string `json:"name,omitempty"` + Proficiency string `json:"proficiency,omitempty"` +} + +// MoreProfile represents a related profile +type MoreProfile struct { + ID string `json:"id,omitempty"` + FirstName string `json:"firstName,omitempty"` + LastName string `json:"lastName,omitempty"` + Position *string `json:"position,omitempty"` + PublicIdentifier string `json:"publicIdentifier,omitempty"` + URL string `json:"linkedinUrl,omitempty"` +} + +// ShortCurrentPosition represents the short profile current positions array +type ShortCurrentPosition struct { + TenureAtPosition *Tenure 
`json:"tenureAtPosition,omitempty"` + CompanyName string `json:"companyName,omitempty"` + Title *string `json:"title,omitempty"` + Current *bool `json:"current,omitempty"` + TenureAtCompany *Tenure `json:"tenureAtCompany,omitempty"` + StartedOn *StartedOn `json:"startedOn,omitempty"` + CompanyID *string `json:"companyId,omitempty"` + CompanyLinkedinUrl *string `json:"companyLinkedinUrl,omitempty"` +} + +type Tenure struct { + NumYears *int `json:"numYears,omitempty"` + NumMonths *int `json:"numMonths,omitempty"` +} + +type StartedOn struct { + Month int `json:"month,omitempty"` + Year int `json:"year,omitempty"` +} + +// DatePeriod represents a date period with optional start and end parts +type DatePeriod struct { + Start *DateParts `json:"start,omitempty"` + End *DateParts `json:"end,omitempty"` +} + +type DateParts struct { + Month *int `json:"month,omitempty"` + Year *int `json:"year,omitempty"` + Day *int `json:"day,omitempty"` +} + +// CompanyWebsite represents company website with validation hint +type CompanyWebsite struct { + URL string `json:"url,omitempty"` + Domain string `json:"domain,omitempty"` + ValidEmailServer *bool `json:"validEmailServer,omitempty"` +} + +// Recommendation captures received recommendations +type Recommendation struct { + GivenBy *string `json:"givenBy,omitempty"` + GivenByLink *string `json:"givenByLink,omitempty"` + GivenAt *string `json:"givenAt,omitempty"` + Description string `json:"description,omitempty"` +} + +// Patent represents a patent entry +type Patent struct { + Title string `json:"title,omitempty"` + Number *string `json:"number,omitempty"` + IssuedAt string `json:"issuedAt,omitempty"` +} diff --git a/types/linkedin/seniorities/seniorities.go b/types/linkedin/seniorities/seniorities.go new file mode 100644 index 0000000..98e978c --- /dev/null +++ b/types/linkedin/seniorities/seniorities.go @@ -0,0 +1,61 @@ +package seniorities + +import "github.com/masa-finance/tee-types/pkg/util" + +// id represents a LinkedIn seniority level identifier +type Id string + +// Seniority level constants +const ( + InTraining Id = "100" + EntryLevel Id = "110" + Senior Id = "120" + Strategic Id = "130" + EntryLevelManager Id = "200" + ExperiencedManager Id = "210" + Director Id = "220" + VicePresident Id = "300" + CXO Id = "310" + Partner Id = "320" +) + +var All = util.NewSet( + InTraining, + EntryLevel, + Senior, + Strategic, + EntryLevelManager, + ExperiencedManager, + Director, + VicePresident, + CXO, + Partner, +) + +type SenioritiesConfig struct { + All util.Set[Id] + InTraining Id + EntryLevel Id + Senior Id + Strategic Id + EntryLevelManager Id + ExperiencedManager Id + Director Id + VicePresident Id + CXO Id + Partner Id +} + +var Seniorities = SenioritiesConfig{ + All: *All, + InTraining: InTraining, + EntryLevel: EntryLevel, + Senior: Senior, + Strategic: Strategic, + EntryLevelManager: EntryLevelManager, + ExperiencedManager: ExperiencedManager, + Director: Director, + VicePresident: VicePresident, + CXO: CXO, + Partner: Partner, +} diff --git a/types/types.go b/types/types.go new file mode 100644 index 0000000..64b43d0 --- /dev/null +++ b/types/types.go @@ -0,0 +1,7 @@ +package types + +import ( + linkedin "github.com/masa-finance/tee-types/types/linkedin" +) + +var LinkedIn = linkedin.LinkedIn From 4421902362ffcc884d83fce2b442ddc0402668c9 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 13 Oct 2025 20:40:01 +0200 Subject: [PATCH 107/136] Add collaboration rules before merge --- .cursor/rules/collaboration-rules.mdc | 30 
+++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .cursor/rules/collaboration-rules.mdc diff --git a/.cursor/rules/collaboration-rules.mdc b/.cursor/rules/collaboration-rules.mdc new file mode 100644 index 0000000..fbc46e1 --- /dev/null +++ b/.cursor/rules/collaboration-rules.mdc @@ -0,0 +1,30 @@ +--- +description: +globs: +alwaysApply: true +--- +# Collaboration Rules + +## Planning and Confirmation Rule + +**Before implementing any code changes, features, or modifications:** + +1. **Create a Plan**: Always develop a clear, detailed plan that outlines: + - What changes will be made + - Which files will be modified or created + - The approach and methodology + - Expected outcomes and impacts + +2. **Confirm with User**: Present the plan to the user and wait for explicit confirmation before: + - Making any file modifications + - Creating new files + - Running commands that modify the codebase + - Implementing any suggested changes + +3. **Get Approval**: Only proceed with implementation after receiving clear approval from the user. + +4. **No Assumptions**: Never assume the user wants changes implemented immediately, even if they seem obvious or beneficial. + +**Exception**: Read-only operations (viewing files, searching, analyzing) do not require prior confirmation. + +This rule ensures we maintain collaborative control over the codebase and prevents unwanted changes. From 39baf851401a88e090c98ee055f361280ec6b7f8 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 13 Oct 2025 20:48:23 +0200 Subject: [PATCH 108/136] Complete tee-types merge - Successfully merged tee-types into tee-worker with preserved git history - Moved args/ to api/args/ and types/ to api/types/ - Updated all import paths from external tee-types to local paths - Removed external tee-types dependency from go.mod - Removed tee-worker specific files (job.go, encrypted.go, key.go) from types package - All merged packages (api/types, api/args, pkg/util) build successfully - Ready for CI integration and final testing --- {args => api/args}/args.go | 4 +- {args => api/args}/args_suite_test.go | 0 {args => api/args}/linkedin/linkedin.go | 2 +- .../args}/linkedin/profile/profile.go | 12 +- .../linkedin/profile/profile_suite_test.go | 0 .../args}/linkedin/profile/profile_test.go | 16 +- {args => api/args}/llm.go | 4 +- {args => api/args}/llm_test.go | 2 +- {args => api/args}/reddit.go | 2 +- {args => api/args}/reddit_test.go | 4 +- {args => api/args}/telemetry.go | 2 +- {args => api/args}/tiktok.go | 2 +- {args => api/args}/twitter.go | 2 +- {args => api/args}/unmarshaller.go | 2 +- {args => api/args}/unmarshaller_test.go | 4 +- {args => api/args}/web.go | 2 +- {args => api/args}/web_test.go | 4 +- api/types/encrypted.go | 36 ----- api/types/job.go | 113 ------------- {types => api/types}/jobs.go | 2 +- api/types/key.go | 11 -- .../linkedin/experiences/experiences.go | 2 +- .../types}/linkedin/functions/functions.go | 2 +- .../types}/linkedin/industries/industries.go | 2 +- {types => api/types}/linkedin/linkedin.go | 10 +- .../types}/linkedin/linkedin_suite_test.go | 0 .../types}/linkedin/linkedin_test.go | 10 +- .../types}/linkedin/profile/profile.go | 2 +- .../linkedin/seniorities/seniorities.go | 2 +- {types => api/types}/llm.go | 0 {types => api/types}/reddit.go | 2 +- api/types/reddit/reddit.go | 152 ------------------ api/types/reddit/reddit_suite_test.go | 13 -- api/types/reddit/reddit_test.go | 150 ----------------- {types => api/types}/reddit_test.go | 2 +- {types => api/types}/tiktok.go | 
0 {types => api/types}/twitter.go | 0 api/types/types.go | 7 + {types => api/types}/types_suite_test.go | 0 {types => api/types}/web.go | 0 go.sum | 2 - internal/api/api_test.go | 2 +- internal/apify/actors.go | 2 +- internal/capabilities/detector.go | 4 +- internal/capabilities/detector_test.go | 2 +- internal/config/config.go | 2 +- internal/jobs/linkedin.go | 8 +- internal/jobs/linkedin_test.go | 6 +- internal/jobs/linkedinapify/client.go | 4 +- internal/jobs/linkedinapify/client_test.go | 6 +- internal/jobs/llmapify/client.go | 4 +- internal/jobs/llmapify/client_test.go | 4 +- internal/jobs/reddit.go | 6 +- internal/jobs/reddit_test.go | 4 +- internal/jobs/redditapify/client.go | 6 +- internal/jobs/redditapify/client_test.go | 4 +- internal/jobs/stats/stats.go | 2 +- internal/jobs/telemetry.go | 2 +- internal/jobs/telemetry_test.go | 2 +- internal/jobs/tiktok.go | 4 +- internal/jobs/tiktok_test.go | 2 +- internal/jobs/tiktokapify/client.go | 4 +- internal/jobs/twitter.go | 4 +- internal/jobs/twitter_test.go | 2 +- internal/jobs/twitterapify/client.go | 4 +- internal/jobs/web.go | 6 +- internal/jobs/web_test.go | 4 +- internal/jobs/webapify/client.go | 4 +- internal/jobs/webapify/client_test.go | 2 +- internal/jobserver/jobserver.go | 2 +- internal/jobserver/jobserver_test.go | 2 +- internal/jobserver/worker.go | 2 +- pkg/client/http.go | 2 +- pkg/client/http_test.go | 2 +- pkg/util/math_test.go | 2 +- pkg/util/set_test.go | 2 +- types/types.go | 7 - 77 files changed, 117 insertions(+), 594 deletions(-) rename {args => api/args}/args.go (87%) rename {args => api/args}/args_suite_test.go (100%) rename {args => api/args}/linkedin/linkedin.go (52%) rename {args => api/args}/linkedin/profile/profile.go (91%) rename {args => api/args}/linkedin/profile/profile_suite_test.go (100%) rename {args => api/args}/linkedin/profile/profile_test.go (93%) rename {args => api/args}/llm.go (96%) rename {args => api/args}/llm_test.go (98%) rename {args => api/args}/reddit.go (98%) rename {args => api/args}/reddit_test.go (98%) rename {args => api/args}/telemetry.go (85%) rename {args => api/args}/tiktok.go (99%) rename {args => api/args}/twitter.go (98%) rename {args => api/args}/unmarshaller.go (99%) rename {args => api/args}/unmarshaller_test.go (97%) rename {args => api/args}/web.go (98%) rename {args => api/args}/web_test.go (97%) delete mode 100644 api/types/encrypted.go delete mode 100644 api/types/job.go rename {types => api/types}/jobs.go (99%) delete mode 100644 api/types/key.go rename {types => api/types}/linkedin/experiences/experiences.go (93%) rename {types => api/types}/linkedin/functions/functions.go (98%) rename {types => api/types}/linkedin/industries/industries.go (99%) rename {types => api/types}/linkedin/linkedin.go (57%) rename {types => api/types}/linkedin/linkedin_suite_test.go (100%) rename {types => api/types}/linkedin/linkedin_test.go (93%) rename {types => api/types}/linkedin/profile/profile.go (99%) rename {types => api/types}/linkedin/seniorities/seniorities.go (96%) rename {types => api/types}/llm.go (100%) rename {types => api/types}/reddit.go (99%) delete mode 100644 api/types/reddit/reddit.go delete mode 100644 api/types/reddit/reddit_suite_test.go delete mode 100644 api/types/reddit/reddit_test.go rename {types => api/types}/reddit_test.go (98%) rename {types => api/types}/tiktok.go (100%) rename {types => api/types}/twitter.go (100%) create mode 100644 api/types/types.go rename {types => api/types}/types_suite_test.go (100%) rename {types => api/types}/web.go (100%) 
delete mode 100644 types/types.go diff --git a/args/args.go b/api/args/args.go similarity index 87% rename from args/args.go rename to api/args/args.go index f090037..71b2b31 100644 --- a/args/args.go +++ b/api/args/args.go @@ -4,8 +4,8 @@ import ( "encoding/json" "fmt" - "github.com/masa-finance/tee-types/args/linkedin" - teetypes "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/api/args/linkedin" + teetypes "github.com/masa-finance/tee-worker/api/types" ) type LinkedInProfileArguments = linkedin.ProfileArguments diff --git a/args/args_suite_test.go b/api/args/args_suite_test.go similarity index 100% rename from args/args_suite_test.go rename to api/args/args_suite_test.go diff --git a/args/linkedin/linkedin.go b/api/args/linkedin/linkedin.go similarity index 52% rename from args/linkedin/linkedin.go rename to api/args/linkedin/linkedin.go index 92ed3e7..e3ad22c 100644 --- a/args/linkedin/linkedin.go +++ b/api/args/linkedin/linkedin.go @@ -1,7 +1,7 @@ package linkedin import ( - "github.com/masa-finance/tee-types/args/linkedin/profile" + "github.com/masa-finance/tee-worker/api/args/linkedin/profile" ) type ProfileArguments = profile.Arguments diff --git a/args/linkedin/profile/profile.go b/api/args/linkedin/profile/profile.go similarity index 91% rename from args/linkedin/profile/profile.go rename to api/args/linkedin/profile/profile.go index 399caa0..9409404 100644 --- a/args/linkedin/profile/profile.go +++ b/api/args/linkedin/profile/profile.go @@ -5,12 +5,12 @@ import ( "errors" "fmt" - teetypes "github.com/masa-finance/tee-types/types" - "github.com/masa-finance/tee-types/types/linkedin/experiences" - "github.com/masa-finance/tee-types/types/linkedin/functions" - "github.com/masa-finance/tee-types/types/linkedin/industries" - "github.com/masa-finance/tee-types/types/linkedin/profile" - "github.com/masa-finance/tee-types/types/linkedin/seniorities" + teetypes "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/api/types/linkedin/experiences" + "github.com/masa-finance/tee-worker/api/types/linkedin/functions" + "github.com/masa-finance/tee-worker/api/types/linkedin/industries" + "github.com/masa-finance/tee-worker/api/types/linkedin/profile" + "github.com/masa-finance/tee-worker/api/types/linkedin/seniorities" ) var ( diff --git a/args/linkedin/profile/profile_suite_test.go b/api/args/linkedin/profile/profile_suite_test.go similarity index 100% rename from args/linkedin/profile/profile_suite_test.go rename to api/args/linkedin/profile/profile_suite_test.go diff --git a/args/linkedin/profile/profile_test.go b/api/args/linkedin/profile/profile_test.go similarity index 93% rename from args/linkedin/profile/profile_test.go rename to api/args/linkedin/profile/profile_test.go index 6c9e5de..5947989 100644 --- a/args/linkedin/profile/profile_test.go +++ b/api/args/linkedin/profile/profile_test.go @@ -7,14 +7,14 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-types/args" - "github.com/masa-finance/tee-types/args/linkedin/profile" - "github.com/masa-finance/tee-types/types" - "github.com/masa-finance/tee-types/types/linkedin/experiences" - "github.com/masa-finance/tee-types/types/linkedin/functions" - "github.com/masa-finance/tee-types/types/linkedin/industries" - profiletypes "github.com/masa-finance/tee-types/types/linkedin/profile" - "github.com/masa-finance/tee-types/types/linkedin/seniorities" + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/linkedin/profile" + "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/api/types/linkedin/experiences" + "github.com/masa-finance/tee-worker/api/types/linkedin/functions" + "github.com/masa-finance/tee-worker/api/types/linkedin/industries" + profiletypes "github.com/masa-finance/tee-worker/api/types/linkedin/profile" + "github.com/masa-finance/tee-worker/api/types/linkedin/seniorities" ) var _ = Describe("LinkedIn Profile Arguments", func() { diff --git a/args/llm.go b/api/args/llm.go similarity index 96% rename from args/llm.go rename to api/args/llm.go index d3e4ac8..eaf02ad 100644 --- a/args/llm.go +++ b/api/args/llm.go @@ -6,8 +6,8 @@ import ( "fmt" "strconv" - "github.com/masa-finance/tee-types/pkg/util" - teetypes "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/pkg/util" + teetypes "github.com/masa-finance/tee-worker/api/types" ) var ( diff --git a/args/llm_test.go b/api/args/llm_test.go similarity index 98% rename from args/llm_test.go rename to api/args/llm_test.go index a9b02c2..d15e048 100644 --- a/args/llm_test.go +++ b/api/args/llm_test.go @@ -7,7 +7,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/masa-finance/tee-types/args" + "github.com/masa-finance/tee-worker/api/args" ) var _ = Describe("LLMProcessorArguments", func() { diff --git a/args/reddit.go b/api/args/reddit.go similarity index 98% rename from args/reddit.go rename to api/args/reddit.go index c2ac4e5..cd024f4 100644 --- a/args/reddit.go +++ b/api/args/reddit.go @@ -8,7 +8,7 @@ import ( "strings" "time" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" ) var ( diff --git a/args/reddit_test.go b/api/args/reddit_test.go similarity index 98% rename from args/reddit_test.go rename to api/args/reddit_test.go index 251ff67..f9775cc 100644 --- a/args/reddit_test.go +++ b/api/args/reddit_test.go @@ -7,8 +7,8 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-types/args" - "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/types" ) var _ = Describe("RedditArguments", func() { diff --git a/args/telemetry.go b/api/args/telemetry.go similarity index 85% rename from args/telemetry.go rename to api/args/telemetry.go index 08fb741..b947204 100644 --- a/args/telemetry.go +++ b/api/args/telemetry.go @@ -1,7 +1,7 @@ package args import ( - "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/api/types" ) // TelemetryJobArguments for telemetry jobs (simple case) diff --git a/args/tiktok.go b/api/args/tiktok.go similarity index 99% rename from args/tiktok.go rename to api/args/tiktok.go index 0e015c5..37c4aaa 100644 --- a/args/tiktok.go +++ b/api/args/tiktok.go @@ -7,7 +7,7 @@ import ( "net/url" "strings" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" ) // Period constants for TikTok trending search diff --git a/args/twitter.go b/api/args/twitter.go similarity index 98% rename from args/twitter.go rename to api/args/twitter.go index 6c08f65..467df1b 100644 --- a/args/twitter.go +++ b/api/args/twitter.go @@ -5,7 +5,7 @@ import ( "errors" "fmt" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" ) var ( diff --git a/args/unmarshaller.go b/api/args/unmarshaller.go similarity index 99% rename from args/unmarshaller.go rename to api/args/unmarshaller.go index 9057885..737826e 100644 --- a/args/unmarshaller.go +++ b/api/args/unmarshaller.go @@ -4,7 +4,7 @@ import ( "encoding/json" "fmt" - "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/api/types" ) // JobArguments defines the interface that all job arguments must implement diff --git a/args/unmarshaller_test.go b/api/args/unmarshaller_test.go similarity index 97% rename from args/unmarshaller_test.go rename to api/args/unmarshaller_test.go index 03baaca..d9d168f 100644 --- a/args/unmarshaller_test.go +++ b/api/args/unmarshaller_test.go @@ -4,8 +4,8 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/masa-finance/tee-types/args" - "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/types" ) var _ = Describe("Unmarshaller", func() { diff --git a/args/web.go b/api/args/web.go similarity index 98% rename from args/web.go rename to api/args/web.go index 561aa59..dac4642 100644 --- a/args/web.go +++ b/api/args/web.go @@ -6,7 +6,7 @@ import ( "fmt" "net/url" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" ) var ( diff --git a/args/web_test.go b/api/args/web_test.go similarity index 97% rename from args/web_test.go rename to api/args/web_test.go index 77e771f..fecf831 100644 --- a/args/web_test.go +++ b/api/args/web_test.go @@ -7,8 +7,8 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-types/args" - "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/types" ) var _ = Describe("WebArguments", func() { diff --git a/api/types/encrypted.go b/api/types/encrypted.go deleted file mode 100644 index 1d5f36a..0000000 --- a/api/types/encrypted.go +++ /dev/null @@ -1,36 +0,0 @@ -package types - -import ( - "encoding/json" - "fmt" - - "github.com/masa-finance/tee-worker/pkg/tee" -) - -type EncryptedRequest struct { - EncryptedResult string `json:"encrypted_result"` - EncryptedRequest string `json:"encrypted_request"` -} - -func (payload EncryptedRequest) Unseal() (string, error) { - jobRequest, err := tee.Unseal(payload.EncryptedRequest) - if err != nil { - return "", fmt.Errorf("error while unsealing the encrypted request: %w", err) - } - - job := Job{} - if err := json.Unmarshal(jobRequest, &job); err != nil { - return "", fmt.Errorf("error while unmarshalling the job request: %w", err) - } - - dat, err := tee.UnsealWithKey(job.Nonce, payload.EncryptedResult) - if err != nil { - return "", fmt.Errorf("error while unsealing the job result: %w", err) - } - - return string(dat), nil -} - -type JobError struct { - Error string `json:"error"` -} diff --git a/api/types/job.go b/api/types/job.go deleted file mode 100644 index 9010c3b..0000000 --- a/api/types/job.go +++ /dev/null @@ -1,113 +0,0 @@ -package types - -import ( - "crypto/sha256" - "encoding/json" - "fmt" - "time" - - teetypes "github.com/masa-finance/tee-types/types" - "github.com/masa-finance/tee-worker/pkg/tee" - "golang.org/x/exp/rand" -) - -type JobArguments map[string]interface{} - -func (ja JobArguments) Unmarshal(i interface{}) error { - dat, err := json.Marshal(ja) - if err != nil { - return err - } - return json.Unmarshal(dat, i) -} - -type Job struct { - Type teetypes.JobType `json:"type"` - Arguments JobArguments `json:"arguments"` - UUID string `json:"-"` - Nonce string `json:"quote"` - WorkerID string `json:"worker_id"` - TargetWorker string `json:"target_worker"` - Timeout time.Duration `json:"timeout"` -} - -func (j Job) String() string { - return fmt.Sprintf("UUID: %s Type: %s Arguments: %s", j.UUID, j.Type, j.Arguments) -} - -var letterRunes = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+") - -func randStringRunes(n int) string { - b := make([]rune, n) - for i := range b { - // TODO: Move xcrypt from indexer to tee-types, and use RandomString here (although we'll need a different alpahbet) - b[i] = letterRunes[rand.Intn(len(letterRunes))] - } - return string(b) -} - -// GenerateJobSignature generates a signature for the job. -func (job *Job) GenerateJobSignature() (string, error) { - - dat, err := json.Marshal(job) - if err != nil { - return "", err - } - - checksum := sha256.New() - checksum.Write(dat) - - job.Nonce = fmt.Sprintf("%s-%s", string(checksum.Sum(nil)), randStringRunes(99)) - - dat, err = json.Marshal(job) - if err != nil { - return "", err - } - - return tee.Seal(dat) -} - -type JobResponse struct { - UID string `json:"uid"` -} - -type JobResult struct { - Error string `json:"error"` - Data []byte `json:"data"` - Job Job `json:"job"` - NextCursor string `json:"next_cursor"` -} - -// Success returns true if the job was successful. -func (jr JobResult) Success() bool { - return jr.Error == "" -} - -// Seal returns the sealed job result. 
-func (jr JobResult) Seal() (string, error) { - return tee.SealWithKey(jr.Job.Nonce, jr.Data) -} - -// Unmarshal unmarshals the job result data. -func (jr JobResult) Unmarshal(i interface{}) error { - return json.Unmarshal(jr.Data, i) -} - -type JobRequest struct { - EncryptedJob string `json:"encrypted_job"` -} - -// DecryptJob decrypts the job request. -func (jobRequest JobRequest) DecryptJob() (*Job, error) { - dat, err := tee.Unseal(jobRequest.EncryptedJob) - if err != nil { - return nil, err - } - - job := Job{} - if err := json.Unmarshal(dat, &job); err != nil { - return nil, err - } - - return &job, nil -} diff --git a/types/jobs.go b/api/types/jobs.go similarity index 99% rename from types/jobs.go rename to api/types/jobs.go index 628cb46..c35faae 100644 --- a/types/jobs.go +++ b/api/types/jobs.go @@ -6,7 +6,7 @@ import ( "slices" "time" - "github.com/masa-finance/tee-types/pkg/util" + "github.com/masa-finance/tee-worker/pkg/util" ) type JobType string diff --git a/api/types/key.go b/api/types/key.go deleted file mode 100644 index 8691eae..0000000 --- a/api/types/key.go +++ /dev/null @@ -1,11 +0,0 @@ -package types - -type Key struct { - Key string `json:"key"` - - Signature string `json:"signature"` -} - -type KeyResponse struct { - Status string `json:"status"` -} diff --git a/types/linkedin/experiences/experiences.go b/api/types/linkedin/experiences/experiences.go similarity index 93% rename from types/linkedin/experiences/experiences.go rename to api/types/linkedin/experiences/experiences.go index e714b58..b339230 100644 --- a/types/linkedin/experiences/experiences.go +++ b/api/types/linkedin/experiences/experiences.go @@ -1,6 +1,6 @@ package experiences -import "github.com/masa-finance/tee-types/pkg/util" +import "github.com/masa-finance/tee-worker/pkg/util" // id represents a LinkedIn experience level identifier type Id string diff --git a/types/linkedin/functions/functions.go b/api/types/linkedin/functions/functions.go similarity index 98% rename from types/linkedin/functions/functions.go rename to api/types/linkedin/functions/functions.go index cd08f33..7dd715c 100644 --- a/types/linkedin/functions/functions.go +++ b/api/types/linkedin/functions/functions.go @@ -1,6 +1,6 @@ package functions -import "github.com/masa-finance/tee-types/pkg/util" +import "github.com/masa-finance/tee-worker/pkg/util" // id represents a LinkedIn function identifier type Id string diff --git a/types/linkedin/industries/industries.go b/api/types/linkedin/industries/industries.go similarity index 99% rename from types/linkedin/industries/industries.go rename to api/types/linkedin/industries/industries.go index 772ad9d..666df4d 100644 --- a/types/linkedin/industries/industries.go +++ b/api/types/linkedin/industries/industries.go @@ -1,6 +1,6 @@ package industries -import "github.com/masa-finance/tee-types/pkg/util" +import "github.com/masa-finance/tee-worker/pkg/util" // Id represents a LinkedIn industry identifier type Id string diff --git a/types/linkedin/linkedin.go b/api/types/linkedin/linkedin.go similarity index 57% rename from types/linkedin/linkedin.go rename to api/types/linkedin/linkedin.go index 80e0ff8..4f1dbf5 100644 --- a/types/linkedin/linkedin.go +++ b/api/types/linkedin/linkedin.go @@ -1,11 +1,11 @@ package linkedin import ( - "github.com/masa-finance/tee-types/types/linkedin/experiences" - "github.com/masa-finance/tee-types/types/linkedin/functions" - "github.com/masa-finance/tee-types/types/linkedin/industries" - "github.com/masa-finance/tee-types/types/linkedin/profile" - 
"github.com/masa-finance/tee-types/types/linkedin/seniorities" + "github.com/masa-finance/tee-worker/api/types/linkedin/experiences" + "github.com/masa-finance/tee-worker/api/types/linkedin/functions" + "github.com/masa-finance/tee-worker/api/types/linkedin/industries" + "github.com/masa-finance/tee-worker/api/types/linkedin/profile" + "github.com/masa-finance/tee-worker/api/types/linkedin/seniorities" ) type LinkedInConfig struct { diff --git a/types/linkedin/linkedin_suite_test.go b/api/types/linkedin/linkedin_suite_test.go similarity index 100% rename from types/linkedin/linkedin_suite_test.go rename to api/types/linkedin/linkedin_suite_test.go diff --git a/types/linkedin/linkedin_test.go b/api/types/linkedin/linkedin_test.go similarity index 93% rename from types/linkedin/linkedin_test.go rename to api/types/linkedin/linkedin_test.go index 3ce35ae..88c26f0 100644 --- a/types/linkedin/linkedin_test.go +++ b/api/types/linkedin/linkedin_test.go @@ -4,11 +4,11 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/masa-finance/tee-types/types" - "github.com/masa-finance/tee-types/types/linkedin/experiences" - "github.com/masa-finance/tee-types/types/linkedin/functions" - "github.com/masa-finance/tee-types/types/linkedin/industries" - "github.com/masa-finance/tee-types/types/linkedin/seniorities" + "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/api/types/linkedin/experiences" + "github.com/masa-finance/tee-worker/api/types/linkedin/functions" + "github.com/masa-finance/tee-worker/api/types/linkedin/industries" + "github.com/masa-finance/tee-worker/api/types/linkedin/seniorities" ) var _ = Describe("LinkedIn Types", func() { diff --git a/types/linkedin/profile/profile.go b/api/types/linkedin/profile/profile.go similarity index 99% rename from types/linkedin/profile/profile.go rename to api/types/linkedin/profile/profile.go index e74439e..024151d 100644 --- a/types/linkedin/profile/profile.go +++ b/api/types/linkedin/profile/profile.go @@ -3,7 +3,7 @@ package profile import ( "time" - "github.com/masa-finance/tee-types/pkg/util" + "github.com/masa-finance/tee-worker/pkg/util" ) type ScraperMode string diff --git a/types/linkedin/seniorities/seniorities.go b/api/types/linkedin/seniorities/seniorities.go similarity index 96% rename from types/linkedin/seniorities/seniorities.go rename to api/types/linkedin/seniorities/seniorities.go index 98e978c..4382c51 100644 --- a/types/linkedin/seniorities/seniorities.go +++ b/api/types/linkedin/seniorities/seniorities.go @@ -1,6 +1,6 @@ package seniorities -import "github.com/masa-finance/tee-types/pkg/util" +import "github.com/masa-finance/tee-worker/pkg/util" // id represents a LinkedIn seniority level identifier type Id string diff --git a/types/llm.go b/api/types/llm.go similarity index 100% rename from types/llm.go rename to api/types/llm.go diff --git a/types/reddit.go b/api/types/reddit.go similarity index 99% rename from types/reddit.go rename to api/types/reddit.go index 296557e..f9664fa 100644 --- a/types/reddit.go +++ b/api/types/reddit.go @@ -5,7 +5,7 @@ import ( "fmt" "time" - "github.com/masa-finance/tee-types/pkg/util" + "github.com/masa-finance/tee-worker/pkg/util" ) type RedditQueryType string diff --git a/api/types/reddit/reddit.go b/api/types/reddit/reddit.go deleted file mode 100644 index 6a5f7e7..0000000 --- a/api/types/reddit/reddit.go +++ /dev/null @@ -1,152 +0,0 @@ -package reddit - -import ( - "encoding/json" - "fmt" - "time" -) - -// TODO: These are duplicated here 
and in tee-types/types/reddit.go -type ResponseType string - -const ( - UserResponse ResponseType = "user" - PostResponse ResponseType = "post" - CommentResponse ResponseType = "comment" - CommunityResponse ResponseType = "community" -) - -// User represents the data structure for a Reddit user from the Apify scraper. -type User struct { - ID string `json:"id"` - URL string `json:"url"` - Username string `json:"username"` - UserIcon string `json:"userIcon"` - PostKarma int `json:"postKarma"` - CommentKarma int `json:"commentKarma"` - Description string `json:"description"` - Over18 bool `json:"over18"` - CreatedAt time.Time `json:"createdAt"` - ScrapedAt time.Time `json:"scrapedAt"` - DataType string `json:"dataType"` -} - -// Post represents the data structure for a Reddit post from the Apify scraper. -type Post struct { - ID string `json:"id"` - ParsedID string `json:"parsedId"` - URL string `json:"url"` - Username string `json:"username"` - Title string `json:"title"` - CommunityName string `json:"communityName"` - ParsedCommunityName string `json:"parsedCommunityName"` - Body string `json:"body"` - HTML *string `json:"html"` - NumberOfComments int `json:"numberOfComments"` - UpVotes int `json:"upVotes"` - IsVideo bool `json:"isVideo"` - IsAd bool `json:"isAd"` - Over18 bool `json:"over18"` - CreatedAt time.Time `json:"createdAt"` - ScrapedAt time.Time `json:"scrapedAt"` - DataType string `json:"dataType"` -} - -// Comment represents the data structure for a Reddit comment from the Apify scraper. -type Comment struct { - ID string `json:"id"` - ParsedID string `json:"parsedId"` - URL string `json:"url"` - ParentID string `json:"parentId"` - Username string `json:"username"` - Category string `json:"category"` - CommunityName string `json:"communityName"` - Body string `json:"body"` - CreatedAt time.Time `json:"createdAt"` - ScrapedAt time.Time `json:"scrapedAt"` - UpVotes int `json:"upVotes"` - NumberOfReplies int `json:"numberOfreplies"` - HTML string `json:"html"` - DataType string `json:"dataType"` -} - -// Community represents the data structure for a Reddit community from the Apify scraper. 
-type Community struct { - ID string `json:"id"` - Name string `json:"name"` - Title string `json:"title"` - HeaderImage string `json:"headerImage"` - Description string `json:"description"` - Over18 bool `json:"over18"` - CreatedAt time.Time `json:"createdAt"` - ScrapedAt time.Time `json:"scrapedAt"` - NumberOfMembers int `json:"numberOfMembers"` - URL string `json:"url"` - DataType string `json:"dataType"` -} - -type TypeSwitch struct { - Type ResponseType `json:"dataType"` -} - -type Response struct { - TypeSwitch *TypeSwitch - User *User - Post *Post - Comment *Comment - Community *Community -} - -func (t *Response) UnmarshalJSON(data []byte) error { - t.TypeSwitch = &TypeSwitch{} - if err := json.Unmarshal(data, &t.TypeSwitch); err != nil { - return fmt.Errorf("failed to unmarshal reddit response type: %w", err) - } - - switch t.TypeSwitch.Type { - case UserResponse: - t.User = &User{} - if err := json.Unmarshal(data, t.User); err != nil { - return fmt.Errorf("failed to unmarshal reddit user: %w", err) - } - case PostResponse: - t.Post = &Post{} - if err := json.Unmarshal(data, t.Post); err != nil { - return fmt.Errorf("failed to unmarshal reddit post: %w", err) - } - case CommentResponse: - t.Comment = &Comment{} - if err := json.Unmarshal(data, t.Comment); err != nil { - return fmt.Errorf("failed to unmarshal reddit comment: %w", err) - } - case CommunityResponse: - t.Community = &Community{} - if err := json.Unmarshal(data, t.Community); err != nil { - return fmt.Errorf("failed to unmarshal reddit community: %w", err) - } - default: - return fmt.Errorf("unknown Reddit response type during unmarshal: %s", t.TypeSwitch.Type) - } - return nil -} - -// MarshalJSON implements the json.Marshaler interface for Response. -// It unwraps the inner struct (User, Post, Comment, or Community) and marshals it directly. -func (t *Response) MarshalJSON() ([]byte, error) { - if t.TypeSwitch == nil { - return []byte("null"), nil - } - - switch t.TypeSwitch.Type { - case UserResponse: - return json.Marshal(t.User) - case PostResponse: - return json.Marshal(t.Post) - case CommentResponse: - return json.Marshal(t.Comment) - case CommunityResponse: - return json.Marshal(t.Community) - default: - return nil, fmt.Errorf("unknown Reddit response type during marshal: %s", t.TypeSwitch.Type) - } -} diff --git a/api/types/reddit/reddit_suite_test.go b/api/types/reddit/reddit_suite_test.go deleted file mode 100644 index 22d7fd6..0000000 --- a/api/types/reddit/reddit_suite_test.go +++ /dev/null @@ -1,13 +0,0 @@ -package reddit_test - -import ( - "testing" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -func TestReddit(t *testing.T) { - RegisterFailHandler(Fail) - RunSpecs(t, "Reddit Suite") -} \ No newline at end of file diff --git a/api/types/reddit/reddit_test.go b/api/types/reddit/reddit_test.go deleted file mode 100644 index d12319e..0000000 --- a/api/types/reddit/reddit_test.go +++ /dev/null @@ -1,150 +0,0 @@ -package reddit_test - -import ( - "encoding/json" - "time" - - . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" - - "github.com/masa-finance/tee-worker/api/types/reddit" -) - -var _ = Describe("Response", func() { - Context("Unmarshalling JSON", func() { - It("should correctly unmarshal a UserResponse", func() { - jsonData := `{"dataType": "user", "id": "u1", "username": "testuser"}` - var resp reddit.Response - err := json.Unmarshal([]byte(jsonData), &resp) - Expect(err).NotTo(HaveOccurred()) - Expect(resp.User).NotTo(BeNil()) - Expect(resp.User.ID).To(Equal("u1")) - Expect(resp.User.Username).To(Equal("testuser")) - Expect(resp.Post).To(BeNil()) - Expect(resp.Comment).To(BeNil()) - Expect(resp.Community).To(BeNil()) - }) - - It("should correctly unmarshal a PostResponse", func() { - jsonData := `{"dataType": "post", "id": "p1", "title": "Test Post"}` - var resp reddit.Response - err := json.Unmarshal([]byte(jsonData), &resp) - Expect(err).NotTo(HaveOccurred()) - Expect(resp.Post).NotTo(BeNil()) - Expect(resp.Post.ID).To(Equal("p1")) - Expect(resp.Post.Title).To(Equal("Test Post")) - Expect(resp.User).To(BeNil()) - Expect(resp.Comment).To(BeNil()) - Expect(resp.Community).To(BeNil()) - }) - - It("should correctly unmarshal a CommentResponse", func() { - jsonData := `{"dataType": "comment", "id": "c1", "body": "Test Comment"}` - var resp reddit.Response - err := json.Unmarshal([]byte(jsonData), &resp) - Expect(err).NotTo(HaveOccurred()) - Expect(resp.Comment).NotTo(BeNil()) - Expect(resp.Comment.ID).To(Equal("c1")) - Expect(resp.Comment.Body).To(Equal("Test Comment")) - Expect(resp.User).To(BeNil()) - Expect(resp.Post).To(BeNil()) - Expect(resp.Community).To(BeNil()) - }) - - It("should correctly unmarshal a CommunityResponse", func() { - jsonData := `{"dataType": "community", "id": "co1", "name": "Test Community"}` - var resp reddit.Response - err := json.Unmarshal([]byte(jsonData), &resp) - Expect(err).NotTo(HaveOccurred()) - Expect(resp.Community).NotTo(BeNil()) - Expect(resp.Community.ID).To(Equal("co1")) - Expect(resp.Community.Name).To(Equal("Test Community")) - Expect(resp.User).To(BeNil()) - Expect(resp.Post).To(BeNil()) - Expect(resp.Comment).To(BeNil()) - }) - - It("should return an error for an unknown type", func() { - jsonData := `{"dataType": "unknown", "id": "u1"}` - var resp reddit.Response - err := json.Unmarshal([]byte(jsonData), &resp) - Expect(err).To(HaveOccurred()) - Expect(err.Error()).To(ContainSubstring("unknown Reddit response type during unmarshal: unknown")) - }) - - It("should return an error for invalid JSON", func() { - jsonData := `{"type": "user", "id": "u1"` - var resp reddit.Response - err := json.Unmarshal([]byte(jsonData), &resp) - Expect(err).To(HaveOccurred()) - }) - }) - - Context("Marshalling JSON", func() { - It("should correctly marshal a UserResponse", func() { - resp := reddit.Response{ - TypeSwitch: &reddit.TypeSwitch{Type: reddit.UserResponse}, - User: &reddit.User{ID: "u1", Username: "testuser", DataType: "user"}, - } - jsonData, err := json.Marshal(&resp) - Expect(err).NotTo(HaveOccurred()) - expectedJSON := `{"id":"u1","url":"","username":"testuser","userIcon":"","postKarma":0,"commentKarma":0,"description":"","over18":false,"createdAt":"0001-01-01T00:00:00Z","scrapedAt":"0001-01-01T00:00:00Z","dataType":"user"}` - Expect(jsonData).To(MatchJSON(expectedJSON)) - }) - - It("should correctly marshal a PostResponse", func() { - resp := reddit.Response{ - TypeSwitch: &reddit.TypeSwitch{Type: reddit.PostResponse}, - Post: &reddit.Post{ID: "p1", Title: "Test Post", DataType: "post"}, - } - jsonData, err := json.Marshal(&resp) - 
Expect(err).NotTo(HaveOccurred()) - expectedJSON := `{"id":"p1","parsedId":"","url":"","username":"","title":"Test Post","communityName":"","parsedCommunityName":"","body":"","html":null,"numberOfComments":0,"upVotes":0,"isVideo":false,"isAd":false,"over18":false,"createdAt":"0001-01-01T00:00:00Z","scrapedAt":"0001-01-01T00:00:00Z","dataType":"post"}` - Expect(jsonData).To(MatchJSON(expectedJSON)) - }) - - It("should correctly marshal a CommentResponse", func() { - now := time.Now().UTC() - resp := reddit.Response{ - TypeSwitch: &reddit.TypeSwitch{Type: reddit.CommentResponse}, - Comment: &reddit.Comment{ID: "c1", Body: "Test Comment", CreatedAt: now, ScrapedAt: now, DataType: "comment"}, - } - jsonData, err := json.Marshal(&resp) - Expect(err).NotTo(HaveOccurred()) - - expectedComment := &reddit.Comment{ID: "c1", Body: "Test Comment", CreatedAt: now, ScrapedAt: now, DataType: "comment"} - expectedJSON, _ := json.Marshal(expectedComment) - Expect(jsonData).To(MatchJSON(expectedJSON)) - }) - - It("should correctly marshal a CommunityResponse", func() { - now := time.Now().UTC() - resp := reddit.Response{ - TypeSwitch: &reddit.TypeSwitch{Type: reddit.CommunityResponse}, - Community: &reddit.Community{ID: "co1", Name: "Test Community", CreatedAt: now, ScrapedAt: now, DataType: "community"}, - } - jsonData, err := json.Marshal(&resp) - Expect(err).NotTo(HaveOccurred()) - - expectedCommunity := &reddit.Community{ID: "co1", Name: "Test Community", CreatedAt: now, ScrapedAt: now, DataType: "community"} - expectedJSON, _ := json.Marshal(expectedCommunity) - Expect(jsonData).To(MatchJSON(expectedJSON)) - }) - - It("should return an error for an unknown type", func() { - resp := reddit.Response{ - TypeSwitch: &reddit.TypeSwitch{Type: "unknown"}, - } - _, err := json.Marshal(&resp) - Expect(err).To(HaveOccurred()) - Expect(err.Error()).To(ContainSubstring("unknown Reddit response type during marshal: unknown")) - }) - - It("should marshal to null if TypeSwitch is nil", func() { - resp := reddit.Response{} - jsonData, err := json.Marshal(&resp) - Expect(err).NotTo(HaveOccurred()) - Expect(string(jsonData)).To(Equal("null")) - }) - }) -}) diff --git a/types/reddit_test.go b/api/types/reddit_test.go similarity index 98% rename from types/reddit_test.go rename to api/types/reddit_test.go index c6cf7a3..c7adc53 100644 --- a/types/reddit_test.go +++ b/api/types/reddit_test.go @@ -7,7 +7,7 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-types/types" + "github.com/masa-finance/tee-worker/api/types" ) var _ = Describe("RedditResponse", func() { diff --git a/types/tiktok.go b/api/types/tiktok.go similarity index 100% rename from types/tiktok.go rename to api/types/tiktok.go diff --git a/types/twitter.go b/api/types/twitter.go similarity index 100% rename from types/twitter.go rename to api/types/twitter.go diff --git a/api/types/types.go b/api/types/types.go new file mode 100644 index 0000000..d6b47ab --- /dev/null +++ b/api/types/types.go @@ -0,0 +1,7 @@ +package types + +import ( + linkedin "github.com/masa-finance/tee-worker/api/types/linkedin" +) + +var LinkedIn = linkedin.LinkedIn diff --git a/types/types_suite_test.go b/api/types/types_suite_test.go similarity index 100% rename from types/types_suite_test.go rename to api/types/types_suite_test.go diff --git a/types/web.go b/api/types/web.go similarity index 100% rename from types/web.go rename to api/types/web.go diff --git a/go.sum b/go.sum index 410d9d8..573a0d6 100644 --- a/go.sum +++ b/go.sum @@ -44,8 +44,6 @@ github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0 github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= -github.com/masa-finance/tee-types v1.2.0 h1:RqyDMlDY0XCXAw6XQWZ+4R4az4AC+C5r30cmhtHfhf0= -github.com/masa-finance/tee-types v1.2.0/go.mod h1:sB98t0axFlPi2d0zUPFZSQ84mPGwbr9eRY5yLLE3fSc= github.com/masa-finance/twitter-scraper v1.0.2 h1:him+wvYZHg/7EDdy73z1ceUywDJDRAhPLD2CSEa2Vfk= github.com/masa-finance/twitter-scraper v1.0.2/go.mod h1:38MY3g/h4V7Xl4HbW9lnkL8S3YiFZenBFv86hN57RG8= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 1bb8146..021539a 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -10,7 +10,7 @@ import ( . "github.com/onsi/gomega" "github.com/sirupsen/logrus" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" . 
"github.com/masa-finance/tee-worker/internal/api" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/pkg/client" diff --git a/internal/apify/actors.go b/internal/apify/actors.go index 5a188f1..43da8cb 100644 --- a/internal/apify/actors.go +++ b/internal/apify/actors.go @@ -1,6 +1,6 @@ package apify -import teetypes "github.com/masa-finance/tee-types/types" +import teetypes "github.com/masa-finance/tee-worker/api/types" type ActorId string diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index 6ccfd4a..4ae7003 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -6,8 +6,8 @@ import ( "maps" - util "github.com/masa-finance/tee-types/pkg/util" - teetypes "github.com/masa-finance/tee-types/types" + util "github.com/masa-finance/tee-worker/pkg/util" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/twitter" diff --git a/internal/capabilities/detector_test.go b/internal/capabilities/detector_test.go index 7c263ad..c4fd55c 100644 --- a/internal/capabilities/detector_test.go +++ b/internal/capabilities/detector_test.go @@ -7,7 +7,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" . "github.com/masa-finance/tee-worker/internal/capabilities" "github.com/masa-finance/tee-worker/internal/config" ) diff --git a/internal/config/config.go b/internal/config/config.go index 9bd4d23..4aec428 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,7 +11,7 @@ import ( "time" "github.com/joho/godotenv" - teeargs "github.com/masa-finance/tee-types/args" + teeargs "github.com/masa-finance/tee-worker/api/args" "github.com/sirupsen/logrus" ) diff --git a/internal/jobs/linkedin.go b/internal/jobs/linkedin.go index 933b1d2..05a3a34 100644 --- a/internal/jobs/linkedin.go +++ b/internal/jobs/linkedin.go @@ -13,10 +13,10 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-types/args" - profileArgs "github.com/masa-finance/tee-types/args/linkedin/profile" - teetypes "github.com/masa-finance/tee-types/types" - profileTypes "github.com/masa-finance/tee-types/types/linkedin/profile" + teeargs "github.com/masa-finance/tee-worker/api/args" + profileArgs "github.com/masa-finance/tee-worker/api/args/linkedin/profile" + teetypes "github.com/masa-finance/tee-worker/api/types" + profileTypes "github.com/masa-finance/tee-worker/api/types/linkedin/profile" ) // LinkedInApifyClient defines the interface for the LinkedIn Apify client to allow mocking in tests diff --git a/internal/jobs/linkedin_test.go b/internal/jobs/linkedin_test.go index dc5763e..3f50c8f 100644 --- a/internal/jobs/linkedin_test.go +++ b/internal/jobs/linkedin_test.go @@ -17,9 +17,9 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - profileArgs "github.com/masa-finance/tee-types/args/linkedin/profile" - teetypes "github.com/masa-finance/tee-types/types" - profileTypes "github.com/masa-finance/tee-types/types/linkedin/profile" + profileArgs "github.com/masa-finance/tee-worker/api/args/linkedin/profile" + teetypes "github.com/masa-finance/tee-worker/api/types" + profileTypes 
"github.com/masa-finance/tee-worker/api/types/linkedin/profile" ) // MockLinkedInApifyClient is a mock implementation of the LinkedInApifyClient. diff --git a/internal/jobs/linkedinapify/client.go b/internal/jobs/linkedinapify/client.go index 0124a37..adb55d3 100644 --- a/internal/jobs/linkedinapify/client.go +++ b/internal/jobs/linkedinapify/client.go @@ -4,8 +4,8 @@ import ( "encoding/json" "fmt" - profileArgs "github.com/masa-finance/tee-types/args/linkedin/profile" - profileTypes "github.com/masa-finance/tee-types/types/linkedin/profile" + profileArgs "github.com/masa-finance/tee-worker/api/args/linkedin/profile" + profileTypes "github.com/masa-finance/tee-worker/api/types/linkedin/profile" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" diff --git a/internal/jobs/linkedinapify/client_test.go b/internal/jobs/linkedinapify/client_test.go index b17b73f..5910204 100644 --- a/internal/jobs/linkedinapify/client_test.go +++ b/internal/jobs/linkedinapify/client_test.go @@ -13,9 +13,9 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - profileArgs "github.com/masa-finance/tee-types/args/linkedin/profile" - "github.com/masa-finance/tee-types/types" - "github.com/masa-finance/tee-types/types/linkedin/profile" + profileArgs "github.com/masa-finance/tee-worker/api/args/linkedin/profile" + "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/api/types/linkedin/profile" ) // MockApifyClient is a mock implementation of the ApifyClient. diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go index 48cb9f9..37b0448 100644 --- a/internal/jobs/llmapify/client.go +++ b/internal/jobs/llmapify/client.go @@ -5,8 +5,8 @@ import ( "errors" "fmt" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index 417e88c..9479cd7 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -15,8 +15,8 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/llmapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" ) // MockApifyClient is a mock implementation of the ApifyClient. 
diff --git a/internal/jobs/reddit.go b/internal/jobs/reddit.go index f0bcb51..937f069 100644 --- a/internal/jobs/reddit.go +++ b/internal/jobs/reddit.go @@ -10,14 +10,14 @@ import ( "github.com/sirupsen/logrus" "github.com/masa-finance/tee-worker/api/types" - "github.com/masa-finance/tee-worker/api/types/reddit" + "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/redditapify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" ) // RedditApifyClient defines the interface for the Reddit Apify client. diff --git a/internal/jobs/reddit_test.go b/internal/jobs/reddit_test.go index 36ab11e..ebc1598 100644 --- a/internal/jobs/reddit_test.go +++ b/internal/jobs/reddit_test.go @@ -10,14 +10,14 @@ import ( "github.com/sirupsen/logrus" "github.com/masa-finance/tee-worker/api/types" - "github.com/masa-finance/tee-worker/api/types/reddit" + "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs" "github.com/masa-finance/tee-worker/internal/jobs/redditapify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" ) // MockRedditApifyClient is a mock implementation of the RedditApifyClient. diff --git a/internal/jobs/redditapify/client.go b/internal/jobs/redditapify/client.go index e90e7e7..008795a 100644 --- a/internal/jobs/redditapify/client.go +++ b/internal/jobs/redditapify/client.go @@ -7,13 +7,13 @@ import ( "github.com/sirupsen/logrus" - "github.com/masa-finance/tee-worker/api/types/reddit" + "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" ) // CommonArgs holds the parameters that all Reddit searches support, in a single struct diff --git a/internal/jobs/redditapify/client_test.go b/internal/jobs/redditapify/client_test.go index 1712157..c4bea35 100644 --- a/internal/jobs/redditapify/client_test.go +++ b/internal/jobs/redditapify/client_test.go @@ -12,8 +12,8 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/redditapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" ) // MockApifyClient is a mock implementation of the ApifyClient. 
diff --git a/internal/jobs/stats/stats.go b/internal/jobs/stats/stats.go index dd449de..8aaeab8 100644 --- a/internal/jobs/stats/stats.go +++ b/internal/jobs/stats/stats.go @@ -5,7 +5,7 @@ import ( "sync" "time" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/capabilities" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/versioning" diff --git a/internal/jobs/telemetry.go b/internal/jobs/telemetry.go index 837e188..aae760b 100644 --- a/internal/jobs/telemetry.go +++ b/internal/jobs/telemetry.go @@ -1,7 +1,7 @@ package jobs import ( - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" diff --git a/internal/jobs/telemetry_test.go b/internal/jobs/telemetry_test.go index 7c2b473..9d877c0 100644 --- a/internal/jobs/telemetry_test.go +++ b/internal/jobs/telemetry_test.go @@ -8,7 +8,7 @@ import ( . "github.com/onsi/gomega" "github.com/sirupsen/logrus" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" . "github.com/masa-finance/tee-worker/internal/jobs" diff --git a/internal/jobs/tiktok.go b/internal/jobs/tiktok.go index f1e327e..fad73bb 100644 --- a/internal/jobs/tiktok.go +++ b/internal/jobs/tiktok.go @@ -10,8 +10,8 @@ import ( "strings" "time" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" diff --git a/internal/jobs/tiktok_test.go b/internal/jobs/tiktok_test.go index cb8973c..8ec3780 100644 --- a/internal/jobs/tiktok_test.go +++ b/internal/jobs/tiktok_test.go @@ -10,7 +10,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" . 
"github.com/masa-finance/tee-worker/internal/jobs" diff --git a/internal/jobs/tiktokapify/client.go b/internal/jobs/tiktokapify/client.go index 6ad22f2..9fda59d 100644 --- a/internal/jobs/tiktokapify/client.go +++ b/internal/jobs/tiktokapify/client.go @@ -4,8 +4,8 @@ import ( "encoding/json" "fmt" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/pkg/client" ) diff --git a/internal/jobs/twitter.go b/internal/jobs/twitter.go index a7cecb2..793a567 100644 --- a/internal/jobs/twitter.go +++ b/internal/jobs/twitter.go @@ -8,8 +8,8 @@ import ( "strings" "time" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/jobs/twitterx" "github.com/masa-finance/tee-worker/pkg/client" diff --git a/internal/jobs/twitter_test.go b/internal/jobs/twitter_test.go index 0858ff4..04022a1 100644 --- a/internal/jobs/twitter_test.go +++ b/internal/jobs/twitter_test.go @@ -7,7 +7,7 @@ import ( "strings" "time" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" diff --git a/internal/jobs/twitterapify/client.go b/internal/jobs/twitterapify/client.go index cbdddca..7346639 100644 --- a/internal/jobs/twitterapify/client.go +++ b/internal/jobs/twitterapify/client.go @@ -4,8 +4,8 @@ import ( "encoding/json" "fmt" - util "github.com/masa-finance/tee-types/pkg/util" - teetypes "github.com/masa-finance/tee-types/types" + util "github.com/masa-finance/tee-worker/pkg/util" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/pkg/client" "github.com/sirupsen/logrus" diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 75d0d0d..2988d63 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -14,9 +14,9 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-types/args" - "github.com/masa-finance/tee-types/pkg/util" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/pkg/util" + teetypes "github.com/masa-finance/tee-worker/api/types" ) // WebApifyClient defines the interface for the Web Apify client to allow mocking in tests diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index c11de05..ec23026 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -16,8 +16,8 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" ) // MockWebApifyClient is a mock implementation of the WebApifyClient. 
diff --git a/internal/jobs/webapify/client.go b/internal/jobs/webapify/client.go index e59d550..7ed29cf 100644 --- a/internal/jobs/webapify/client.go +++ b/internal/jobs/webapify/client.go @@ -4,8 +4,8 @@ import ( "encoding/json" "fmt" - teeargs "github.com/masa-finance/tee-types/args" - teetypes "github.com/masa-finance/tee-types/types" + teeargs "github.com/masa-finance/tee-worker/api/args" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" diff --git a/internal/jobs/webapify/client_test.go b/internal/jobs/webapify/client_test.go index a23867d..aaefcb6 100644 --- a/internal/jobs/webapify/client_test.go +++ b/internal/jobs/webapify/client_test.go @@ -12,7 +12,7 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-types/args" + teeargs "github.com/masa-finance/tee-worker/api/args" ) // MockApifyClient is a mock implementation of the ApifyClient. diff --git a/internal/jobserver/jobserver.go b/internal/jobserver/jobserver.go index 1ace244..e9af738 100644 --- a/internal/jobserver/jobserver.go +++ b/internal/jobserver/jobserver.go @@ -11,7 +11,7 @@ import ( "golang.org/x/exp/maps" "github.com/google/uuid" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs" diff --git a/internal/jobserver/jobserver_test.go b/internal/jobserver/jobserver_test.go index b2e64a7..b908a37 100644 --- a/internal/jobserver/jobserver_test.go +++ b/internal/jobserver/jobserver_test.go @@ -8,7 +8,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" diff --git a/internal/jobserver/worker.go b/internal/jobserver/worker.go index 0c77382..939b199 100644 --- a/internal/jobserver/worker.go +++ b/internal/jobserver/worker.go @@ -4,7 +4,7 @@ import ( "context" "fmt" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/sirupsen/logrus" ) diff --git a/pkg/client/http.go b/pkg/client/http.go index 7d05b9e..7bb4f81 100644 --- a/pkg/client/http.go +++ b/pkg/client/http.go @@ -8,7 +8,7 @@ import ( "net/http" "time" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" ) diff --git a/pkg/client/http_test.go b/pkg/client/http_test.go index 9eaeee5..0a015e8 100644 --- a/pkg/client/http_test.go +++ b/pkg/client/http_test.go @@ -5,7 +5,7 @@ import ( "net/http" "net/http/httptest" - teetypes "github.com/masa-finance/tee-types/types" + teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" . "github.com/masa-finance/tee-worker/pkg/client" . "github.com/onsi/ginkgo/v2" diff --git a/pkg/util/math_test.go b/pkg/util/math_test.go index 99c82c9..3213360 100644 --- a/pkg/util/math_test.go +++ b/pkg/util/math_test.go @@ -4,7 +4,7 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-types/pkg/util" + "github.com/masa-finance/tee-worker/pkg/util" ) var _ = Describe("Math functions", func() { diff --git a/pkg/util/set_test.go b/pkg/util/set_test.go index b03fc42..a096a45 100644 --- a/pkg/util/set_test.go +++ b/pkg/util/set_test.go @@ -6,7 +6,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/masa-finance/tee-types/pkg/util" + "github.com/masa-finance/tee-worker/pkg/util" ) var _ = Describe("Set", func() { diff --git a/types/types.go b/types/types.go deleted file mode 100644 index 64b43d0..0000000 --- a/types/types.go +++ /dev/null @@ -1,7 +0,0 @@ -package types - -import ( - linkedin "github.com/masa-finance/tee-types/types/linkedin" -) - -var LinkedIn = linkedin.LinkedIn From b5ee3e9c8324954bf59832cb2ba87122d8d05aaa Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 13 Oct 2025 20:56:01 +0200 Subject: [PATCH 109/136] Consolidate job files and isolate tee functionality - Merged job.go and jobs.go into a single jobs.go file - Created api/tee package to isolate tee-specific functionality - Moved GenerateJobSignature, SealJobResult, DecryptJob to api/tee package - Moved EncryptedRequest and related types to api/tee package - Updated routes.go to use the new api/tee package - Types and args packages now build without ego dependency - Clients can import types/args without needing ego library --- api/tee/encrypted.go | 35 +++++++++++++++++++++++ api/tee/job.go | 63 ++++++++++++++++++++++++++++++++++++++++++ api/types/jobs.go | 43 ++++++++++++++++++++++++++-- internal/api/routes.go | 5 ++-- 4 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 api/tee/encrypted.go create mode 100644 api/tee/job.go diff --git a/api/tee/encrypted.go b/api/tee/encrypted.go new file mode 100644 index 0000000..aab4ecb --- /dev/null +++ b/api/tee/encrypted.go @@ -0,0 +1,35 @@ +package tee + +import ( + "encoding/json" + "fmt" + + "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/pkg/tee" +) + +// EncryptedRequest represents an encrypted request/response pair +type EncryptedRequest struct { + EncryptedResult string `json:"encrypted_result"` + EncryptedRequest string `json:"encrypted_request"` +} + +// Unseal decrypts the encrypted request and result +func (payload EncryptedRequest) Unseal() (string, error) { + jobRequest, err := tee.Unseal(payload.EncryptedRequest) + if err != nil { + return "", fmt.Errorf("error while unsealing the encrypted request: %w", err) + } + + job := types.Job{} + if err := json.Unmarshal(jobRequest, &job); err != nil { + return "", fmt.Errorf("error while unmarshalling the job request: %w", err) + } + + dat, err := tee.UnsealWithKey(job.Nonce, payload.EncryptedResult) + if err != nil { + return "", fmt.Errorf("error while unsealing the job result: %w", err) + } + + return string(dat), nil +} diff --git a/api/tee/job.go b/api/tee/job.go new file mode 100644 index 0000000..efe2fa1 --- /dev/null +++ b/api/tee/job.go @@ -0,0 +1,63 @@ +package tee + +import ( + "crypto/sha256" + "encoding/json" + "fmt" + "time" + + "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/pkg/tee" + "golang.org/x/exp/rand" +) + +var letterRunes = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+") + +func randStringRunes(n int) string { + b := make([]rune, n) + for i := range b { + // TODO: Move xcrypt from indexer to tee-types, and use RandomString here (although we'll need a different 
alpahbet) + b[i] = letterRunes[rand.Intn(len(letterRunes))] + } + return string(b) +} + +// GenerateJobSignature generates a signature for the job. +func GenerateJobSignature(job *types.Job) (string, error) { + dat, err := json.Marshal(job) + if err != nil { + return "", err + } + + checksum := sha256.New() + checksum.Write(dat) + + job.Nonce = fmt.Sprintf("%s-%s", string(checksum.Sum(nil)), randStringRunes(99)) + + dat, err = json.Marshal(job) + if err != nil { + return "", err + } + + return tee.Seal(dat) +} + +// SealJobResult seals a job result with the job's nonce. +func SealJobResult(jr *types.JobResult) (string, error) { + return tee.SealWithKey(jr.Job.Nonce, jr.Data) +} + +// DecryptJob decrypts the job request. +func DecryptJob(jobRequest *types.JobRequest) (*types.Job, error) { + dat, err := tee.Unseal(jobRequest.EncryptedJob) + if err != nil { + return nil, err + } + + job := types.Job{} + if err := json.Unmarshal(dat, &job); err != nil { + return nil, err + } + + return &job, nil +} diff --git a/api/types/jobs.go b/api/types/jobs.go index c35faae..d9a49b0 100644 --- a/api/types/jobs.go +++ b/api/types/jobs.go @@ -13,12 +13,12 @@ type JobType string type JobArguments map[string]interface{} -func (j JobArguments) Unmarshal(i interface{}) error { - d, err := json.Marshal(j) +func (ja JobArguments) Unmarshal(i interface{}) error { + dat, err := json.Marshal(ja) if err != nil { return err } - return json.Unmarshal(d, i) + return json.Unmarshal(dat, i) } type Job struct { @@ -31,6 +31,10 @@ type Job struct { Timeout time.Duration `json:"timeout"` } +func (j Job) String() string { + return fmt.Sprintf("UUID: %s Type: %s Arguments: %s", j.UUID, j.Type, j.Arguments) +} + type Capability string type WorkerCapabilities map[JobType][]Capability @@ -193,3 +197,36 @@ var JobDefaultCapabilityMap = map[JobType]Capability{ TelemetryJob: CapTelemetry, LinkedInJob: CapSearchByProfile, } + +// JobResponse represents a response to a job submission +type JobResponse struct { + UID string `json:"uid"` +} + +// JobResult represents the result of a job execution +type JobResult struct { + Error string `json:"error"` + Data []byte `json:"data"` + Job Job `json:"job"` + NextCursor string `json:"next_cursor"` +} + +// Success returns true if the job was successful. +func (jr JobResult) Success() bool { + return jr.Error == "" +} + +// Unmarshal unmarshals the job result data. 
+func (jr JobResult) Unmarshal(i interface{}) error {
+	return json.Unmarshal(jr.Data, i)
+}
+
+// JobRequest represents a request to execute a job
+type JobRequest struct {
+	EncryptedJob string `json:"encrypted_job"`
+}
+
+// JobError represents an error in job execution
+type JobError struct {
+	Error string `json:"error"`
+}
diff --git a/internal/api/routes.go b/internal/api/routes.go
index 7c631b5..eeed93a 100644
--- a/internal/api/routes.go
+++ b/internal/api/routes.go
@@ -5,6 +5,7 @@ import (
 	"net/http"
 
 	"github.com/labstack/echo/v4"
+	teejob "github.com/masa-finance/tee-worker/api/tee"
 	"github.com/masa-finance/tee-worker/api/types"
 	"github.com/masa-finance/tee-worker/internal/jobserver"
 	"github.com/masa-finance/tee-worker/pkg/tee"
@@ -21,7 +22,7 @@ func generate(c echo.Context) error {
 
 	job.WorkerID = tee.WorkerID // attach worker ID to job
 
-	encryptedSignature, err := job.GenerateJobSignature()
+	encryptedSignature, err := teejob.GenerateJobSignature(job)
 	if err != nil {
 		logrus.Errorf("Error while generating job signature: %s", err)
 		return c.JSON(http.StatusInternalServerError, types.JobError{Error: err.Error()})
@@ -46,7 +47,7 @@ func add(jobServer *jobserver.JobServer) func(c echo.Context) error {
 			return c.JSON(http.StatusBadRequest, types.JobError{Error: err.Error()})
 		}
 
-		job, err := jobRequest.DecryptJob()
+		job, err := teejob.DecryptJob(&jobRequest)
 		if err != nil {
 			logrus.Errorf("Error while decrypting job %s: %s", jobRequest, err)
 			return c.JSON(http.StatusInternalServerError, types.JobError{Error: fmt.Sprintf("Error while decrypting job: %s", err.Error())})

From 61b37a2082c8b5e9b6485ba50a39d09246afdaf5 Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Mon, 13 Oct 2025 21:10:34 +0200
Subject: [PATCH 110/136] Fix import issues from refactor

- Updated pkg/client/http.go to import api/tee for EncryptedRequest
- Fixed teetypes.Job reference to use types.Job
- Updated internal/api/routes.go to use teejob.EncryptedRequest
- Resolved compilation errors from the consolidation
---
 api/tee/job.go         | 5 ++---
 internal/api/routes.go | 2 +-
 pkg/client/http.go     | 6 +++---
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/api/tee/job.go b/api/tee/job.go
index efe2fa1..21db007 100644
--- a/api/tee/job.go
+++ b/api/tee/job.go
@@ -4,11 +4,10 @@ import (
 	"crypto/sha256"
 	"encoding/json"
 	"fmt"
-	"time"
+	"math/rand/v2"
 
 	"github.com/masa-finance/tee-worker/api/types"
 	"github.com/masa-finance/tee-worker/pkg/tee"
-	"golang.org/x/exp/rand"
 )
 
 var letterRunes = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+")
@@ -17,7 +16,7 @@ func randStringRunes(n int) string {
 	b := make([]rune, n)
 	for i := range b {
 		// TODO: Move xcrypt from indexer to tee-types, and use RandomString here (although we'll need a different alphabet)
-		b[i] = letterRunes[rand.Intn(len(letterRunes))]
+		b[i] = letterRunes[rand.IntN(len(letterRunes))]
 	}
 	return string(b)
 }
diff --git a/internal/api/routes.go b/internal/api/routes.go
index eeed93a..f8efcb9 100644
--- a/internal/api/routes.go
+++ b/internal/api/routes.go
@@ -97,7 +97,7 @@ func status(jobServer *jobserver.JobServer) func(c echo.Context) error {
 }
 
 func result(c echo.Context) error {
-	payload := types.EncryptedRequest{
+	payload := teejob.EncryptedRequest{
 		EncryptedResult:  "",
 		EncryptedRequest: "",
 	}
diff --git a/pkg/client/http.go b/pkg/client/http.go
index 7bb4f81..0809b73 100644
--- a/pkg/client/http.go
+++ b/pkg/client/http.go
@@ -8,7 +8,7 @@ import (
 	"net/http"
 	"time"
 
-	teetypes "github.com/masa-finance/tee-worker/api/types"
+	"github.com/masa-finance/tee-worker/api/tee"
 	"github.com/masa-finance/tee-worker/api/types"
 )
@@ -44,7 +44,7 @@ func NewClient(baseURL string, opts ...Option) (*Client, error) {
 
 // CreateJobSignature sends a job to the server to generate a job signature.
 // The server will attach its worker ID to the job before generating the signature.
-func (c *Client) CreateJobSignature(job teetypes.Job) (JobSignature, error) {
+func (c *Client) CreateJobSignature(job types.Job) (JobSignature, error) {
 	jobJSON, err := json.Marshal(job)
 	if err != nil {
 		return JobSignature(""), fmt.Errorf("error marshaling job: %w", err)
@@ -115,7 +115,7 @@ func (c *Client) SubmitJob(JobSignature JobSignature) (*JobResult, error) {
 
 // Decrypt sends the encrypted result to the server to decrypt it.
 func (c *Client) Decrypt(JobSignature JobSignature, encryptedResult string) (string, error) {
-	decryptReq := types.EncryptedRequest{
+	decryptReq := tee.EncryptedRequest{
 		EncryptedResult:  encryptedResult,
 		EncryptedRequest: string(JobSignature),
 	}

From a8d6bd15ec8108881e809afe61821bcbcb6fe4c Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Mon, 13 Oct 2025 21:13:54 +0200
Subject: [PATCH 111/136] Fix remaining import issues in internal packages

- Fixed duplicate imports (teetypes/teeargs aliases) across all internal files
- Updated all teetypes.* and teeargs.* references to use direct types.* and args.*
- Added missing imports to internal/apify/actors.go and internal/config/config.go
- Fixed reddit package path references
- All internal packages now build without import errors
---
 internal/api/api_test.go                 |  11 +-
 internal/apify/actors.go                 |  32 ++--
 internal/capabilities/detector.go        |  39 ++--
 internal/capabilities/detector_test.go   |  89 +++++----
 internal/config/config.go                |   7 +-
 internal/jobs/linkedin.go                |  14 +-
 internal/jobs/linkedin_test.go           |  37 ++--
 internal/jobs/llmapify/client.go         |   8 +-
 internal/jobs/llmapify/client_test.go    |  28 ++-
 internal/jobs/reddit.go                  |  30 ++-
 internal/jobs/reddit_test.go             |  21 +--
 internal/jobs/redditapify/client.go      |  45 +++--
 internal/jobs/redditapify/client_test.go |  20 +-
 internal/jobs/stats/stats.go             |   5 +-
 internal/jobs/telemetry.go               |   7 +-
 internal/jobs/telemetry_test.go          |   7 +-
 internal/jobs/tiktok.go                  |  34 ++--
 internal/jobs/tiktok_test.go             |  27 ++-
 internal/jobs/tiktokapify/client.go      |  14 +-
 internal/jobs/twitter.go                 | 224 +++++++++++------------
 internal/jobs/twitter_test.go            | 169 +++++++++--------
 internal/jobs/twitterapify/client.go     |  11 +-
 internal/jobs/web.go                     |  26 ++-
 internal/jobs/web_test.go                |  42 ++---
 internal/jobs/webapify/client.go         |   8 +-
 internal/jobs/webapify/client_test.go    |  11 +-
 internal/jobserver/jobserver.go          |  33 ++--
 internal/jobserver/jobserver_test.go     |  11 +-
 internal/jobserver/worker.go             |   3 +-
 29 files changed, 489 insertions(+), 524 deletions(-)

diff --git a/internal/api/api_test.go b/internal/api/api_test.go
index 021539a..3eeeedf 100644
--- a/internal/api/api_test.go
+++ b/internal/api/api_test.go
@@ -10,7 +10,6 @@ import (
 	. "github.com/onsi/gomega"
 	"github.com/sirupsen/logrus"
 
-	teetypes "github.com/masa-finance/tee-worker/api/types"
 	. "github.com/masa-finance/tee-worker/internal/api"
 	"github.com/masa-finance/tee-worker/internal/config"
 	"github.com/masa-finance/tee-worker/pkg/client"
@@ -44,8 +43,8 @@ var _ = Describe("API", func() {
 				return err
 			}
 
-			signature, err := c.CreateJobSignature(teetypes.Job{
-				Type:      teetypes.WebJob,
+			signature, err := c.CreateJobSignature(types.Job{
+				Type:      types.WebJob,
 				Arguments: map[string]interface{}{},
 			})
 			if err != nil {
@@ -72,8 +71,8 @@ var _ = Describe("API", func() {
 		It("should submit a job and get the correct result", func() {
 			// Step 1: Create the job request
 			// we use TikTok transcription here as it's supported by all workers without any unique config
-			job := teetypes.Job{
-				Type: teetypes.TiktokJob,
+			job := types.Job{
+				Type: types.TiktokJob,
 				Arguments: map[string]interface{}{
 					"type":      "transcription",
 					"video_url": "https://www.tiktok.com/@theblockrunner.com/video/7227579907361066282",
@@ -107,7 +106,7 @@ var _ = Describe("API", func() {
 		It("bubble up errors", func() {
 			// Step 1: Create the job request
-			job := teetypes.Job{
+			job := types.Job{
 				Type: "not-existing scraper",
 				Arguments: map[string]interface{}{
 					"url": "google",
diff --git a/internal/apify/actors.go b/internal/apify/actors.go
index 43da8cb..75631a1 100644
--- a/internal/apify/actors.go
+++ b/internal/apify/actors.go
@@ -1,6 +1,8 @@
 package apify
 
-import teetypes "github.com/masa-finance/tee-worker/api/types"
+import (
+	"github.com/masa-finance/tee-worker/api/types"
+)
 
 type ActorId string
 
@@ -29,8 +31,8 @@ type defaultActorInput map[string]any
 type ActorConfig struct {
 	ActorId      ActorId
 	DefaultInput defaultActorInput
-	Capabilities []teetypes.Capability
-	JobType      teetypes.JobType
+	Capabilities []types.Capability
+	JobType      types.JobType
 }
 
 // Actors is a list of actor configurations for Apify.
Omitting LLM for now as it's not a standalone actor / has no dedicated capabilities @@ -38,37 +40,37 @@ var Actors = []ActorConfig{ { ActorId: ActorIds.RedditScraper, DefaultInput: defaultActorInput{}, - Capabilities: teetypes.RedditCaps, - JobType: teetypes.RedditJob, + Capabilities: types.RedditCaps, + JobType: types.RedditJob, }, { ActorId: ActorIds.TikTokSearchScraper, DefaultInput: defaultActorInput{"proxy": map[string]any{"useApifyProxy": true}}, - Capabilities: []teetypes.Capability{teetypes.CapSearchByQuery}, - JobType: teetypes.TiktokJob, + Capabilities: []types.Capability{types.CapSearchByQuery}, + JobType: types.TiktokJob, }, { ActorId: ActorIds.TikTokTrendingScraper, DefaultInput: defaultActorInput{}, - Capabilities: []teetypes.Capability{teetypes.CapSearchByTrending}, - JobType: teetypes.TiktokJob, + Capabilities: []types.Capability{types.CapSearchByTrending}, + JobType: types.TiktokJob, }, { ActorId: ActorIds.TwitterFollowers, DefaultInput: defaultActorInput{"maxFollowers": 200, "maxFollowings": 200}, - Capabilities: teetypes.TwitterApifyCaps, - JobType: teetypes.TwitterApifyJob, + Capabilities: types.TwitterApifyCaps, + JobType: types.TwitterApifyJob, }, { ActorId: ActorIds.WebScraper, DefaultInput: defaultActorInput{"startUrls": []map[string]any{{"url": "https://docs.learnbittensor.org"}}}, - Capabilities: teetypes.WebCaps, - JobType: teetypes.WebJob, + Capabilities: types.WebCaps, + JobType: types.WebJob, }, { ActorId: ActorIds.LinkedInSearchProfile, DefaultInput: defaultActorInput{}, - Capabilities: teetypes.LinkedInCaps, - JobType: teetypes.LinkedInJob, + Capabilities: types.LinkedInCaps, + JobType: types.LinkedInJob, }, } diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index 4ae7003..3d97de8 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -7,7 +7,6 @@ import ( "maps" util "github.com/masa-finance/tee-worker/pkg/util" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/twitter" @@ -17,18 +16,18 @@ import ( // JobServerInterface defines the methods we need from JobServer to avoid circular dependencies type JobServerInterface interface { - GetWorkerCapabilities() teetypes.WorkerCapabilities + GetWorkerCapabilities() types.WorkerCapabilities } // DetectCapabilities automatically detects available capabilities based on configuration // Always performs real capability detection by probing APIs and actors to ensure accurate reporting -func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface) teetypes.WorkerCapabilities { +func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface) types.WorkerCapabilities { // Always perform real capability detection to ensure accurate reporting // This guarantees miners report only capabilities they actually have access to - capabilities := make(teetypes.WorkerCapabilities) + capabilities := make(types.WorkerCapabilities) // Start with always available capabilities - maps.Copy(capabilities, teetypes.AlwaysAvailableCapabilities) + maps.Copy(capabilities, types.AlwaysAvailableCapabilities) // Check what Twitter authentication methods are available accounts := jc.GetStringSlice("twitter_accounts", nil) @@ -44,20 +43,20 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface // Add Twitter-specific capabilities based on available 
authentication if hasAccounts { - capabilities[teetypes.TwitterCredentialJob] = teetypes.TwitterCredentialCaps + capabilities[types.TwitterCredentialJob] = types.TwitterCredentialCaps } if hasApiKeys { // Start with basic API capabilities - apiCaps := make([]teetypes.Capability, len(teetypes.TwitterAPICaps)) - copy(apiCaps, teetypes.TwitterAPICaps) + apiCaps := make([]types.Capability, len(types.TwitterAPICaps)) + copy(apiCaps, types.TwitterAPICaps) // Check for elevated API keys and add searchbyfullarchive capability if hasElevatedApiKey(apiKeys) { - apiCaps = append(apiCaps, teetypes.CapSearchByFullArchive) + apiCaps = append(apiCaps, types.CapSearchByFullArchive) } - capabilities[teetypes.TwitterApiJob] = apiCaps + capabilities[types.TwitterApiJob] = apiCaps } if hasApifyKey { @@ -67,18 +66,18 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface logrus.Errorf("Failed to create Apify client for access probes: %v", err) } else { // Aggregate capabilities per job from accessible actors - jobToSet := map[teetypes.JobType]*util.Set[teetypes.Capability]{} + jobToSet := map[types.JobType]*util.Set[types.Capability]{} for _, actor := range apify.Actors { // Web requires a valid Gemini API key - if actor.JobType == teetypes.WebJob && !hasLLMKey { + if actor.JobType == types.WebJob && !hasLLMKey { logrus.Debug("Skipping Web actor due to missing Gemini key") continue } if ok, _ := c.ProbeActorAccess(actor.ActorId, actor.DefaultInput); ok { if _, exists := jobToSet[actor.JobType]; !exists { - jobToSet[actor.JobType] = util.NewSet[teetypes.Capability]() + jobToSet[actor.JobType] = util.NewSet[types.Capability]() } jobToSet[actor.JobType].Add(actor.Capabilities...) } else { @@ -97,27 +96,27 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface // Add general TwitterJob capability if any Twitter auth is available // TODO: this will get cleaned up with unique twitter capabilities if hasAccounts || hasApiKeys || hasApifyKey { - var twitterJobCaps []teetypes.Capability + var twitterJobCaps []types.Capability // Use the most comprehensive capabilities available if hasAccounts { - twitterJobCaps = teetypes.TwitterCredentialCaps + twitterJobCaps = types.TwitterCredentialCaps } else { // Use API capabilities if we only have keys - twitterJobCaps = make([]teetypes.Capability, len(teetypes.TwitterAPICaps)) - copy(twitterJobCaps, teetypes.TwitterAPICaps) + twitterJobCaps = make([]types.Capability, len(types.TwitterAPICaps)) + copy(twitterJobCaps, types.TwitterAPICaps) // Check for elevated API keys and add searchbyfullarchive capability if hasElevatedApiKey(apiKeys) { - twitterJobCaps = append(twitterJobCaps, teetypes.CapSearchByFullArchive) + twitterJobCaps = append(twitterJobCaps, types.CapSearchByFullArchive) } } // Add Apify capabilities if available if hasApifyKey { - twitterJobCaps = append(twitterJobCaps, teetypes.TwitterApifyCaps...) + twitterJobCaps = append(twitterJobCaps, types.TwitterApifyCaps...) } - capabilities[teetypes.TwitterJob] = twitterJobCaps + capabilities[types.TwitterJob] = twitterJobCaps } return capabilities diff --git a/internal/capabilities/detector_test.go b/internal/capabilities/detector_test.go index c4fd55c..376fc7d 100644 --- a/internal/capabilities/detector_test.go +++ b/internal/capabilities/detector_test.go @@ -7,23 +7,22 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - teetypes "github.com/masa-finance/tee-worker/api/types" . 
"github.com/masa-finance/tee-worker/internal/capabilities" "github.com/masa-finance/tee-worker/internal/config" ) // MockJobServer implements JobServerInterface for testing type MockJobServer struct { - capabilities teetypes.WorkerCapabilities + capabilities types.WorkerCapabilities } -func (m *MockJobServer) GetWorkerCapabilities() teetypes.WorkerCapabilities { +func (m *MockJobServer) GetWorkerCapabilities() types.WorkerCapabilities { return m.capabilities } var _ = Describe("DetectCapabilities", func() { DescribeTable("capability detection scenarios", - func(jc config.JobConfiguration, jobServer JobServerInterface, expected teetypes.WorkerCapabilities) { + func(jc config.JobConfiguration, jobServer JobServerInterface, expected types.WorkerCapabilities) { got := DetectCapabilities(jc, jobServer) // Extract job type keys and sort for consistent comparison @@ -47,24 +46,24 @@ var _ = Describe("DetectCapabilities", func() { Entry("With JobServer - performs real detection (JobServer ignored)", config.JobConfiguration{}, &MockJobServer{ - capabilities: teetypes.WorkerCapabilities{ - teetypes.WebJob: {teetypes.CapScraper}, - teetypes.TelemetryJob: {teetypes.CapTelemetry}, - teetypes.TiktokJob: {teetypes.CapTranscription}, - teetypes.TwitterJob: {teetypes.CapSearchByQuery, teetypes.CapGetById, teetypes.CapGetProfileById}, + capabilities: types.WorkerCapabilities{ + types.WebJob: {types.CapScraper}, + types.TelemetryJob: {types.CapTelemetry}, + types.TiktokJob: {types.CapTranscription}, + types.TwitterJob: {types.CapSearchByQuery, types.CapGetById, types.CapGetProfileById}, }, }, - teetypes.WorkerCapabilities{ - teetypes.TelemetryJob: {teetypes.CapTelemetry}, - teetypes.TiktokJob: {teetypes.CapTranscription}, + types.WorkerCapabilities{ + types.TelemetryJob: {types.CapTelemetry}, + types.TiktokJob: {types.CapTranscription}, }, ), Entry("Without JobServer - basic capabilities only", config.JobConfiguration{}, nil, - teetypes.WorkerCapabilities{ - teetypes.TelemetryJob: {teetypes.CapTelemetry}, - teetypes.TiktokJob: {teetypes.CapTranscription}, + types.WorkerCapabilities{ + types.TelemetryJob: {types.CapTelemetry}, + types.TiktokJob: {types.CapTranscription}, }, ), Entry("With Twitter accounts - adds credential capabilities", @@ -72,11 +71,11 @@ var _ = Describe("DetectCapabilities", func() { "twitter_accounts": []string{"account1", "account2"}, }, nil, - teetypes.WorkerCapabilities{ - teetypes.TelemetryJob: {teetypes.CapTelemetry}, - teetypes.TiktokJob: {teetypes.CapTranscription}, - teetypes.TwitterCredentialJob: teetypes.TwitterCredentialCaps, - teetypes.TwitterJob: teetypes.TwitterCredentialCaps, + types.WorkerCapabilities{ + types.TelemetryJob: {types.CapTelemetry}, + types.TiktokJob: {types.CapTranscription}, + types.TwitterCredentialJob: types.TwitterCredentialCaps, + types.TwitterJob: types.TwitterCredentialCaps, }, ), Entry("With Twitter API keys - adds API capabilities", @@ -84,11 +83,11 @@ var _ = Describe("DetectCapabilities", func() { "twitter_api_keys": []string{"key1", "key2"}, }, nil, - teetypes.WorkerCapabilities{ - teetypes.TelemetryJob: {teetypes.CapTelemetry}, - teetypes.TiktokJob: {teetypes.CapTranscription}, - teetypes.TwitterApiJob: teetypes.TwitterAPICaps, - teetypes.TwitterJob: teetypes.TwitterAPICaps, + types.WorkerCapabilities{ + types.TelemetryJob: {types.CapTelemetry}, + types.TiktokJob: {types.CapTranscription}, + types.TwitterApiJob: types.TwitterAPICaps, + types.TwitterJob: types.TwitterAPICaps, }, ), Entry("With mock elevated Twitter API keys - only basic 
capabilities detected", @@ -96,12 +95,12 @@ var _ = Describe("DetectCapabilities", func() { "twitter_api_keys": []string{"Bearer abcd1234-ELEVATED"}, }, nil, - teetypes.WorkerCapabilities{ - teetypes.TelemetryJob: {teetypes.CapTelemetry}, - teetypes.TiktokJob: {teetypes.CapTranscription}, + types.WorkerCapabilities{ + types.TelemetryJob: {types.CapTelemetry}, + types.TiktokJob: {types.CapTranscription}, // Note: Mock elevated keys will be detected as basic since we can't make real API calls in tests - teetypes.TwitterApiJob: teetypes.TwitterAPICaps, - teetypes.TwitterJob: teetypes.TwitterAPICaps, + types.TwitterApiJob: types.TwitterAPICaps, + types.TwitterJob: types.TwitterAPICaps, }, ), ) @@ -158,24 +157,24 @@ var _ = Describe("DetectCapabilities", func() { caps := DetectCapabilities(jc, nil) // TikTok should gain search capabilities with valid key - tiktokCaps, ok := caps[teetypes.TiktokJob] + tiktokCaps, ok := caps[types.TiktokJob] Expect(ok).To(BeTrue(), "expected tiktok capabilities to be present") - Expect(tiktokCaps).To(ContainElement(teetypes.CapSearchByQuery), "expected tiktok to include CapSearchByQuery capability") - Expect(tiktokCaps).To(ContainElement(teetypes.CapSearchByTrending), "expected tiktok to include CapSearchByTrending capability") + Expect(tiktokCaps).To(ContainElement(types.CapSearchByQuery), "expected tiktok to include CapSearchByQuery capability") + Expect(tiktokCaps).To(ContainElement(types.CapSearchByTrending), "expected tiktok to include CapSearchByTrending capability") // Twitter-Apify job should be present with follower/following capabilities - twitterApifyCaps, ok := caps[teetypes.TwitterApifyJob] + twitterApifyCaps, ok := caps[types.TwitterApifyJob] Expect(ok).To(BeTrue(), "expected twitter-apify capabilities to be present") - Expect(twitterApifyCaps).To(ContainElement(teetypes.CapGetFollowers), "expected twitter-apify to include CapGetFollowers capability") - Expect(twitterApifyCaps).To(ContainElement(teetypes.CapGetFollowing), "expected twitter-apify to include CapGetFollowing capability") + Expect(twitterApifyCaps).To(ContainElement(types.CapGetFollowers), "expected twitter-apify to include CapGetFollowers capability") + Expect(twitterApifyCaps).To(ContainElement(types.CapGetFollowing), "expected twitter-apify to include CapGetFollowing capability") // Reddit should be present (only if rented!) 
- redditCaps, hasReddit := caps[teetypes.RedditJob] + redditCaps, hasReddit := caps[types.RedditJob] Expect(hasReddit).To(BeTrue(), "expected reddit capabilities to be present") - Expect(redditCaps).To(ContainElement(teetypes.CapScrapeUrls), "expected reddit to include CapScrapeUrls capability") - Expect(redditCaps).To(ContainElement(teetypes.CapSearchPosts), "expected reddit to include CapSearchPosts capability") - Expect(redditCaps).To(ContainElement(teetypes.CapSearchUsers), "expected reddit to include CapSearchUsers capability") - Expect(redditCaps).To(ContainElement(teetypes.CapSearchCommunities), "expected reddit to include CapSearchCommunities capability") + Expect(redditCaps).To(ContainElement(types.CapScrapeUrls), "expected reddit to include CapScrapeUrls capability") + Expect(redditCaps).To(ContainElement(types.CapSearchPosts), "expected reddit to include CapSearchPosts capability") + Expect(redditCaps).To(ContainElement(types.CapSearchUsers), "expected reddit to include CapSearchUsers capability") + Expect(redditCaps).To(ContainElement(types.CapSearchCommunities), "expected reddit to include CapSearchCommunities capability") }) It("should add enhanced capabilities when valid Apify API key is provided alongside a Gemini API key", func() { apifyKey := os.Getenv("APIFY_API_KEY") @@ -195,15 +194,15 @@ var _ = Describe("DetectCapabilities", func() { caps := DetectCapabilities(jc, nil) // Web should be present - webCaps, hasWeb := caps[teetypes.WebJob] + webCaps, hasWeb := caps[types.WebJob] Expect(hasWeb).To(BeTrue(), "expected web capabilities to be present") - Expect(webCaps).To(ContainElement(teetypes.CapScraper), "expected web to include CapScraper capability") + Expect(webCaps).To(ContainElement(types.CapScraper), "expected web to include CapScraper capability") }) }) }) // Helper function to check if a job type exists in capabilities -func hasJobType(capabilities teetypes.WorkerCapabilities, jobName string) bool { - _, exists := capabilities[teetypes.JobType(jobName)] +func hasJobType(capabilities types.WorkerCapabilities, jobName string) bool { + _, exists := capabilities[types.JobType(jobName)] return exists } diff --git a/internal/config/config.go b/internal/config/config.go index 4aec428..1be62ab 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,8 +11,9 @@ import ( "time" "github.com/joho/godotenv" - teeargs "github.com/masa-finance/tee-worker/api/args" "github.com/sirupsen/logrus" + + "github.com/masa-finance/tee-worker/api/args" ) var ( @@ -346,9 +347,9 @@ type LlmConfig struct { // GetModelAndKey returns the first available model and API key based on which keys are valid func (lc LlmConfig) GetModelAndKey() (model string, key string, err error) { if lc.ClaudeApiKey.IsValid() { - return teeargs.LLMDefaultClaudeModel, string(lc.ClaudeApiKey), nil + return args.LLMDefaultClaudeModel, string(lc.ClaudeApiKey), nil } else if lc.GeminiApiKey.IsValid() { - return teeargs.LLMDefaultGeminiModel, string(lc.GeminiApiKey), nil + return args.LLMDefaultGeminiModel, string(lc.GeminiApiKey), nil } return "", "", errors.New("no valid llm api key found") } diff --git a/internal/jobs/linkedin.go b/internal/jobs/linkedin.go index 05a3a34..83d30ea 100644 --- a/internal/jobs/linkedin.go +++ b/internal/jobs/linkedin.go @@ -13,9 +13,7 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-worker/api/args" profileArgs 
"github.com/masa-finance/tee-worker/api/args/linkedin/profile" - teetypes "github.com/masa-finance/tee-worker/api/types" profileTypes "github.com/masa-finance/tee-worker/api/types/linkedin/profile" ) @@ -34,7 +32,7 @@ var NewLinkedInApifyClient = func(apiKey string, statsCollector *stats.StatsColl type LinkedInScraper struct { configuration config.JobConfiguration statsCollector *stats.StatsCollector - capabilities []teetypes.Capability + capabilities []types.Capability } func NewLinkedInScraper(jc config.JobConfiguration, statsCollector *stats.StatsCollector) *LinkedInScraper { @@ -42,7 +40,7 @@ func NewLinkedInScraper(jc config.JobConfiguration, statsCollector *stats.StatsC return &LinkedInScraper{ configuration: jc, statsCollector: statsCollector, - capabilities: teetypes.LinkedInCaps, + capabilities: types.LinkedInCaps, } } @@ -56,7 +54,7 @@ func (ls *LinkedInScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: msg.Error()}, msg } - jobArgs, err := teeargs.UnmarshalJobArguments(teetypes.JobType(j.Type), map[string]any(j.Arguments)) + jobArgs, err := args.UnmarshalJobArguments(types.JobType(j.Type), map[string]any(j.Arguments)) if err != nil { msg := fmt.Errorf("failed to unmarshal job arguments: %w", err) return types.JobResult{Error: msg.Error()}, msg @@ -96,12 +94,12 @@ func (ls *LinkedInScraper) ExecuteJob(j types.Job) (types.JobResult, error) { // GetStructuredCapabilities returns the structured capabilities supported by the LinkedIn scraper // based on the available credentials and API keys -func (ls *LinkedInScraper) GetStructuredCapabilities() teetypes.WorkerCapabilities { - capabilities := make(teetypes.WorkerCapabilities) +func (ls *LinkedInScraper) GetStructuredCapabilities() types.WorkerCapabilities { + capabilities := make(types.WorkerCapabilities) apifyApiKey := ls.configuration.GetString("apify_api_key", "") if apifyApiKey != "" { - capabilities[teetypes.LinkedInJob] = teetypes.LinkedInCaps + capabilities[types.LinkedInJob] = types.LinkedInCaps } return capabilities diff --git a/internal/jobs/linkedin_test.go b/internal/jobs/linkedin_test.go index 3f50c8f..634e8da 100644 --- a/internal/jobs/linkedin_test.go +++ b/internal/jobs/linkedin_test.go @@ -18,7 +18,6 @@ import ( "github.com/masa-finance/tee-worker/pkg/client" profileArgs "github.com/masa-finance/tee-worker/api/args/linkedin/profile" - teetypes "github.com/masa-finance/tee-worker/api/types" profileTypes "github.com/masa-finance/tee-worker/api/types/linkedin/profile" ) @@ -68,7 +67,7 @@ var _ = Describe("LinkedInScraper", func() { job = types.Job{ UUID: "test-uuid", - Type: teetypes.LinkedInJob, + Type: types.LinkedInJob, } }) @@ -89,7 +88,7 @@ var _ = Describe("LinkedInScraper", func() { scraper = jobs.NewLinkedInScraper(cfg, statsCollector) job.Arguments = map[string]any{ - "type": teetypes.CapSearchByProfile, + "type": types.CapSearchByProfile, "searchQuery": "software engineer", "maxItems": 10, } @@ -101,7 +100,7 @@ var _ = Describe("LinkedInScraper", func() { It("should call SearchProfiles and return data and next cursor", func() { job.Arguments = map[string]any{ - "type": teetypes.CapSearchByProfile, + "type": types.CapSearchByProfile, "searchQuery": "software engineer", "maxItems": 10, } @@ -131,7 +130,7 @@ var _ = Describe("LinkedInScraper", func() { Expect(workerID).To(Equal("test-worker")) Expect(args.Query).To(Equal("software engineer")) Expect(args.MaxItems).To(Equal(uint(10))) - Expect(args.QueryType).To(Equal(teetypes.CapSearchByProfile)) + 
Expect(args.QueryType).To(Equal(types.CapSearchByProfile)) return expectedProfiles, "dataset-123", client.Cursor("next-cursor"), nil } @@ -152,7 +151,7 @@ var _ = Describe("LinkedInScraper", func() { It("should handle errors from the LinkedIn client", func() { job.Arguments = map[string]any{ - "type": teetypes.CapSearchByProfile, + "type": types.CapSearchByProfile, "searchQuery": "software engineer", "maxItems": 10, } @@ -173,7 +172,7 @@ var _ = Describe("LinkedInScraper", func() { return nil, errors.New("client creation failed") } job.Arguments = map[string]any{ - "type": teetypes.CapSearchByProfile, + "type": types.CapSearchByProfile, "searchQuery": "software engineer", "maxItems": 10, } @@ -185,7 +184,7 @@ var _ = Describe("LinkedInScraper", func() { It("should return an error when dataset ID is missing", func() { job.Arguments = map[string]any{ - "type": teetypes.CapSearchByProfile, + "type": types.CapSearchByProfile, "searchQuery": "software engineer", "maxItems": 10, } @@ -201,7 +200,7 @@ var _ = Describe("LinkedInScraper", func() { It("should handle JSON marshalling errors", func() { job.Arguments = map[string]any{ - "type": teetypes.CapSearchByProfile, + "type": types.CapSearchByProfile, "searchQuery": "software engineer", "maxItems": 10, } @@ -225,7 +224,7 @@ var _ = Describe("LinkedInScraper", func() { It("should handle empty profile results", func() { job.Arguments = map[string]any{ - "type": teetypes.CapSearchByProfile, + "type": types.CapSearchByProfile, "searchQuery": "nonexistent", "maxItems": 10, } @@ -253,8 +252,8 @@ var _ = Describe("LinkedInScraper", func() { scraper = jobs.NewLinkedInScraper(cfg, statsCollector) capabilities := scraper.GetStructuredCapabilities() - Expect(capabilities).To(HaveKey(teetypes.LinkedInJob)) - Expect(capabilities[teetypes.LinkedInJob]).To(ContainElement(teetypes.CapSearchByProfile)) + Expect(capabilities).To(HaveKey(types.LinkedInJob)) + Expect(capabilities[types.LinkedInJob]).To(ContainElement(types.CapSearchByProfile)) }) It("should return empty capabilities when Apify API key is missing", func() { @@ -262,7 +261,7 @@ var _ = Describe("LinkedInScraper", func() { scraper = jobs.NewLinkedInScraper(cfg, statsCollector) capabilities := scraper.GetStructuredCapabilities() - Expect(capabilities).NotTo(HaveKey(teetypes.LinkedInJob)) + Expect(capabilities).NotTo(HaveKey(types.LinkedInJob)) }) It("should return empty capabilities when Apify API key is empty", func() { @@ -272,7 +271,7 @@ var _ = Describe("LinkedInScraper", func() { scraper = jobs.NewLinkedInScraper(cfg, statsCollector) capabilities := scraper.GetStructuredCapabilities() - Expect(capabilities).NotTo(HaveKey(teetypes.LinkedInJob)) + Expect(capabilities).NotTo(HaveKey(types.LinkedInJob)) }) }) @@ -301,7 +300,7 @@ var _ = Describe("LinkedInScraper", func() { integrationScraper := jobs.NewLinkedInScraper(cfg, integrationStatsCollector) jobArgs := profileArgs.Arguments{ - QueryType: teetypes.CapSearchByProfile, + QueryType: types.CapSearchByProfile, Query: "software engineer", MaxItems: 10, } @@ -315,7 +314,7 @@ var _ = Describe("LinkedInScraper", func() { job := types.Job{ UUID: "integration-test-uuid", - Type: teetypes.LinkedInJob, + Type: types.LinkedInJob, WorkerID: "test-worker", Arguments: jobArgsMap, Timeout: 60 * time.Second, @@ -347,11 +346,11 @@ var _ = Describe("LinkedInScraper", func() { caps := integrationScraper.GetStructuredCapabilities() if apifyKey != "" { - Expect(caps[teetypes.LinkedInJob]).NotTo(BeEmpty()) - 
Expect(caps[teetypes.LinkedInJob]).To(ContainElement(teetypes.CapSearchByProfile)) + Expect(caps[types.LinkedInJob]).NotTo(BeEmpty()) + Expect(caps[types.LinkedInJob]).To(ContainElement(types.CapSearchByProfile)) } else { // Expect no capabilities when key is missing - _, ok := caps[teetypes.LinkedInJob] + _, ok := caps[types.LinkedInJob] Expect(ok).To(BeFalse()) } }) diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go index 37b0448..364a507 100644 --- a/internal/jobs/llmapify/client.go +++ b/internal/jobs/llmapify/client.go @@ -5,8 +5,6 @@ import ( "errors" "fmt" - teeargs "github.com/masa-finance/tee-worker/api/args" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" @@ -54,7 +52,7 @@ func (c *ApifyClient) ValidateApiKey() error { return c.client.ValidateApiKey() } -func (c *ApifyClient) Process(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) { +func (c *ApifyClient) Process(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) { if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.LLMQueries, 1) } @@ -78,10 +76,10 @@ func (c *ApifyClient) Process(workerID string, args teeargs.LLMProcessorArgument return nil, client.EmptyCursor, err } - response := make([]*teetypes.LLMProcessorResult, 0, len(dataset.Data.Items)) + response := make([]*types.LLMProcessorResult, 0, len(dataset.Data.Items)) for i, item := range dataset.Data.Items { - var resp teetypes.LLMProcessorResult + var resp types.LLMProcessorResult if err := json.Unmarshal(item, &resp); err != nil { logrus.Warnf("Failed to unmarshal llm result at index %d: %v", i, err) continue diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index 9479cd7..9a85f8a 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -15,8 +15,6 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/llmapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-worker/api/args" - teetypes "github.com/masa-finance/tee-worker/api/types" ) // MockApifyClient is a mock implementation of the ApifyClient. 
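 // Its RunActorAndGetResponseFunc field lets each spec inspect the exact
 // actor input and fake a dataset response without calling Apify.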
@@ -68,7 +66,7 @@ var _ = Describe("LLMApifyClient", func() { Describe("Process", func() { It("should construct the correct actor input", func() { - args := teeargs.LLMProcessorArguments{ + args := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -84,15 +82,15 @@ var _ = Describe("LLMApifyClient", func() { Expect(limit).To(Equal(uint(1))) // Verify the input is correctly converted to LLMProcessorRequest - request, ok := input.(teetypes.LLMProcessorRequest) + request, ok := input.(types.LLMProcessorRequest) Expect(ok).To(BeTrue()) Expect(request.InputDatasetId).To(Equal("test-dataset-id")) Expect(request.Prompt).To(Equal("test-prompt")) Expect(request.LLMProviderApiKey).To(Equal("test-claude-llm-key")) // should be set from constructor - Expect(request.Model).To(Equal(teeargs.LLMDefaultClaudeModel)) // default model - Expect(request.MultipleColumns).To(Equal(teeargs.LLMDefaultMultipleColumns)) // default value - Expect(request.MaxTokens).To(Equal(teeargs.LLMDefaultMaxTokens)) // default value - Expect(request.Temperature).To(Equal(strconv.FormatFloat(teeargs.LLMDefaultTemperature, 'f', -1, 64))) // default value + Expect(request.Model).To(Equal(args.LLMDefaultClaudeModel)) // default model + Expect(request.MultipleColumns).To(Equal(args.LLMDefaultMultipleColumns)) // default value + Expect(request.MaxTokens).To(Equal(args.LLMDefaultMaxTokens)) // default value + Expect(request.Temperature).To(Equal(strconv.FormatFloat(args.LLMDefaultTemperature, 'f', -1, 64))) // default value return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil } @@ -107,7 +105,7 @@ var _ = Describe("LLMApifyClient", func() { return nil, "", expectedErr } - args := teeargs.LLMProcessorArguments{ + args := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -126,7 +124,7 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - args := teeargs.LLMProcessorArguments{ + args := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -148,7 +146,7 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - args := teeargs.LLMProcessorArguments{ + args := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -175,7 +173,7 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - args := teeargs.LLMProcessorArguments{ + args := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -187,7 +185,7 @@ var _ = Describe("LLMApifyClient", func() { }) It("should use custom values when provided", func() { - args := teeargs.LLMProcessorArguments{ + args := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", MaxTokens: 500, @@ -195,7 +193,7 @@ var _ = Describe("LLMApifyClient", func() { } mockClient.RunActorAndGetResponseFunc = func(actorID apify.ActorId, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { - request, ok := input.(teetypes.LLMProcessorRequest) + request, ok := input.(types.LLMProcessorRequest) Expect(ok).To(BeTrue()) Expect(request.MaxTokens).To(Equal(uint(500))) Expect(request.Temperature).To(Equal("0.5")) @@ -258,7 +256,7 @@ var _ = Describe("LLMApifyClient", func() { realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: config.LlmApiKey(geminiKey)}, nil) Expect(err).NotTo(HaveOccurred()) - args := teeargs.LLMProcessorArguments{ + args := 
args.LLMProcessorArguments{ DatasetId: "V6tyuuZIgfiETl1cl", Prompt: "summarize the content of this webpage ${markdown}", } diff --git a/internal/jobs/reddit.go b/internal/jobs/reddit.go index 937f069..7427332 100644 --- a/internal/jobs/reddit.go +++ b/internal/jobs/reddit.go @@ -16,14 +16,12 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-worker/api/args" - teetypes "github.com/masa-finance/tee-worker/api/types" ) // RedditApifyClient defines the interface for the Reddit Apify client. // This allows for mocking in tests. type RedditApifyClient interface { - ScrapeUrls(workerID string, urls []teetypes.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) + ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) SearchPosts(workerID string, queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) SearchCommunities(workerID string, queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) SearchUsers(workerID string, queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) @@ -38,7 +36,7 @@ var NewRedditApifyClient = func(apiKey string, statsCollector *stats.StatsCollec type RedditScraper struct { configuration config.RedditConfig statsCollector *stats.StatsCollector - capabilities []teetypes.Capability + capabilities []types.Capability } func NewRedditScraper(jc config.JobConfiguration, statsCollector *stats.StatsCollector) *RedditScraper { @@ -47,21 +45,21 @@ func NewRedditScraper(jc config.JobConfiguration, statsCollector *stats.StatsCol return &RedditScraper{ configuration: config, statsCollector: statsCollector, - capabilities: teetypes.RedditCaps, + capabilities: types.RedditCaps, } } func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) { logrus.WithField("job_uuid", j.UUID).Info("Starting ExecuteJob for Reddit scrape") - jobArgs, err := teeargs.UnmarshalJobArguments(teetypes.JobType(j.Type), map[string]any(j.Arguments)) + jobArgs, err := args.UnmarshalJobArguments(types.JobType(j.Type), map[string]any(j.Arguments)) if err != nil { msg := fmt.Errorf("failed to unmarshal job arguments: %w", err) return types.JobResult{Error: msg.Error()}, msg } // Type assert to Reddit arguments - redditArgs, ok := jobArgs.(*teeargs.RedditArguments) + redditArgs, ok := jobArgs.(*args.RedditArguments) if !ok { return types.JobResult{Error: "invalid argument type for Reddit job"}, errors.New("invalid argument type") } @@ -76,10 +74,10 @@ func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) { commonArgs.CopyFromArgs(redditArgs) switch redditArgs.QueryType { - case teetypes.RedditScrapeUrls: - urls := make([]teetypes.RedditStartURL, 0, len(redditArgs.URLs)) + case types.RedditScrapeUrls: + urls := make([]types.RedditStartURL, 0, len(redditArgs.URLs)) for _, u := range redditArgs.URLs { - urls = append(urls, teetypes.RedditStartURL{ + urls = append(urls, types.RedditStartURL{ URL: u, Method: "GET", }) @@ -88,15 +86,15 @@ func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) { resp, 
cursor, err := redditClient.ScrapeUrls(j.WorkerID, urls, redditArgs.After, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) return processRedditResponse(j, resp, cursor, err) - case teetypes.RedditSearchUsers: + case types.RedditSearchUsers: resp, cursor, err := redditClient.SearchUsers(j.WorkerID, redditArgs.Queries, redditArgs.SkipPosts, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) return processRedditResponse(j, resp, cursor, err) - case teetypes.RedditSearchPosts: + case types.RedditSearchPosts: resp, cursor, err := redditClient.SearchPosts(j.WorkerID, redditArgs.Queries, redditArgs.After, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) return processRedditResponse(j, resp, cursor, err) - case teetypes.RedditSearchCommunities: + case types.RedditSearchCommunities: resp, cursor, err := redditClient.SearchCommunities(j.WorkerID, redditArgs.Queries, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) return processRedditResponse(j, resp, cursor, err) @@ -123,13 +121,13 @@ func processRedditResponse(j types.Job, resp []*reddit.Response, cursor client.C // GetStructuredCapabilities returns the structured capabilities supported by this Twitter scraper // based on the available credentials and API keys -func (rs *RedditScraper) GetStructuredCapabilities() teetypes.WorkerCapabilities { - capabilities := make(teetypes.WorkerCapabilities) +func (rs *RedditScraper) GetStructuredCapabilities() types.WorkerCapabilities { + capabilities := make(types.WorkerCapabilities) // Add Apify-specific capabilities based on available API key // TODO: We should verify whether each of the actors is actually available through this API key if rs.configuration.ApifyApiKey != "" { - capabilities[teetypes.RedditJob] = teetypes.RedditCaps + capabilities[types.RedditJob] = types.RedditCaps } return capabilities diff --git a/internal/jobs/reddit_test.go b/internal/jobs/reddit_test.go index ebc1598..e326ed8 100644 --- a/internal/jobs/reddit_test.go +++ b/internal/jobs/reddit_test.go @@ -17,18 +17,17 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - teetypes "github.com/masa-finance/tee-worker/api/types" ) // MockRedditApifyClient is a mock implementation of the RedditApifyClient. 
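 // Each *Func field stubs a single client call; the specs install the mock
 // through the NewRedditApifyClient constructor hook declared in reddit.go.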
type MockRedditApifyClient struct { - ScrapeUrlsFunc func(urls []teetypes.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) + ScrapeUrlsFunc func(urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) SearchPostsFunc func(queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) SearchCommunitiesFunc func(queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) SearchUsersFunc func(queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) } -func (m *MockRedditApifyClient) ScrapeUrls(_ string, urls []teetypes.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { +func (m *MockRedditApifyClient) ScrapeUrls(_ string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { if m != nil && m.ScrapeUrlsFunc != nil { res, cursor, err := m.ScrapeUrlsFunc(urls, after, args, cursor, maxResults) for i, r := range res { @@ -83,7 +82,7 @@ var _ = Describe("RedditScraper", func() { job = types.Job{ UUID: "test-uuid", - Type: teetypes.RedditJob, + Type: types.RedditJob, } }) @@ -100,11 +99,11 @@ var _ = Describe("RedditScraper", func() { "https://www.reddit.com/r/HHGTTG/comments/1jynlrz/the_entire_series_after_restaurant_at_the_end_of/", } job.Arguments = map[string]any{ - "type": teetypes.RedditScrapeUrls, + "type": types.RedditScrapeUrls, "urls": testUrls, } - mockClient.ScrapeUrlsFunc = func(urls []teetypes.RedditStartURL, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { + mockClient.ScrapeUrlsFunc = func(urls []types.RedditStartURL, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { Expect(urls).To(HaveLen(1)) Expect(urls[0].URL).To(Equal(testUrls[0])) return []*reddit.Response{{TypeSwitch: &reddit.TypeSwitch{Type: reddit.UserResponse}, User: &reddit.User{ID: "user1", DataType: string(reddit.UserResponse)}}}, "next", nil @@ -124,7 +123,7 @@ var _ = Describe("RedditScraper", func() { It("should call SearchUsers for the correct QueryType", func() { job.Arguments = map[string]any{ - "type": teetypes.RedditSearchUsers, + "type": types.RedditSearchUsers, "queries": []string{"user-query"}, } @@ -147,7 +146,7 @@ var _ = Describe("RedditScraper", func() { It("should call SearchPosts for the correct QueryType", func() { job.Arguments = map[string]any{ - "type": teetypes.RedditSearchPosts, + "type": types.RedditSearchPosts, "queries": []string{"post-query"}, } @@ -170,7 +169,7 @@ var _ = Describe("RedditScraper", func() { It("should call SearchCommunities for the correct QueryType", func() { job.Arguments = map[string]any{ - "type": teetypes.RedditSearchCommunities, + "type": types.RedditSearchCommunities, "queries": []string{"community-query"}, } @@ -204,7 +203,7 @@ var _ = Describe("RedditScraper", func() { It("should handle errors from the reddit client", func() { job.Arguments = map[string]any{ - "type": 
teetypes.RedditSearchPosts,
+				"type":    types.RedditSearchPosts,
 				"queries": []string{"post-query"},
 			}
 
@@ -224,7 +223,7 @@
 				return nil, errors.New("client creation failed")
 			}
 			job.Arguments = map[string]any{
-				"type":    teetypes.RedditSearchPosts,
+				"type":    types.RedditSearchPosts,
 				"queries": []string{"post-query"},
 			}
 
diff --git a/internal/jobs/redditapify/client.go b/internal/jobs/redditapify/client.go
index 008795a..07dc090 100644
--- a/internal/jobs/redditapify/client.go
+++ b/internal/jobs/redditapify/client.go
@@ -7,18 +7,17 @@ import (
 
 	"github.com/sirupsen/logrus"
 
+	"github.com/masa-finance/tee-worker/api/args"
+	"github.com/masa-finance/tee-worker/api/types"
 	"github.com/masa-finance/tee-worker/internal/apify"
 	"github.com/masa-finance/tee-worker/internal/jobs/stats"
 	"github.com/masa-finance/tee-worker/pkg/client"
-
-	teeargs "github.com/masa-finance/tee-worker/api/args"
-	teetypes "github.com/masa-finance/tee-worker/api/types"
 )
 
 // CommonArgs holds the parameters that all Reddit searches support, in a single struct
 type CommonArgs struct {
-	Sort        teetypes.RedditSortType
+	Sort        types.RedditSortType
 	IncludeNSFW bool
 	MaxItems    uint
 	MaxPosts    uint
@@ -27,7 +26,7 @@
 	MaxUsers uint
 }
 
-func (ca *CommonArgs) CopyFromArgs(a *teeargs.RedditArguments) {
+func (ca *CommonArgs) CopyFromArgs(a *args.RedditArguments) {
 	ca.Sort = a.Sort
 	ca.IncludeNSFW = a.IncludeNSFW
 	ca.MaxItems = a.MaxItems
@@ -52,23 +51,23 @@ func (args *CommonArgs) ToActorRequest() RedditActorRequest {
 
 // RedditActorRequest represents the query parameters for the Apify Reddit Scraper actor.
 // Based on the input schema of https://apify.com/trudax/reddit-scraper
 type RedditActorRequest struct {
-	Type                teetypes.RedditQueryType  `json:"type,omitempty"`
-	Searches            []string                  `json:"searches,omitempty"`
-	StartUrls           []teetypes.RedditStartURL `json:"startUrls,omitempty"`
-	Sort                teetypes.RedditSortType   `json:"sort,omitempty"`
-	PostDateLimit       *time.Time                `json:"postDateLimit,omitempty"`
-	IncludeNSFW         bool                      `json:"includeNSFW"`
-	MaxItems            uint                      `json:"maxItems,omitempty"`            // Total number of items to scrape
-	MaxPostCount        uint                      `json:"maxPostCount,omitempty"`        // Max number of posts per page
-	MaxComments         uint                      `json:"maxComments,omitempty"`         // Max number of comments per page
-	MaxCommunitiesCount uint                      `json:"maxCommunitiesCount,omitempty"` // Max number of communities per page
-	MaxUserCount        uint                      `json:"maxUserCount,omitempty"`        // Max number of users per page
-	SearchComments      bool                      `json:"searchComments"`
-	SearchCommunities   bool                      `json:"searchCommunities"`
-	SearchPosts         bool                      `json:"searchPosts"`
-	SearchUsers         bool                      `json:"searchUsers"`
-	SkipUserPosts       bool                      `json:"skipUserPosts"`
-	SkipComments        bool                      `json:"skipComments"`
+	Type                types.RedditQueryType   `json:"type,omitempty"`
+	Searches            []string                `json:"searches,omitempty"`
+	StartUrls           []types.RedditStartURL  `json:"startUrls,omitempty"`
+	Sort                types.RedditSortType    `json:"sort,omitempty"`
+	PostDateLimit       *time.Time              `json:"postDateLimit,omitempty"`
+	IncludeNSFW         bool                    `json:"includeNSFW"`
+	MaxItems            uint                    `json:"maxItems,omitempty"`            // Total number of items to scrape
+	MaxPostCount        uint                    `json:"maxPostCount,omitempty"`        // Max number of posts per page
+	MaxComments         uint                    `json:"maxComments,omitempty"`         // Max number of comments per page
+	MaxCommunitiesCount uint                    `json:"maxCommunitiesCount,omitempty"` // Max number of communities per page
+	MaxUserCount        uint                    `json:"maxUserCount,omitempty"`        // Max number of
users per page + SearchComments bool `json:"searchComments"` + SearchCommunities bool `json:"searchCommunities"` + SearchPosts bool `json:"searchPosts"` + SearchUsers bool `json:"searchUsers"` + SkipUserPosts bool `json:"skipUserPosts"` + SkipComments bool `json:"skipComments"` } // RedditApifyClient wraps the generic Apify client for Reddit-specific operations @@ -102,7 +101,7 @@ func (c *RedditApifyClient) ValidateApiKey() error { } // ScrapeUrls scrapes Reddit URLs -func (c *RedditApifyClient) ScrapeUrls(workerID string, urls []teetypes.RedditStartURL, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { +func (c *RedditApifyClient) ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { input := args.ToActorRequest() input.StartUrls = urls input.Searches = nil diff --git a/internal/jobs/redditapify/client_test.go b/internal/jobs/redditapify/client_test.go index c4bea35..c68577b 100644 --- a/internal/jobs/redditapify/client_test.go +++ b/internal/jobs/redditapify/client_test.go @@ -12,8 +12,6 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/redditapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-worker/api/args" - teetypes "github.com/masa-finance/tee-worker/api/types" ) // MockApifyClient is a mock implementation of the ApifyClient. @@ -63,7 +61,7 @@ var _ = Describe("RedditApifyClient", func() { Describe("ScrapeUrls", func() { It("should construct the correct actor input", func() { - urls := []teetypes.RedditStartURL{{URL: "http://reddit.com/r/golang"}} + urls := []types.RedditStartURL{{URL: "http://reddit.com/r/golang"}} after := time.Now() args := redditapify.CommonArgs{MaxPosts: 10} @@ -98,7 +96,7 @@ var _ = Describe("RedditApifyClient", func() { Expect(req.Searches).To(Equal(queries)) Expect(req.StartUrls).To(BeNil()) Expect(*req.PostDateLimit).To(BeTemporally("~", after, time.Second)) - Expect(req.Type).To(Equal(teetypes.RedditQueryType("posts"))) + Expect(req.Type).To(Equal(types.RedditQueryType("posts"))) Expect(req.SearchPosts).To(BeTrue()) Expect(req.SkipComments).To(BeFalse()) Expect(req.MaxComments).To(Equal(uint(5))) @@ -120,7 +118,7 @@ var _ = Describe("RedditApifyClient", func() { req := input.(redditapify.RedditActorRequest) Expect(req.Searches).To(Equal(queries)) Expect(req.StartUrls).To(BeNil()) - Expect(req.Type).To(Equal(teetypes.RedditQueryType("communities"))) + Expect(req.Type).To(Equal(types.RedditQueryType("communities"))) Expect(req.SearchCommunities).To(BeTrue()) return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil } @@ -140,7 +138,7 @@ var _ = Describe("RedditApifyClient", func() { req := input.(redditapify.RedditActorRequest) Expect(req.Searches).To(Equal(queries)) Expect(req.StartUrls).To(BeNil()) - Expect(req.Type).To(Equal(teetypes.RedditQueryType("users"))) + Expect(req.Type).To(Equal(types.RedditQueryType("users"))) Expect(req.SearchUsers).To(BeTrue()) Expect(req.SkipUserPosts).To(BeTrue()) return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil @@ -201,8 +199,8 @@ var _ = Describe("RedditApifyClient", func() { Describe("CommonArgs", func() { It("should copy from RedditArguments correctly", func() { - redditArgs := &teeargs.RedditArguments{ - Sort: teetypes.RedditSortTop, + redditArgs := &args.RedditArguments{ + Sort: 
types.RedditSortTop, IncludeNSFW: true, MaxItems: 1, MaxPosts: 2, @@ -213,7 +211,7 @@ var _ = Describe("RedditApifyClient", func() { commonArgs := redditapify.CommonArgs{} commonArgs.CopyFromArgs(redditArgs) - Expect(commonArgs.Sort).To(Equal(teetypes.RedditSortTop)) + Expect(commonArgs.Sort).To(Equal(types.RedditSortTop)) Expect(commonArgs.IncludeNSFW).To(BeTrue()) Expect(commonArgs.MaxItems).To(Equal(uint(1))) Expect(commonArgs.MaxPosts).To(Equal(uint(2))) @@ -224,7 +222,7 @@ var _ = Describe("RedditApifyClient", func() { It("should convert to RedditActorRequest correctly", func() { commonArgs := redditapify.CommonArgs{ - Sort: teetypes.RedditSortNew, + Sort: types.RedditSortNew, IncludeNSFW: true, MaxItems: 10, MaxPosts: 20, @@ -234,7 +232,7 @@ var _ = Describe("RedditApifyClient", func() { } actorReq := commonArgs.ToActorRequest() - Expect(actorReq.Sort).To(Equal(teetypes.RedditSortNew)) + Expect(actorReq.Sort).To(Equal(types.RedditSortNew)) Expect(actorReq.IncludeNSFW).To(BeTrue()) Expect(actorReq.MaxItems).To(Equal(uint(10))) Expect(actorReq.MaxPostCount).To(Equal(uint(20))) diff --git a/internal/jobs/stats/stats.go b/internal/jobs/stats/stats.go index 8aaeab8..27cd6cb 100644 --- a/internal/jobs/stats/stats.go +++ b/internal/jobs/stats/stats.go @@ -5,7 +5,6 @@ import ( "sync" "time" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/capabilities" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/versioning" @@ -14,7 +13,7 @@ import ( // WorkerCapabilitiesProvider abstracts capability retrieval to avoid import cycles type WorkerCapabilitiesProvider interface { - GetWorkerCapabilities() teetypes.WorkerCapabilities + GetWorkerCapabilities() types.WorkerCapabilities } // These are the types of statistics that we can add. The value is the JSON key that will be used for serialization. @@ -66,7 +65,7 @@ type Stats struct { CurrentTimeUnix int64 `json:"current_time"` WorkerID string `json:"worker_id"` Stats map[string]map[StatType]uint `json:"stats"` - ReportedCapabilities teetypes.WorkerCapabilities `json:"reported_capabilities"` + ReportedCapabilities types.WorkerCapabilities `json:"reported_capabilities"` WorkerVersion string `json:"worker_version"` ApplicationVersion string `json:"application_version"` sync.Mutex diff --git a/internal/jobs/telemetry.go b/internal/jobs/telemetry.go index aae760b..0ad9425 100644 --- a/internal/jobs/telemetry.go +++ b/internal/jobs/telemetry.go @@ -1,7 +1,6 @@ package jobs import ( - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" @@ -17,9 +16,9 @@ func NewTelemetryJob(jc config.JobConfiguration, c *stats.StatsCollector) Teleme } // GetStructuredCapabilities returns the structured capabilities supported by the telemetry job -func (t TelemetryJob) GetStructuredCapabilities() teetypes.WorkerCapabilities { - return teetypes.WorkerCapabilities{ - teetypes.TelemetryJob: teetypes.AlwaysAvailableTelemetryCaps, +func (t TelemetryJob) GetStructuredCapabilities() types.WorkerCapabilities { + return types.WorkerCapabilities{ + types.TelemetryJob: types.AlwaysAvailableTelemetryCaps, } } diff --git a/internal/jobs/telemetry_test.go b/internal/jobs/telemetry_test.go index 9d877c0..f869597 100644 --- a/internal/jobs/telemetry_test.go +++ b/internal/jobs/telemetry_test.go @@ -8,7 +8,6 @@ import ( . 
"github.com/onsi/gomega" "github.com/sirupsen/logrus" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" . "github.com/masa-finance/tee-worker/internal/jobs" @@ -41,7 +40,7 @@ var _ = Describe("Telemetry Job", func() { // Execute the telemetry job job := types.Job{ - Type: teetypes.TelemetryJob, + Type: types.TelemetryJob, WorkerID: "telemetry-test", } @@ -87,7 +86,7 @@ var _ = Describe("Telemetry Job", func() { telemetryJobNoStats := NewTelemetryJob(config.JobConfiguration{}, nil) job := types.Job{ - Type: teetypes.TelemetryJob, + Type: types.TelemetryJob, WorkerID: "telemetry-test-no-stats", } @@ -105,7 +104,7 @@ var _ = Describe("Telemetry Job", func() { Expect(capabilities).NotTo(BeEmpty()) Expect(capabilities).To(HaveLen(1)) - Expect(capabilities[teetypes.TelemetryJob]).To(ContainElement(teetypes.CapTelemetry)) + Expect(capabilities[types.TelemetryJob]).To(ContainElement(types.CapTelemetry)) logrus.WithField("capabilities", capabilities).Info("Telemetry job capabilities verified") }) diff --git a/internal/jobs/tiktok.go b/internal/jobs/tiktok.go index fad73bb..4c9002c 100644 --- a/internal/jobs/tiktok.go +++ b/internal/jobs/tiktok.go @@ -10,8 +10,6 @@ import ( "strings" "time" - teeargs "github.com/masa-finance/tee-worker/api/args" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" @@ -42,14 +40,14 @@ type TikTokTranscriber struct { } // GetStructuredCapabilities returns the structured capabilities supported by the TikTok transcriber -func (t *TikTokTranscriber) GetStructuredCapabilities() teetypes.WorkerCapabilities { - caps := make([]teetypes.Capability, 0, len(teetypes.AlwaysAvailableTiktokCaps)+len(teetypes.TiktokSearchCaps)) - caps = append(caps, teetypes.AlwaysAvailableTiktokCaps...) +func (t *TikTokTranscriber) GetStructuredCapabilities() types.WorkerCapabilities { + caps := make([]types.Capability, 0, len(types.AlwaysAvailableTiktokCaps)+len(types.TiktokSearchCaps)) + caps = append(caps, types.AlwaysAvailableTiktokCaps...) if t.configuration.ApifyApiKey != "" { - caps = append(caps, teetypes.TiktokSearchCaps...) + caps = append(caps, types.TiktokSearchCaps...) 
} - return teetypes.WorkerCapabilities{ - teetypes.TiktokJob: caps, + return types.WorkerCapabilities{ + types.TiktokJob: caps, } } @@ -105,17 +103,17 @@ func (ttt *TikTokTranscriber) ExecuteJob(j types.Job) (types.JobResult, error) { logrus.WithField("job_uuid", j.UUID).Info("Starting ExecuteJob for TikTok job") // Use the centralized type-safe unmarshaller - jobArgs, err := teeargs.UnmarshalJobArguments(teetypes.JobType(j.Type), map[string]any(j.Arguments)) + jobArgs, err := args.UnmarshalJobArguments(types.JobType(j.Type), map[string]any(j.Arguments)) if err != nil { return types.JobResult{Error: "Failed to unmarshal job arguments"}, fmt.Errorf("unmarshal job arguments: %w", err) } // Branch by argument type (transcription vs search) - if transcriptionArgs, ok := jobArgs.(*teeargs.TikTokTranscriptionArguments); ok { + if transcriptionArgs, ok := jobArgs.(*args.TikTokTranscriptionArguments); ok { return ttt.executeTranscription(j, transcriptionArgs) - } else if searchByQueryArgs, ok := jobArgs.(*teeargs.TikTokSearchByQueryArguments); ok { + } else if searchByQueryArgs, ok := jobArgs.(*args.TikTokSearchByQueryArguments); ok { return ttt.executeSearchByQuery(j, searchByQueryArgs) - } else if searchByTrendingArgs, ok := jobArgs.(*teeargs.TikTokSearchByTrendingArguments); ok { + } else if searchByTrendingArgs, ok := jobArgs.(*args.TikTokSearchByTrendingArguments); ok { return ttt.executeSearchByTrending(j, searchByTrendingArgs) } else { return types.JobResult{Error: "invalid argument type for TikTok job"}, fmt.Errorf("invalid argument type") @@ -123,7 +121,7 @@ func (ttt *TikTokTranscriber) ExecuteJob(j types.Job) (types.JobResult, error) { } // executeTranscription calls the external transcription service and returns a normalized result -func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *teeargs.TikTokTranscriptionArguments) (types.JobResult, error) { +func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *args.TikTokTranscriptionArguments) (types.JobResult, error) { logrus.WithField("job_uuid", j.UUID).Info("Starting ExecuteJob for TikTok transcription") if ttt.configuration.TranscriptionEndpoint == "" { @@ -132,13 +130,13 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *teeargs.TikTo } // Use the centralized type-safe unmarshaller - jobArgs, err := teeargs.UnmarshalJobArguments(teetypes.JobType(j.Type), map[string]any(j.Arguments)) + jobArgs, err := args.UnmarshalJobArguments(types.JobType(j.Type), map[string]any(j.Arguments)) if err != nil { return types.JobResult{Error: "Failed to unmarshal job arguments"}, fmt.Errorf("unmarshal job arguments: %w", err) } // Type assert to TikTok arguments - tiktokArgs, ok := jobArgs.(*teeargs.TikTokTranscriptionArguments) + tiktokArgs, ok := jobArgs.(*args.TikTokTranscriptionArguments) if !ok { return types.JobResult{Error: "invalid argument type for TikTok job"}, fmt.Errorf("invalid argument type") } @@ -270,7 +268,7 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *teeargs.TikTo } // Process Result & Return - resultData := teetypes.TikTokTranscriptionResult{ + resultData := types.TikTokTranscriptionResult{ TranscriptionText: plainTextTranscription, DetectedLanguage: languageCode, VideoTitle: parsedAPIResponse.VideoTitle, @@ -294,7 +292,7 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *teeargs.TikTo } // executeSearchByQuery runs the epctex/tiktok-search-scraper actor and returns results -func (ttt *TikTokTranscriber) executeSearchByQuery(j types.Job, a 
*teeargs.TikTokSearchByQueryArguments) (types.JobResult, error) { +func (ttt *TikTokTranscriber) executeSearchByQuery(j types.Job, a *args.TikTokSearchByQueryArguments) (types.JobResult, error) { c, err := tiktokapify.NewTikTokApifyClient(ttt.configuration.ApifyApiKey) if err != nil { ttt.stats.Add(j.WorkerID, stats.TikTokAuthErrors, 1) @@ -325,7 +323,7 @@ func (ttt *TikTokTranscriber) executeSearchByQuery(j types.Job, a *teeargs.TikTo } // executeSearchByTrending runs the lexis-solutions/tiktok-trending-videos-scraper actor and returns results -func (ttt *TikTokTranscriber) executeSearchByTrending(j types.Job, a *teeargs.TikTokSearchByTrendingArguments) (types.JobResult, error) { +func (ttt *TikTokTranscriber) executeSearchByTrending(j types.Job, a *args.TikTokSearchByTrendingArguments) (types.JobResult, error) { c, err := tiktokapify.NewTikTokApifyClient(ttt.configuration.ApifyApiKey) if err != nil { ttt.stats.Add(j.WorkerID, stats.TikTokAuthErrors, 1) diff --git a/internal/jobs/tiktok_test.go b/internal/jobs/tiktok_test.go index 8ec3780..b488e06 100644 --- a/internal/jobs/tiktok_test.go +++ b/internal/jobs/tiktok_test.go @@ -10,7 +10,6 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" . "github.com/masa-finance/tee-worker/internal/jobs" @@ -46,12 +45,12 @@ var _ = Describe("TikTok", func() { It("should successfully transcribe the video and record success stats", func(ctx SpecContext) { videoURL := "https://www.tiktok.com/@theblockrunner.com/video/7227579907361066282" jobArguments := map[string]interface{}{ - "type": teetypes.CapTranscription, + "type": types.CapTranscription, "video_url": videoURL, } job := types.Job{ - Type: teetypes.TiktokJob, + Type: types.TiktokJob, Arguments: jobArguments, WorkerID: "tiktok-test-worker-happy", UUID: "test-uuid-happy", @@ -69,7 +68,7 @@ var _ = Describe("TikTok", func() { Expect(res.Data).NotTo(BeNil()) Expect(res.Data).NotTo(BeEmpty()) - var transcriptionResult teetypes.TikTokTranscriptionResult + var transcriptionResult types.TikTokTranscriptionResult err = json.Unmarshal(res.Data, &transcriptionResult) Expect(err).NotTo(HaveOccurred(), "Failed to unmarshal result data") @@ -116,12 +115,12 @@ var _ = Describe("TikTok", func() { Context("when arguments are invalid", func() { It("should return an error if VideoURL is empty and not record error stats", func() { jobArguments := map[string]interface{}{ - "type": teetypes.CapTranscription, + "type": types.CapTranscription, "video_url": "", // Empty URL } job := types.Job{ - Type: teetypes.TiktokJob, + Type: types.TiktokJob, Arguments: jobArguments, WorkerID: "tiktok-test-worker-invalid", UUID: "test-uuid-invalid", @@ -174,9 +173,9 @@ var _ = Describe("TikTok", func() { t := NewTikTokTranscriber(jobConfig, statsCollector) j := types.Job{ - Type: teetypes.TiktokJob, + Type: types.TiktokJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "search": []string{"crypto", "ai"}, "max_items": 5, "end_page": 1, @@ -190,7 +189,7 @@ var _ = Describe("TikTok", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var items []*teetypes.TikTokSearchByQueryResult + var items []*types.TikTokSearchByQueryResult err = json.Unmarshal(res.Data, &items) Expect(err).NotTo(HaveOccurred()) Expect(items).NotTo(BeEmpty()) @@ -235,9 +234,9 @@ var _ = Describe("TikTok", func() { 
t := NewTikTokTranscriber(jobConfig, statsCollector)

 			j := types.Job{
-				Type: teetypes.TiktokJob,
+				Type: types.TiktokJob,
 				Arguments: map[string]interface{}{
-					"type":         teetypes.CapSearchByTrending,
+					"type":         types.CapSearchByTrending,
 					"country_code": "US",
 					"sort_by":      "repost",
 					"max_items":    5,
@@ -251,7 +250,7 @@
 			Expect(err).NotTo(HaveOccurred())
 			Expect(res.Error).To(BeEmpty())

-			var items []*teetypes.TikTokSearchByTrending
+			var items []*types.TikTokSearchByTrending
 			err = json.Unmarshal(res.Data, &items)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(items).NotTo(BeEmpty())
@@ -290,9 +289,9 @@ var _ = Describe("TikTok", func() {
 			t := NewTikTokTranscriber(jobConfig, statsCollector)

 			j := types.Job{
-				Type: teetypes.TiktokJob,
+				Type: types.TiktokJob,
 				Arguments: map[string]interface{}{
-					"type":      teetypes.CapSearchByQuery,
+					"type":      types.CapSearchByQuery,
 					"search":    []string{"tiktok"},
 					"max_items": 1,
 					"end_page":  1,
diff --git a/internal/jobs/tiktokapify/client.go b/internal/jobs/tiktokapify/client.go
index 9fda59d..ff531a5 100644
--- a/internal/jobs/tiktokapify/client.go
+++ b/internal/jobs/tiktokapify/client.go
@@ -4,8 +4,8 @@ import (
 	"encoding/json"
 	"fmt"

-	teeargs "github.com/masa-finance/tee-worker/api/args"
-	teetypes "github.com/masa-finance/tee-worker/api/types"
+	"github.com/masa-finance/tee-worker/api/args"
+	"github.com/masa-finance/tee-worker/api/types"
 	"github.com/masa-finance/tee-worker/internal/apify"
 	"github.com/masa-finance/tee-worker/pkg/client"
 )
@@ -43,7 +41,7 @@ func (c *TikTokApifyClient) ValidateApiKey() error {
 }

 // SearchByQuery runs the search actor and returns typed results
-func (c *TikTokApifyClient) SearchByQuery(input teeargs.TikTokSearchByQueryArguments, cursor client.Cursor, limit uint) ([]*teetypes.TikTokSearchByQueryResult, client.Cursor, error) {
+func (c *TikTokApifyClient) SearchByQuery(input args.TikTokSearchByQueryArguments, cursor client.Cursor, limit uint) ([]*types.TikTokSearchByQueryResult, client.Cursor, error) {
 	// Map snake_case fields to Apify actor's expected camelCase input
 	startUrls := input.StartUrls
 	if startUrls == nil {
@@ -79,9 +77,9 @@ func (c *TikTokApifyClient) SearchByQuery(input teeargs.TikTokSearchByQueryArgum
 		return nil, "", fmt.Errorf("apify run (search): %w", err)
 	}

-	var results []*teetypes.TikTokSearchByQueryResult
+	var results []*types.TikTokSearchByQueryResult
 	for _, raw := range dataset.Data.Items {
-		var item teetypes.TikTokSearchByQueryResult
+		var item types.TikTokSearchByQueryResult
 		if err := json.Unmarshal(raw, &item); err != nil {
 			// Skip any items whose structure doesn't match
 			continue
@@ -92,7 +90,7 @@
 }

 // SearchByTrending runs the trending actor and returns typed results
-func (c *TikTokApifyClient) SearchByTrending(input teeargs.TikTokSearchByTrendingArguments, cursor client.Cursor, limit uint) ([]*teetypes.TikTokSearchByTrending, client.Cursor, error) {
+func (c *TikTokApifyClient) SearchByTrending(input args.TikTokSearchByTrendingArguments, cursor client.Cursor, limit uint) ([]*types.TikTokSearchByTrending, client.Cursor, error) {
 	request := TikTokSearchByTrendingRequest{
 		CountryCode: input.CountryCode,
 		SortBy:      input.SortBy,
@@ -115,9 +113,9 @@
 		return nil, "", fmt.Errorf("apify run (trending): %w", err)
 	}

-	var results []*teetypes.TikTokSearchByTrending
+	var results []*types.TikTokSearchByTrending
 	for _, raw := range dataset.Data.Items {
-		var item teetypes.TikTokSearchByTrending
+		var item types.TikTokSearchByTrending
		if err := json.Unmarshal(raw, &item); err != nil {
 			continue
 		}
diff --git a/internal/jobs/twitter.go b/internal/jobs/twitter.go
index 793a567..13e2ea1 100644
--- a/internal/jobs/twitter.go
+++ b/internal/jobs/twitter.go
@@ -8,8 +8,7 @@ import (
 	"strings"
 	"time"

-	teeargs "github.com/masa-finance/tee-worker/api/args"
-	teetypes "github.com/masa-finance/tee-worker/api/types"
+	"github.com/masa-finance/tee-worker/api/args"

 	"github.com/masa-finance/tee-worker/internal/jobs/twitterx"
 	"github.com/masa-finance/tee-worker/pkg/client"
@@ -24,7 +22,7 @@ import (
 	"github.com/sirupsen/logrus"
 )

-func (ts *TwitterScraper) convertTwitterScraperTweetToTweetResult(tweet twitterscraper.Tweet) *teetypes.TweetResult {
+func (ts *TwitterScraper) convertTwitterScraperTweetToTweetResult(tweet twitterscraper.Tweet) *types.TweetResult {
 	id, err := strconv.ParseInt(tweet.ID, 10, 64)
 	if err != nil {
 		logrus.Warnf("failed to convert tweet ID to int64: %s", tweet.ID)
@@ -34,7 +32,7 @@ func (ts *TwitterScraper) convertTwitterScraperTweetToTweetResult(tweet twitters
 	createdAt := time.Unix(tweet.Timestamp, 0).UTC()
 	logrus.Debug("Converting Tweet ID: ", id) // Changed to Debug

-	return &teetypes.TweetResult{
+	return &types.TweetResult{
 		ID:             id,
 		TweetID:        tweet.ID,
 		ConversationID: tweet.ConversationID,
@@ -54,20 +52,20 @@ func (ts *TwitterScraper) convertTwitterScraperTweetToTweetResult(tweet twitters
 		Retweets: tweet.Retweets,
 		URLs:     tweet.URLs,
 		Username: tweet.Username,
-		Photos: func() []teetypes.Photo {
-			var photos []teetypes.Photo
+		Photos: func() []types.Photo {
+			var photos []types.Photo
 			for _, photo := range tweet.Photos {
-				photos = append(photos, teetypes.Photo{
+				photos = append(photos, types.Photo{
 					ID:  photo.ID,
 					URL: photo.URL,
 				})
 			}
 			return photos
 		}(),
-		Videos: func() []teetypes.Video {
-			var videos []teetypes.Video
+		Videos: func() []types.Video {
+			var videos []types.Video
 			for _, video := range tweet.Videos {
-				videos = append(videos, teetypes.Video{
+				videos = append(videos, types.Video{
 					ID:      video.ID,
 					Preview: video.Preview,
 					URL:     video.URL,
@@ -233,15 +231,15 @@ func (ts *TwitterScraper) ScrapeTweetsProfile(j types.Job, baseDir string, usern
 	return profile, nil
 }

-func (ts *TwitterScraper) ScrapeTweetsByFullArchiveSearchQuery(j types.Job, baseDir string, query string, count int) ([]*teetypes.TweetResult, error) {
+func (ts *TwitterScraper) ScrapeTweetsByFullArchiveSearchQuery(j types.Job, baseDir string, query string, count int) ([]*types.TweetResult, error) {
 	return ts.queryTweets(j, twitterx.TweetsAll, baseDir, query, count)
 }

-func (ts *TwitterScraper) ScrapeTweetsByRecentSearchQuery(j types.Job, baseDir string, query string, count int) ([]*teetypes.TweetResult, error) {
+func (ts *TwitterScraper) ScrapeTweetsByRecentSearchQuery(j types.Job, baseDir string, query string, count int) ([]*types.TweetResult, error) {
 	return ts.queryTweets(j, twitterx.TweetsSearchRecent, baseDir, query, count)
 }

-func (ts *TwitterScraper) queryTweets(j types.Job, baseQueryEndpoint string, baseDir string, query string, count int) ([]*teetypes.TweetResult, error) {
+func (ts *TwitterScraper) queryTweets(j types.Job, baseQueryEndpoint string, baseDir string, query string, count int) ([]*types.TweetResult, error) {
 	// Try credentials first, fallback to API for CapSearchByQuery
 	scraper, account, err := ts.getCredentialScraper(j, baseDir)
 	if err == nil {
@@ -257,7 +255,7 @@ func (ts *TwitterScraper) queryTweets(j types.Job, baseQueryEndpoint string, bas
 	return ts.scrapeTweets(j, baseQueryEndpoint, query, count, twitterXScraper, apiKey)
 }

-func (ts
*TwitterScraper) queryTweetsWithCredentials(j types.Job, baseDir string, query string, count int) ([]*teetypes.TweetResult, error) { +func (ts *TwitterScraper) queryTweetsWithCredentials(j types.Job, baseDir string, query string, count int) ([]*types.TweetResult, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { return nil, err @@ -265,7 +263,7 @@ func (ts *TwitterScraper) queryTweetsWithCredentials(j types.Job, baseDir string return ts.scrapeTweetsWithCredentials(j, query, count, scraper, account) } -func (ts *TwitterScraper) queryTweetsWithApiKey(j types.Job, baseQueryEndpoint string, query string, count int) ([]*teetypes.TweetResult, error) { +func (ts *TwitterScraper) queryTweetsWithApiKey(j types.Job, baseQueryEndpoint string, query string, count int) ([]*types.TweetResult, error) { twitterXScraper, apiKey, err := ts.getApiScraper(j) if err != nil { return nil, err @@ -273,9 +271,9 @@ func (ts *TwitterScraper) queryTweetsWithApiKey(j types.Job, baseQueryEndpoint s return ts.scrapeTweets(j, baseQueryEndpoint, query, count, twitterXScraper, apiKey) } -func (ts *TwitterScraper) scrapeTweetsWithCredentials(j types.Job, query string, count int, scraper *twitter.Scraper, account *twitter.TwitterAccount) ([]*teetypes.TweetResult, error) { +func (ts *TwitterScraper) scrapeTweetsWithCredentials(j types.Job, query string, count int, scraper *twitter.Scraper, account *twitter.TwitterAccount) ([]*types.TweetResult, error) { ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - tweets := make([]*teetypes.TweetResult, 0, count) + tweets := make([]*types.TweetResult, 0, count) ctx, cancel := context.WithTimeout(context.Background(), j.Timeout) defer cancel() @@ -296,14 +294,14 @@ func (ts *TwitterScraper) scrapeTweetsWithCredentials(j types.Job, query string, } // scrapeTweets uses an existing scraper instance -func (ts *TwitterScraper) scrapeTweets(j types.Job, baseQueryEndpoint string, query string, count int, twitterXScraper *twitterx.TwitterXScraper, apiKey *twitter.TwitterApiKey) ([]*teetypes.TweetResult, error) { +func (ts *TwitterScraper) scrapeTweets(j types.Job, baseQueryEndpoint string, query string, count int, twitterXScraper *twitterx.TwitterXScraper, apiKey *twitter.TwitterApiKey) ([]*types.TweetResult, error) { ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) if baseQueryEndpoint == twitterx.TweetsAll && apiKey.Type == twitter.TwitterApiKeyTypeBase { return nil, fmt.Errorf("this API key is a base/Basic key and does not have access to full archive search. 
Please use an elevated/Pro API key") } - tweets := make([]*teetypes.TweetResult, 0, count) + tweets := make([]*types.TweetResult, 0, count) cursor := "" deadline := time.Now().Add(j.Timeout) @@ -339,7 +337,7 @@ func (ts *TwitterScraper) scrapeTweets(j types.Job, baseQueryEndpoint string, qu return nil, fmt.Errorf("failed to parse tweet ID '%s' from twitterx: %w", tX.ID, convErr) } - newTweet := &teetypes.TweetResult{ + newTweet := &types.TweetResult{ ID: tweetIDInt, TweetID: tX.ID, AuthorID: tX.AuthorID, @@ -357,7 +355,7 @@ func (ts *TwitterScraper) scrapeTweets(j types.Job, baseQueryEndpoint string, qu //} //if tX.PublicMetrics != nil { - newTweet.PublicMetrics = teetypes.PublicMetrics{ + newTweet.PublicMetrics = types.PublicMetrics{ RetweetCount: tX.PublicMetrics.RetweetCount, ReplyCount: tX.PublicMetrics.ReplyCount, LikeCount: tX.PublicMetrics.LikeCount, @@ -365,7 +363,7 @@ func (ts *TwitterScraper) scrapeTweets(j types.Job, baseQueryEndpoint string, qu BookmarkCount: tX.PublicMetrics.BookmarkCount, } //} - // if tX.PossiblySensitive is available in twitterx.TweetData and teetypes.TweetResult has PossiblySensitive: + // if tX.PossiblySensitive is available in twitterx.TweetData and types.TweetResult has PossiblySensitive: // newTweet.PossiblySensitive = tX.PossiblySensitive // Also, fields like IsQuoted, Photos, Videos etc. would need to be populated if tX provides them. // Currently, this mapping is simpler than convertTwitterScraperTweetToTweetResult. @@ -393,7 +391,7 @@ EndLoop: return tweets, nil } -func (ts *TwitterScraper) ScrapeTweetByID(j types.Job, baseDir string, tweetID string) (*teetypes.TweetResult, error) { +func (ts *TwitterScraper) ScrapeTweetByID(j types.Job, baseDir string, tweetID string) (*types.TweetResult, error) { ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) scraper, account, err := ts.getCredentialScraper(j, baseDir) @@ -415,7 +413,7 @@ func (ts *TwitterScraper) ScrapeTweetByID(j types.Job, baseDir string, tweetID s return tweetResult, nil } -func (ts *TwitterScraper) GetTweet(j types.Job, baseDir, tweetID string) (*teetypes.TweetResult, error) { +func (ts *TwitterScraper) GetTweet(j types.Job, baseDir, tweetID string) (*types.TweetResult, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { return nil, err @@ -435,14 +433,14 @@ func (ts *TwitterScraper) GetTweet(j types.Job, baseDir, tweetID string) (*teety return tweetResult, nil } -func (ts *TwitterScraper) GetTweetReplies(j types.Job, baseDir, tweetID string, cursor string) ([]*teetypes.TweetResult, error) { +func (ts *TwitterScraper) GetTweetReplies(j types.Job, baseDir, tweetID string, cursor string) ([]*types.TweetResult, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { return nil, err } ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - var replies []*teetypes.TweetResult + var replies []*types.TweetResult scrapedTweets, threadEntries, err := scraper.GetTweetReplies(tweetID, cursor) if err != nil { @@ -453,7 +451,7 @@ func (ts *TwitterScraper) GetTweetReplies(j types.Job, baseDir, tweetID string, for i, scrapedTweet := range scrapedTweets { newTweetResult := ts.convertTwitterScraperTweetToTweetResult(*scrapedTweet) if i < len(threadEntries) { - // Assuming teetypes.TweetResult has a ThreadCursor field (struct, not pointer) + // Assuming types.TweetResult has a ThreadCursor field (struct, not pointer) newTweetResult.ThreadCursor.Cursor = threadEntries[i].Cursor newTweetResult.ThreadCursor.CursorType = 
threadEntries[i].CursorType newTweetResult.ThreadCursor.FocalTweetID = threadEntries[i].FocalTweetID @@ -484,14 +482,14 @@ func (ts *TwitterScraper) GetTweetRetweeters(j types.Job, baseDir, tweetID strin return retweeters, nil } -func (ts *TwitterScraper) GetUserTweets(j types.Job, baseDir, username string, count int, cursor string) ([]*teetypes.TweetResult, string, error) { +func (ts *TwitterScraper) GetUserTweets(j types.Job, baseDir, username string, count int, cursor string) ([]*types.TweetResult, string, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { return nil, "", err } ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - var tweets []*teetypes.TweetResult + var tweets []*types.TweetResult var nextCursor string if cursor != "" { @@ -524,14 +522,14 @@ func (ts *TwitterScraper) GetUserTweets(j types.Job, baseDir, username string, c return tweets, nextCursor, nil } -func (ts *TwitterScraper) GetUserMedia(j types.Job, baseDir, username string, count int, cursor string) ([]*teetypes.TweetResult, string, error) { +func (ts *TwitterScraper) GetUserMedia(j types.Job, baseDir, username string, count int, cursor string) ([]*types.TweetResult, string, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { return nil, "", err } ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - var media []*teetypes.TweetResult + var media []*types.TweetResult var nextCursor string ctx, cancel := context.WithTimeout(context.Background(), j.Timeout) defer cancel() @@ -585,14 +583,14 @@ func (ts *TwitterScraper) GetUserMedia(j types.Job, baseDir, username string, co return media, nextCursor, nil } -func (ts *TwitterScraper) GetHomeTweets(j types.Job, baseDir string, count int, cursor string) ([]*teetypes.TweetResult, string, error) { +func (ts *TwitterScraper) GetHomeTweets(j types.Job, baseDir string, count int, cursor string) ([]*types.TweetResult, string, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { return nil, "", err } ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - var tweets []*teetypes.TweetResult + var tweets []*types.TweetResult var nextCursor string if cursor != "" { @@ -628,14 +626,14 @@ func (ts *TwitterScraper) GetHomeTweets(j types.Job, baseDir string, count int, return tweets, nextCursor, nil } -func (ts *TwitterScraper) GetForYouTweets(j types.Job, baseDir string, count int, cursor string) ([]*teetypes.TweetResult, string, error) { +func (ts *TwitterScraper) GetForYouTweets(j types.Job, baseDir string, count int, cursor string) ([]*types.TweetResult, string, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { return nil, "", err } ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - var tweets []*teetypes.TweetResult + var tweets []*types.TweetResult var nextCursor string if cursor != "" { @@ -671,13 +669,13 @@ func (ts *TwitterScraper) GetForYouTweets(j types.Job, baseDir string, count int return tweets, nextCursor, nil } -func (ts *TwitterScraper) GetBookmarks(j types.Job, baseDir string, count int, cursor string) ([]*teetypes.TweetResult, string, error) { +func (ts *TwitterScraper) GetBookmarks(j types.Job, baseDir string, count int, cursor string) ([]*types.TweetResult, string, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { return nil, "", err } ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - var bookmarks []*teetypes.TweetResult + var bookmarks 
[]*types.TweetResult ctx, cancel := context.WithTimeout(context.Background(), j.Timeout) defer cancel() @@ -752,7 +750,7 @@ func (ts *TwitterScraper) GetProfileByIDWithApiKey(j types.Job, userID string, a } // GetTweetByIDWithApiKey fetches a tweet using Twitter API key -func (ts *TwitterScraper) GetTweetByIDWithApiKey(j types.Job, tweetID string, apiKey *twitter.TwitterApiKey) (*teetypes.TweetResult, error) { +func (ts *TwitterScraper) GetTweetByIDWithApiKey(j types.Job, tweetID string, apiKey *twitter.TwitterApiKey) (*types.TweetResult, error) { ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) apiClient := client.NewTwitterXClient(apiKey.Key) @@ -780,7 +778,7 @@ func (ts *TwitterScraper) GetTweetByIDWithApiKey(j types.Job, tweetID string, ap createdAt = time.Now() // fallback to current time } - tweetResult := &teetypes.TweetResult{ + tweetResult := &types.TweetResult{ ID: tweetIDInt, TweetID: tweetData.ID, AuthorID: tweetData.AuthorID, @@ -790,7 +788,7 @@ func (ts *TwitterScraper) GetTweetByIDWithApiKey(j types.Job, tweetID string, ap CreatedAt: createdAt, Username: tweetData.Username, Lang: tweetData.Lang, - PublicMetrics: teetypes.PublicMetrics{ + PublicMetrics: types.PublicMetrics{ RetweetCount: tweetData.PublicMetrics.RetweetCount, ReplyCount: tweetData.PublicMetrics.ReplyCount, LikeCount: tweetData.PublicMetrics.LikeCount, @@ -873,7 +871,7 @@ func (ts *TwitterScraper) GetFollowing(j types.Job, baseDir, username string, co } // getFollowersApify retrieves followers using Apify -func (ts *TwitterScraper) getFollowersApify(j types.Job, username string, maxResults uint, cursor client.Cursor) ([]*teetypes.ProfileResultApify, client.Cursor, error) { +func (ts *TwitterScraper) getFollowersApify(j types.Job, username string, maxResults uint, cursor client.Cursor) ([]*types.ProfileResultApify, client.Cursor, error) { apifyScraper, err := ts.getApifyScraper(j) if err != nil { return nil, "", err @@ -891,7 +889,7 @@ func (ts *TwitterScraper) getFollowersApify(j types.Job, username string, maxRes } // getFollowingApify retrieves following using Apify -func (ts *TwitterScraper) getFollowingApify(j types.Job, username string, maxResults uint, cursor client.Cursor) ([]*teetypes.ProfileResultApify, client.Cursor, error) { +func (ts *TwitterScraper) getFollowingApify(j types.Job, username string, maxResults uint, cursor client.Cursor) ([]*types.ProfileResultApify, client.Cursor, error) { apifyScraper, err := ts.getApifyScraper(j) if err != nil { return nil, "", err @@ -967,7 +965,7 @@ type TwitterScraper struct { configuration config.TwitterScraperConfig accountManager *twitter.TwitterAccountManager statsCollector *stats.StatsCollector - capabilities map[teetypes.Capability]bool + capabilities map[types.Capability]bool } func NewTwitterScraper(jc config.JobConfiguration, c *stats.StatsCollector) *TwitterScraper { @@ -985,71 +983,71 @@ func NewTwitterScraper(jc config.JobConfiguration, c *stats.StatsCollector) *Twi configuration: config, accountManager: accountManager, statsCollector: c, - capabilities: map[teetypes.Capability]bool{ - teetypes.CapSearchByQuery: true, - teetypes.CapSearchByFullArchive: true, - teetypes.CapSearchByProfile: true, - teetypes.CapGetById: true, - teetypes.CapGetReplies: true, - teetypes.CapGetRetweeters: true, - teetypes.CapGetTweets: true, - teetypes.CapGetMedia: true, - teetypes.CapGetHomeTweets: true, - teetypes.CapGetForYouTweets: true, - teetypes.CapGetProfileById: true, - teetypes.CapGetTrends: true, - teetypes.CapGetFollowing: true, - 
teetypes.CapGetFollowers: true, - teetypes.CapGetSpace: true, + capabilities: map[types.Capability]bool{ + types.CapSearchByQuery: true, + types.CapSearchByFullArchive: true, + types.CapSearchByProfile: true, + types.CapGetById: true, + types.CapGetReplies: true, + types.CapGetRetweeters: true, + types.CapGetTweets: true, + types.CapGetMedia: true, + types.CapGetHomeTweets: true, + types.CapGetForYouTweets: true, + types.CapGetProfileById: true, + types.CapGetTrends: true, + types.CapGetFollowing: true, + types.CapGetFollowers: true, + types.CapGetSpace: true, }, } } // GetStructuredCapabilities returns the structured capabilities supported by this Twitter scraper // based on the available credentials and API keys -func (ts *TwitterScraper) GetStructuredCapabilities() teetypes.WorkerCapabilities { - capabilities := make(teetypes.WorkerCapabilities) +func (ts *TwitterScraper) GetStructuredCapabilities() types.WorkerCapabilities { + capabilities := make(types.WorkerCapabilities) // Check if we have Twitter accounts for credential-based scraping if len(ts.configuration.Accounts) > 0 { - var credCaps []teetypes.Capability + var credCaps []types.Capability for capability, enabled := range ts.capabilities { if enabled { credCaps = append(credCaps, capability) } } if len(credCaps) > 0 { - capabilities[teetypes.TwitterCredentialJob] = credCaps + capabilities[types.TwitterCredentialJob] = credCaps } } // Check if we have API keys for API-based scraping if len(ts.configuration.ApiKeys) > 0 { - apiCaps := make([]teetypes.Capability, len(teetypes.TwitterAPICaps)) - copy(apiCaps, teetypes.TwitterAPICaps) + apiCaps := make([]types.Capability, len(types.TwitterAPICaps)) + copy(apiCaps, types.TwitterAPICaps) // Check for elevated API capabilities if ts.accountManager != nil { for _, apiKey := range ts.accountManager.GetApiKeys() { if apiKey.Type == twitter.TwitterApiKeyTypeElevated { - apiCaps = append(apiCaps, teetypes.CapSearchByFullArchive) + apiCaps = append(apiCaps, types.CapSearchByFullArchive) break } } } - capabilities[teetypes.TwitterApiJob] = apiCaps + capabilities[types.TwitterApiJob] = apiCaps } // Add Apify-specific capabilities based on available API key // TODO: We should verify whether each of the actors is actually available through this API key if ts.configuration.ApifyApiKey != "" { - capabilities[teetypes.TwitterApifyJob] = teetypes.TwitterApifyCaps + capabilities[types.TwitterApifyJob] = types.TwitterApifyCaps } // Add general twitter scraper capability (uses best available method) if len(ts.configuration.Accounts) > 0 || len(ts.configuration.ApiKeys) > 0 { - var generalCaps []teetypes.Capability + var generalCaps []types.Capability if len(ts.configuration.Accounts) > 0 { // Use all capabilities if we have accounts for capability, enabled := range ts.capabilities { @@ -1059,36 +1057,36 @@ func (ts *TwitterScraper) GetStructuredCapabilities() teetypes.WorkerCapabilitie } } else { // Use API capabilities if we only have keys - generalCaps = make([]teetypes.Capability, len(teetypes.TwitterAPICaps)) - copy(generalCaps, teetypes.TwitterAPICaps) + generalCaps = make([]types.Capability, len(types.TwitterAPICaps)) + copy(generalCaps, types.TwitterAPICaps) // Check for elevated capabilities if ts.accountManager != nil { for _, apiKey := range ts.accountManager.GetApiKeys() { if apiKey.Type == twitter.TwitterApiKeyTypeElevated { - generalCaps = append(generalCaps, teetypes.CapSearchByFullArchive) + generalCaps = append(generalCaps, types.CapSearchByFullArchive) break } } } } - 
capabilities[teetypes.TwitterJob] = generalCaps + capabilities[types.TwitterJob] = generalCaps } return capabilities } type TwitterScrapeStrategy interface { - Execute(j types.Job, ts *TwitterScraper, jobArgs *teeargs.TwitterSearchArguments) (types.JobResult, error) + Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) } -func getScrapeStrategy(jobType teetypes.JobType) TwitterScrapeStrategy { +func getScrapeStrategy(jobType types.JobType) TwitterScrapeStrategy { switch jobType { - case teetypes.TwitterCredentialJob: + case types.TwitterCredentialJob: return &CredentialScrapeStrategy{} - case teetypes.TwitterApiJob: + case types.TwitterApiJob: return &ApiKeyScrapeStrategy{} - case teetypes.TwitterApifyJob: + case types.TwitterApifyJob: return &ApifyScrapeStrategy{} default: return &DefaultScrapeStrategy{} @@ -1097,13 +1095,13 @@ func getScrapeStrategy(jobType teetypes.JobType) TwitterScrapeStrategy { type CredentialScrapeStrategy struct{} -func (s *CredentialScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *teeargs.TwitterSearchArguments) (types.JobResult, error) { +func (s *CredentialScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) { capability := jobArgs.GetCapability() switch capability { - case teetypes.CapSearchByQuery: + case types.CapSearchByQuery: tweets, err := ts.queryTweetsWithCredentials(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) return processResponse(tweets, "", err) - case teetypes.CapSearchByFullArchive: + case types.CapSearchByFullArchive: logrus.Warn("Full archive search with credential-only implementation may have limited results") tweets, err := ts.queryTweetsWithCredentials(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) return processResponse(tweets, "", err) @@ -1114,23 +1112,23 @@ func (s *CredentialScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobA type ApiKeyScrapeStrategy struct{} -func (s *ApiKeyScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *teeargs.TwitterSearchArguments) (types.JobResult, error) { +func (s *ApiKeyScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) { capability := jobArgs.GetCapability() switch capability { - case teetypes.CapSearchByQuery: + case types.CapSearchByQuery: tweets, err := ts.queryTweetsWithApiKey(j, twitterx.TweetsSearchRecent, jobArgs.Query, jobArgs.MaxResults) return processResponse(tweets, "", err) - case teetypes.CapSearchByFullArchive: + case types.CapSearchByFullArchive: tweets, err := ts.queryTweetsWithApiKey(j, twitterx.TweetsAll, jobArgs.Query, jobArgs.MaxResults) return processResponse(tweets, "", err) - case teetypes.CapGetProfileById: + case types.CapGetProfileById: _, apiKey, err := ts.getApiScraper(j) if err != nil { return types.JobResult{Error: err.Error()}, err } profile, err := ts.GetProfileByIDWithApiKey(j, jobArgs.Query, apiKey) return processResponse(profile, "", err) - case teetypes.CapGetById: + case types.CapGetById: _, apiKey, err := ts.getApiScraper(j) if err != nil { return types.JobResult{Error: err.Error()}, err @@ -1144,13 +1142,13 @@ func (s *ApiKeyScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs type ApifyScrapeStrategy struct{} -func (s *ApifyScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *teeargs.TwitterSearchArguments) (types.JobResult, error) { - capability := 
teetypes.Capability(jobArgs.QueryType) +func (s *ApifyScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) { + capability := types.Capability(jobArgs.QueryType) switch capability { - case teetypes.CapGetFollowers: + case types.CapGetFollowers: followers, nextCursor, err := ts.getFollowersApify(j, jobArgs.Query, uint(jobArgs.MaxResults), client.Cursor(jobArgs.NextCursor)) return processResponse(followers, nextCursor.String(), err) - case teetypes.CapGetFollowing: + case types.CapGetFollowing: following, nextCursor, err := ts.getFollowingApify(j, jobArgs.Query, uint(jobArgs.MaxResults), client.Cursor(jobArgs.NextCursor)) return processResponse(following, nextCursor.String(), err) default: @@ -1161,10 +1159,10 @@ func (s *ApifyScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs * type DefaultScrapeStrategy struct{} // FIXED: Now using validated QueryType from centralized unmarshaller (addresses the TODO comment) -func (s *DefaultScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *teeargs.TwitterSearchArguments) (types.JobResult, error) { - capability := teetypes.Capability(jobArgs.QueryType) +func (s *DefaultScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) { + capability := types.Capability(jobArgs.QueryType) switch capability { - case teetypes.CapGetFollowers, teetypes.CapGetFollowing: + case types.CapGetFollowers, types.CapGetFollowing: // Priority: Apify > Credentials for general TwitterJob // TODO: We should verify whether each of the actors is actually available through this API key if ts.configuration.ApifyApiKey != "" { @@ -1175,7 +1173,7 @@ func (s *DefaultScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs // Fall back to credential-based strategy credentialStrategy := &CredentialScrapeStrategy{} return credentialStrategy.Execute(j, ts, jobArgs) - case teetypes.CapSearchByQuery: + case types.CapSearchByQuery: // Priority: Credentials > API for searchbyquery if len(ts.configuration.Accounts) > 0 { credentialStrategy := &CredentialScrapeStrategy{} @@ -1184,7 +1182,7 @@ func (s *DefaultScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs // Fall back to API strategy tweets, err := ts.queryTweets(j, twitterx.TweetsSearchRecent, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) return processResponse(tweets, "", err) - case teetypes.CapSearchByFullArchive: + case types.CapSearchByFullArchive: tweets, err := ts.queryTweets(j, twitterx.TweetsAll, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) return processResponse(tweets, "", err) default: @@ -1269,48 +1267,48 @@ func processResponse(response any, nextCursor string, err error) (types.JobResul return types.JobResult{Data: dat, NextCursor: nextCursor}, nil } -func defaultStrategyFallback(j types.Job, ts *TwitterScraper, jobArgs *teeargs.TwitterSearchArguments) (types.JobResult, error) { +func defaultStrategyFallback(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) { capability := jobArgs.GetCapability() switch capability { - case teetypes.CapSearchByProfile: + case types.CapSearchByProfile: profile, err := ts.ScrapeTweetsProfile(j, ts.configuration.DataDir, jobArgs.Query) return processResponse(profile, "", err) - case teetypes.CapGetById: + case types.CapGetById: tweet, err := ts.GetTweet(j, ts.configuration.DataDir, jobArgs.Query) return processResponse(tweet, "", err) - case 
teetypes.CapGetReplies: + case types.CapGetReplies: // GetTweetReplies takes a cursor for a specific part of a thread, not general pagination of all replies. // The retryWithCursor logic might not directly apply unless GetTweetReplies is adapted for broader pagination. replies, err := ts.GetTweetReplies(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.NextCursor) return processResponse(replies, jobArgs.NextCursor, err) // Pass original NextCursor as it's specific - case teetypes.CapGetRetweeters: + case types.CapGetRetweeters: // Similar to GetTweetReplies, cursor is for a specific page. retweeters, err := ts.GetTweetRetweeters(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults, jobArgs.NextCursor) // GetTweetRetweeters in twitterscraper returns (profiles, nextCursorStr, error) // The current ts.GetTweetRetweeters doesn't return the next cursor. This should be updated if pagination is needed here. // For now, assuming it fetches one batch or handles its own pagination internally up to MaxResults. return processResponse(retweeters, "", err) // Assuming no next cursor from this specific call structure - case teetypes.CapGetTweets: + case types.CapGetTweets: return retryWithCursorAndQuery(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults, jobArgs.NextCursor, ts.GetUserTweets) - case teetypes.CapGetMedia: + case types.CapGetMedia: return retryWithCursorAndQuery(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults, jobArgs.NextCursor, ts.GetUserMedia) - case teetypes.CapGetHomeTweets: + case types.CapGetHomeTweets: return retryWithCursor(j, ts.configuration.DataDir, jobArgs.MaxResults, jobArgs.NextCursor, ts.GetHomeTweets) - case teetypes.CapGetForYouTweets: + case types.CapGetForYouTweets: return retryWithCursor(j, ts.configuration.DataDir, jobArgs.MaxResults, jobArgs.NextCursor, ts.GetForYouTweets) - case teetypes.CapGetProfileById: + case types.CapGetProfileById: profile, err := ts.GetProfileByID(j, ts.configuration.DataDir, jobArgs.Query) return processResponse(profile, "", err) - case teetypes.CapGetTrends: + case types.CapGetTrends: trends, err := ts.GetTrends(j, ts.configuration.DataDir) return processResponse(trends, "", err) - case teetypes.CapGetFollowing: + case types.CapGetFollowing: following, err := ts.GetFollowing(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) return processResponse(following, "", err) - case teetypes.CapGetFollowers: + case types.CapGetFollowers: followers, err := ts.GetFollowers(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) return processResponse(followers, "", err) - case teetypes.CapGetSpace: + case types.CapGetSpace: space, err := ts.GetSpace(j, ts.configuration.DataDir, jobArgs.Query) return processResponse(space, "", err) } @@ -1326,14 +1324,14 @@ func defaultStrategyFallback(j types.Job, ts *TwitterScraper, jobArgs *teeargs.T // If the unmarshaled result is empty, it returns an error. func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) { // Use the centralized unmarshaller from tee-types - this addresses the TODO comment! 
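Every `ExecuteJob` in this patch now follows the same two-step shape: decode the raw arguments through the shared unmarshaller, then type-assert the result to the job-specific argument struct. A minimal, self-contained sketch of that shape, using simplified stand-in types (the real `UnmarshalJobArguments` and argument structs live in `api/args` and `api/types`, with more fields and job types than shown here):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Simplified stand-ins for the shared definitions in api/args and api/types.
type JobType string

type TwitterSearchArguments struct {
	QueryType  string `json:"type"`
	Query      string `json:"query"`
	MaxResults int    `json:"max_results"`
}

// unmarshalJobArguments mirrors the centralized unmarshaller's shape:
// select the concrete argument struct from the job type, then decode into it.
func unmarshalJobArguments(t JobType, raw map[string]any) (any, error) {
	dat, err := json.Marshal(raw)
	if err != nil {
		return nil, err
	}
	switch t {
	case "twitter":
		var a TwitterSearchArguments
		if err := json.Unmarshal(dat, &a); err != nil {
			return nil, err
		}
		return &a, nil
	default:
		return nil, fmt.Errorf("unknown job type %q", t)
	}
}

func main() {
	jobArgs, err := unmarshalJobArguments("twitter", map[string]any{
		"type": "searchbyquery", "query": "NASA", "max_results": 1,
	})
	if err != nil {
		panic(err)
	}
	// Callers type-assert to the expected struct, as ExecuteJob does above.
	if a, ok := jobArgs.(*TwitterSearchArguments); ok {
		fmt.Printf("%s %q (max %d)\n", a.QueryType, a.Query, a.MaxResults)
	}
}
```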
- jobArgs, err := teeargs.UnmarshalJobArguments(teetypes.JobType(j.Type), map[string]any(j.Arguments)) + jobArgs, err := args.UnmarshalJobArguments(types.JobType(j.Type), map[string]any(j.Arguments)) if err != nil { logrus.Errorf("Error while unmarshalling job arguments for job ID %s, type %s: %v", j.UUID, j.Type, err) return types.JobResult{Error: "error unmarshalling job arguments"}, err } // Type assert to Twitter arguments - args, ok := jobArgs.(*teeargs.TwitterSearchArguments) + args, ok := jobArgs.(*args.TwitterSearchArguments) if !ok { logrus.Errorf("Expected Twitter arguments for job ID %s, type %s", j.UUID, j.Type) return types.JobResult{Error: "invalid argument type for Twitter job"}, fmt.Errorf("invalid argument type") @@ -1358,13 +1356,13 @@ func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) { switch { case args.IsSingleTweetOperation(): - var result *teetypes.TweetResult + var result *types.TweetResult if err := jobResult.Unmarshal(&result); err != nil { logrus.Errorf("Error while unmarshalling single tweet result for job ID %s, type %s: %v", j.UUID, j.Type, err) return types.JobResult{Error: "error unmarshalling single tweet result for final validation"}, err } case args.IsMultipleTweetOperation(): - var results []*teetypes.TweetResult + var results []*types.TweetResult if err := jobResult.Unmarshal(&results); err != nil { logrus.Errorf("Error while unmarshalling multiple tweet result for job ID %s, type %s: %v", j.UUID, j.Type, err) return types.JobResult{Error: "error unmarshalling multiple tweet result for final validation"}, err diff --git a/internal/jobs/twitter_test.go b/internal/jobs/twitter_test.go index 04022a1..008c674 100644 --- a/internal/jobs/twitter_test.go +++ b/internal/jobs/twitter_test.go @@ -7,7 +7,6 @@ import ( "strings" "time" - teetypes "github.com/masa-finance/tee-worker/api/types" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -106,9 +105,9 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "NASA", "max_results": 1, }, @@ -116,7 +115,7 @@ var _ = Describe("Twitter Scraper", func() { }) Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var results []*teetypes.TweetResult + var results []*types.TweetResult err = res.Unmarshal(&results) Expect(err).NotTo(HaveOccurred()) Expect(results).ToNot(BeEmpty()) @@ -131,9 +130,9 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterApiJob, + Type: types.TwitterApiJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "NASA", "max_results": 1, }, @@ -141,7 +140,7 @@ var _ = Describe("Twitter Scraper", func() { }) Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var results []*teetypes.TweetResult + var results []*types.TweetResult err = res.Unmarshal(&results) Expect(err).NotTo(HaveOccurred()) Expect(results).ToNot(BeEmpty()) @@ -157,9 +156,9 @@ var _ = Describe("Twitter Scraper", func() { }, statsCollector) // Try to run credential-only job with only API key res, err := scraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "NASA", "max_results": 1, }, @@ -179,9 +178,9 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "nasa", "max_results": 10, }, @@ -189,7 +188,7 @@ var _ = Describe("Twitter Scraper", func() { }) Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var results []*teetypes.TweetResult + var results []*types.TweetResult err = res.Unmarshal(&results) Expect(err).NotTo(HaveOccurred()) Expect(results).ToNot(BeEmpty()) @@ -200,9 +199,9 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterApiJob, + Type: types.TwitterApiJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "NASA", "max_results": 1, }, @@ -221,9 +220,9 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterApiJob, + Type: types.TwitterApiJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByFullArchive, + "type": types.CapSearchByFullArchive, "query": "NASA", "max_results": 1, }, @@ -235,7 +234,7 @@ var _ = Describe("Twitter Scraper", func() { } Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var results []*teetypes.TweetResult + var results []*types.TweetResult err = res.Unmarshal(&results) Expect(err).NotTo(HaveOccurred()) Expect(results).ToNot(BeEmpty()) @@ -246,9 +245,9 @@ var _ = Describe("Twitter Scraper", func() { Context("General Twitter Scraper Tests", func() { It("should scrape 
tweets with a search query", func() { j := types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "nasa", "max_results": 10, }, @@ -258,7 +257,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var results []*teetypes.TweetResult + var results []*types.TweetResult err = res.Unmarshal(&results) Expect(err).NotTo(HaveOccurred()) Expect(results).ToNot(BeEmpty()) @@ -276,9 +275,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByProfile, + "type": types.CapSearchByProfile, "query": "NASA_Marshall", }, Timeout: 10 * time.Second, @@ -303,9 +302,9 @@ var _ = Describe("Twitter Scraper", func() { It("should get tweet by ID", func() { res, err := twitterScraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetById, + "type": types.CapGetById, "query": "1881258110712492142", }, Timeout: 10 * time.Second, @@ -313,7 +312,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var tweet *teetypes.TweetResult + var tweet *types.TweetResult err = res.Unmarshal(&tweet) Expect(err).NotTo(HaveOccurred()) Expect(tweet).NotTo(BeNil()) @@ -326,9 +325,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetReplies, + "type": types.CapGetReplies, "query": "1234567890", }, Timeout: 10 * time.Second, @@ -337,7 +336,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var replies []*teetypes.TweetResult + var replies []*types.TweetResult err = res.Unmarshal(&replies) Expect(err).NotTo(HaveOccurred()) Expect(replies).ToNot(BeEmpty()) @@ -355,9 +354,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetRetweeters, + "type": types.CapGetRetweeters, "query": "1234567890", "max_results": 5, }, @@ -385,9 +384,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetTweets, + "type": types.CapGetTweets, "query": "NASA", "max_results": 5, }, @@ -397,7 +396,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var tweets []*teetypes.TweetResult + var tweets []*types.TweetResult err = res.Unmarshal(&tweets) Expect(err).NotTo(HaveOccurred()) Expect(len(tweets)).ToNot(BeZero()) @@ -415,9 +414,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } res, err := twitterScraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetMedia, + "type": types.CapGetMedia, "query": "NASA", "max_results": 5, }, @@ 
-426,7 +425,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var media []*teetypes.TweetResult + var media []*types.TweetResult err = res.Unmarshal(&media) Expect(err).NotTo(HaveOccurred()) Expect(media).ToNot(BeEmpty()) @@ -438,9 +437,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetHomeTweets, + "type": types.CapGetHomeTweets, "max_results": 5, }, Timeout: 10 * time.Second, @@ -449,7 +448,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var tweets []*teetypes.TweetResult + var tweets []*types.TweetResult err = res.Unmarshal(&tweets) Expect(err).NotTo(HaveOccurred()) Expect(len(tweets)).ToNot(BeZero()) @@ -467,9 +466,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetForYouTweets, + "type": types.CapGetForYouTweets, "max_results": 5, }, Timeout: 10 * time.Second, @@ -479,7 +478,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var tweets []*teetypes.TweetResult + var tweets []*types.TweetResult err = res.Unmarshal(&tweets) Expect(err).NotTo(HaveOccurred()) Expect(len(tweets)).ToNot(BeZero()) @@ -498,9 +497,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetProfileById, + "type": types.CapGetProfileById, "query": "44196397", // Elon Musk's Twitter ID }, Timeout: 10 * time.Second, @@ -526,9 +525,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetFollowing, + "type": types.CapGetFollowing, "query": "NASA", "max_results": 5, }, @@ -556,9 +555,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetFollowers, + "type": types.CapGetFollowers, "query": "NASA", }, Timeout: 10 * time.Second, @@ -586,9 +585,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetTrends, + "type": types.CapGetTrends, }, Timeout: 10 * time.Second, } @@ -614,9 +613,9 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterApiJob, + Type: types.TwitterApiJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetById, + "type": types.CapGetById, "query": "1881258110712492142", }, Timeout: 10 * time.Second, @@ -625,7 +624,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(res.Error).To(BeEmpty()) // Use the proper TweetResult type (the API converts TwitterXTweetData to TweetResult) - var tweet 
*teetypes.TweetResult + var tweet *types.TweetResult err = res.Unmarshal(&tweet) Expect(err).NotTo(HaveOccurred()) Expect(tweet).NotTo(BeNil()) @@ -656,9 +655,9 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterApiJob, + Type: types.TwitterApiJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetProfileById, + "type": types.CapGetProfileById, "query": "44196397", // Elon Musk's Twitter ID }, Timeout: 10 * time.Second, @@ -688,9 +687,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("Needs to be constructed to fetch live spaces first - hard to test with hardcoded IDs") res, err := twitterScraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetSpace, + "type": types.CapGetSpace, "query": "1YpKkZEWlBaxj", }, Timeout: 10 * time.Second, @@ -708,7 +707,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("Returns 'job result is empty' even when account has bookmarks") j := types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": "getbookmarks", // not yet in teetypes until it's supported "max_results": 5, @@ -719,7 +718,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var bookmarks []*teetypes.TweetResult + var bookmarks []*types.TweetResult err = res.Unmarshal(&bookmarks) Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) @@ -735,9 +734,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("Needs full archive key in TWITTER_API_KEYS to run") j := types.Job{ - Type: teetypes.TwitterApiJob, + Type: types.TwitterApiJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByFullArchive, + "type": types.CapSearchByFullArchive, "query": "AI", "max_results": 2, }, @@ -747,7 +746,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var results []*teetypes.TweetResult + var results []*types.TweetResult err = res.Unmarshal(&results) Expect(err).NotTo(HaveOccurred()) Expect(results).ToNot(BeEmpty()) @@ -764,9 +763,9 @@ var _ = Describe("Twitter Scraper", func() { Skip("Needs full archive key (elevated) in TWITTER_API_KEYS to run") j := types.Job{ - Type: teetypes.TwitterCredentialJob, + Type: types.TwitterCredentialJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByFullArchive, + "type": types.CapSearchByFullArchive, "query": "#AI", "max_results": 2, }, @@ -776,7 +775,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var results []*teetypes.TweetResult + var results []*types.TweetResult err = res.Unmarshal(&results) Expect(err).NotTo(HaveOccurred()) Expect(results).ToNot(BeEmpty()) @@ -799,9 +798,9 @@ var _ = Describe("Twitter Scraper", func() { }, statsCollector) j := types.Job{ - Type: teetypes.TwitterApifyJob, + Type: types.TwitterApifyJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetFollowers, + "type": types.CapGetFollowers, "query": "elonmusk", "max_results": 200, }, @@ -812,7 +811,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var followers []*teetypes.ProfileResultApify + var followers []*types.ProfileResultApify err = res.Unmarshal(&followers) Expect(err).NotTo(HaveOccurred()) 
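The Apify follower/following tests all drive the worker through the same call shape. Condensed into a helper for reference, assuming only what the tests above already use; the function name and `worker` interface are illustrative and not part of this patch:

```go
package example

import (
	"errors"
	"time"

	"github.com/masa-finance/tee-worker/api/types"
)

// worker is the ExecuteJob surface exercised by the tests above.
type worker interface {
	ExecuteJob(j types.Job) (types.JobResult, error)
}

// fetchFollowers condenses the happy path of the follower tests.
func fetchFollowers(s worker, username string) ([]*types.ProfileResultApify, string, error) {
	res, err := s.ExecuteJob(types.Job{
		Type: types.TwitterApifyJob,
		Arguments: map[string]interface{}{
			"type":        types.CapGetFollowers,
			"query":       username,
			"max_results": 200, // the follower actor enforces a minimum of 200
		},
		Timeout: 60 * time.Second, // illustrative; the tests mostly use short timeouts
	})
	if err != nil {
		return nil, "", err
	}
	if res.Error != "" {
		return nil, "", errors.New(res.Error)
	}
	var followers []*types.ProfileResultApify
	if err := res.Unmarshal(&followers); err != nil {
		return nil, "", err
	}
	// res.NextCursor carries the Apify pagination cursor for the next page.
	return followers, res.NextCursor, nil
}
```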
Expect(followers).ToNot(BeEmpty()) @@ -830,9 +829,9 @@ var _ = Describe("Twitter Scraper", func() { }, statsCollector) j := types.Job{ - Type: teetypes.TwitterApifyJob, + Type: types.TwitterApifyJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetFollowing, + "type": types.CapGetFollowing, "query": "elonmusk", "max_results": 200, }, @@ -843,7 +842,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(err).NotTo(HaveOccurred()) Expect(res.Error).To(BeEmpty()) - var following []*teetypes.ProfileResultApify + var following []*types.ProfileResultApify err = res.Unmarshal(&following) Expect(err).NotTo(HaveOccurred()) Expect(following).ToNot(BeEmpty()) @@ -861,9 +860,9 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ - "type": teetypes.CapGetFollowers, + "type": types.CapGetFollowers, "query": "elonmusk", "max_results": 200, }, @@ -873,7 +872,7 @@ var _ = Describe("Twitter Scraper", func() { Expect(res.Error).To(BeEmpty()) // Should return ProfileResultApify (from Apify) not twitterscraper.Profile - var followers []*teetypes.ProfileResultApify + var followers []*types.ProfileResultApify err = res.Unmarshal(&followers) Expect(err).NotTo(HaveOccurred()) Expect(followers).ToNot(BeEmpty()) @@ -884,9 +883,9 @@ var _ = Describe("Twitter Scraper", func() { Context("Error Handling", func() { It("should handle negative count values in job arguments", func() { res, err := twitterScraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "test", "count": -5, // Invalid negative value }, @@ -899,9 +898,9 @@ var _ = Describe("Twitter Scraper", func() { It("should handle negative max_results values in job arguments", func() { res, err := twitterScraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "test", "max_results": -10, // Invalid negative value }, @@ -914,7 +913,7 @@ var _ = Describe("Twitter Scraper", func() { It("should handle invalid capability for job type", func() { res, err := twitterScraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterApiJob, // API job type + Type: types.TwitterApiJob, // API job type Arguments: map[string]interface{}{ "type": "invalidcapability", // Invalid capability "query": "test", @@ -928,9 +927,9 @@ var _ = Describe("Twitter Scraper", func() { It("should handle capability not available for specific job type", func() { res, err := twitterScraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterApiJob, // API job type - doesn't support getfollowers + Type: types.TwitterApiJob, // API job type - doesn't support getfollowers Arguments: map[string]interface{}{ - "type": teetypes.CapGetFollowers, // Valid capability but not for TwitterApiJob + "type": types.CapGetFollowers, // Valid capability but not for TwitterApiJob "query": "test", }, Timeout: 10 * time.Second, @@ -943,9 +942,9 @@ var _ = Describe("Twitter Scraper", func() { It("should handle invalid JSON data structure", func() { // Create a job with arguments that will cause JSON unmarshalling to fail res, err := twitterScraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ - "type": 
teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "test", "max_results": "not_a_number", // String instead of int }, @@ -961,7 +960,7 @@ var _ = Describe("Twitter Scraper", func() { res, err := twitterScraper.ExecuteJob(types.Job{ Type: "unknown-job-type", // Invalid job type Arguments: map[string]interface{}{ - "type": teetypes.CapSearchByQuery, + "type": types.CapSearchByQuery, "query": "test", }, Timeout: 10 * time.Second, @@ -973,7 +972,7 @@ var _ = Describe("Twitter Scraper", func() { It("should handle empty arguments map", func() { res, err := twitterScraper.ExecuteJob(types.Job{ - Type: teetypes.TwitterJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{}, // Empty arguments Timeout: 10 * time.Second, }) diff --git a/internal/jobs/twitterapify/client.go b/internal/jobs/twitterapify/client.go index 7346639..ccc411b 100644 --- a/internal/jobs/twitterapify/client.go +++ b/internal/jobs/twitterapify/client.go @@ -5,7 +5,6 @@ import ( "fmt" util "github.com/masa-finance/tee-worker/pkg/util" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/pkg/client" "github.com/sirupsen/logrus" @@ -44,7 +43,7 @@ func (c *TwitterApifyClient) ValidateApiKey() error { } // GetFollowers retrieves followers for a username using Apify -func (c *TwitterApifyClient) GetFollowers(username string, maxResults uint, cursor client.Cursor) ([]*teetypes.ProfileResultApify, client.Cursor, error) { +func (c *TwitterApifyClient) GetFollowers(username string, maxResults uint, cursor client.Cursor) ([]*types.ProfileResultApify, client.Cursor, error) { minimum := uint(200) // Ensure minimum of 200 as required by the actor @@ -63,7 +62,7 @@ func (c *TwitterApifyClient) GetFollowers(username string, maxResults uint, curs } // GetFollowing retrieves following for a username using Apify -func (c *TwitterApifyClient) GetFollowing(username string, cursor client.Cursor, maxResults uint) ([]*teetypes.ProfileResultApify, client.Cursor, error) { +func (c *TwitterApifyClient) GetFollowing(username string, cursor client.Cursor, maxResults uint) ([]*types.ProfileResultApify, client.Cursor, error) { minimum := uint(200) // Ensure minimum of 200 as required by the actor @@ -82,15 +81,15 @@ func (c *TwitterApifyClient) GetFollowing(username string, cursor client.Cursor, } // getProfiles runs the actor and retrieves profiles from the dataset -func (c *TwitterApifyClient) getProfiles(input FollowerActorRunRequest, cursor client.Cursor, limit uint) ([]*teetypes.ProfileResultApify, client.Cursor, error) { +func (c *TwitterApifyClient) getProfiles(input FollowerActorRunRequest, cursor client.Cursor, limit uint) ([]*types.ProfileResultApify, client.Cursor, error) { dataset, nextCursor, err := c.apifyClient.RunActorAndGetResponse(apify.ActorIds.TwitterFollowers, input, cursor, limit) if err != nil { return nil, client.EmptyCursor, err } - profiles := make([]*teetypes.ProfileResultApify, 0, len(dataset.Data.Items)) + profiles := make([]*types.ProfileResultApify, 0, len(dataset.Data.Items)) for i, item := range dataset.Data.Items { - var profile teetypes.ProfileResultApify + var profile types.ProfileResultApify if err := json.Unmarshal(item, &profile); err != nil { logrus.Warnf("Failed to unmarshal profile at index %d: %v", i, err) continue diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 2988d63..582196a 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -14,14 +14,12 @@ import ( 
"github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/pkg/util" - teetypes "github.com/masa-finance/tee-worker/api/types" ) // WebApifyClient defines the interface for the Web Apify client to allow mocking in tests type WebApifyClient interface { - Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, string, client.Cursor, error) + Scrape(workerID string, args args.WebArguments, cursor client.Cursor) ([]*types.WebScraperResult, string, client.Cursor, error) } // NewWebApifyClient is a function variable that can be replaced in tests. @@ -33,7 +31,7 @@ var NewWebApifyClient = func(apiKey string, statsCollector *stats.StatsCollector // LLMApify is the interface for the LLM processor client // Only the Process method is required for this flow type LLMApify interface { - Process(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) + Process(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) } // NewLLMApifyClient is a function variable to allow injection in tests @@ -44,7 +42,7 @@ var NewLLMApifyClient = func(apiKey string, llmConfig config.LlmConfig, statsCol type WebScraper struct { configuration config.WebConfig statsCollector *stats.StatsCollector - capabilities []teetypes.Capability + capabilities []types.Capability } func NewWebScraper(jc config.JobConfiguration, statsCollector *stats.StatsCollector) *WebScraper { @@ -53,7 +51,7 @@ func NewWebScraper(jc config.JobConfiguration, statsCollector *stats.StatsCollec return &WebScraper{ configuration: cfg, statsCollector: statsCollector, - capabilities: teetypes.WebCaps, + capabilities: types.WebCaps, } } @@ -66,13 +64,13 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: msg.Error()}, msg } - jobArgs, err := teeargs.UnmarshalJobArguments(teetypes.JobType(j.Type), map[string]any(j.Arguments)) + jobArgs, err := args.UnmarshalJobArguments(types.JobType(j.Type), map[string]any(j.Arguments)) if err != nil { msg := fmt.Errorf("failed to unmarshal job arguments: %w", err) return types.JobResult{Error: msg.Error()}, msg } - webArgs, ok := jobArgs.(*teeargs.WebArguments) + webArgs, ok := jobArgs.(*args.WebArguments) if !ok { return types.JobResult{Error: "invalid argument type for Web job"}, errors.New("invalid argument type") } @@ -98,11 +96,11 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: "error creating LLM Apify client"}, fmt.Errorf("failed to create LLM Apify client: %w", err) } - llmArgs := teeargs.LLMProcessorArguments{ + llmArgs := args.LLMProcessorArguments{ DatasetId: datasetId, Prompt: "summarize the content of this webpage, focusing on keywords and topics: ${markdown}", - MaxTokens: teeargs.LLMDefaultMaxTokens, - Temperature: teeargs.LLMDefaultTemperature, + MaxTokens: args.LLMDefaultMaxTokens, + Temperature: args.LLMDefaultTemperature, Items: uint(len(webResp)), } llmResp, _, llmErr := llmClient.Process(j.WorkerID, llmArgs, client.EmptyCursor) @@ -135,11 +133,11 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { // GetStructuredCapabilities returns the structured capabilities supported by the Web scraper // based on the available credentials and API keys -func (ws 
*WebScraper) GetStructuredCapabilities() teetypes.WorkerCapabilities { - capabilities := make(teetypes.WorkerCapabilities) +func (ws *WebScraper) GetStructuredCapabilities() types.WorkerCapabilities { + capabilities := make(types.WorkerCapabilities) if ws.configuration.ApifyApiKey != "" && ws.configuration.GeminiApiKey.IsValid() { - capabilities[teetypes.WebJob] = teetypes.WebCaps + capabilities[types.WebJob] = types.WebCaps } return capabilities diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index ec23026..e3ceda5 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -16,16 +16,14 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-worker/api/args" - teetypes "github.com/masa-finance/tee-worker/api/types" ) // MockWebApifyClient is a mock implementation of the WebApifyClient. type MockWebApifyClient struct { - ScrapeFunc func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, string, client.Cursor, error) + ScrapeFunc func(args args.WebArguments) ([]*types.WebScraperResult, string, client.Cursor, error) } -func (m *MockWebApifyClient) Scrape(_ string, args teeargs.WebArguments, _ client.Cursor) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { +func (m *MockWebApifyClient) Scrape(_ string, args args.WebArguments, _ client.Cursor) ([]*types.WebScraperResult, string, client.Cursor, error) { if m != nil && m.ScrapeFunc != nil { res, datasetId, next, err := m.ScrapeFunc(args) return res, datasetId, next, err @@ -36,14 +34,14 @@ func (m *MockWebApifyClient) Scrape(_ string, args teeargs.WebArguments, _ clien // MockLLMApifyClient is a mock implementation of the LLMApify interface // used to prevent external calls during unit tests. 
type MockLLMApifyClient struct { - ProcessFunc func(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) + ProcessFunc func(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) } -func (m *MockLLMApifyClient) Process(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) { +func (m *MockLLMApifyClient) Process(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) { if m != nil && m.ProcessFunc != nil { return m.ProcessFunc(workerID, args, cursor) } - return []*teetypes.LLMProcessorResult{}, client.EmptyCursor, nil + return []*types.LLMProcessorResult{}, client.EmptyCursor, nil } var _ = Describe("WebScraper", func() { @@ -68,9 +66,9 @@ var _ = Describe("WebScraper", func() { scraper = jobs.NewWebScraper(cfg, statsCollector) mockClient = &MockWebApifyClient{} mockLLM = &MockLLMApifyClient{ - ProcessFunc: func(workerID string, args teeargs.LLMProcessorArguments, cursor client.Cursor) ([]*teetypes.LLMProcessorResult, client.Cursor, error) { + ProcessFunc: func(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) { // Return a single empty summary to avoid changing expectations - return []*teetypes.LLMProcessorResult{{LLMResponse: ""}}, client.EmptyCursor, nil + return []*types.LLMProcessorResult{{LLMResponse: ""}}, client.EmptyCursor, nil }, } @@ -84,7 +82,7 @@ var _ = Describe("WebScraper", func() { job = types.Job{ UUID: "test-uuid", - Type: teetypes.WebJob, + Type: types.WebJob, } }) @@ -103,22 +101,22 @@ var _ = Describe("WebScraper", func() { It("should call Scrape and return data and next cursor", func() { job.Arguments = map[string]any{ - "type": teetypes.WebScraper, + "type": types.WebScraper, "url": "https://example.com", "max_depth": 1, "max_pages": 2, } - mockClient.ScrapeFunc = func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { + mockClient.ScrapeFunc = func(args args.WebArguments) ([]*types.WebScraperResult, string, client.Cursor, error) { Expect(args.URL).To(Equal("https://example.com")) - return []*teetypes.WebScraperResult{{URL: "https://example.com", Markdown: "# Hello"}}, "dataset-123", client.Cursor("next-cursor"), nil + return []*types.WebScraperResult{{URL: "https://example.com", Markdown: "# Hello"}}, "dataset-123", client.Cursor("next-cursor"), nil } result, err := scraper.ExecuteJob(job) Expect(err).NotTo(HaveOccurred()) Expect(result.NextCursor).To(Equal("next-cursor")) - var resp []*teetypes.WebScraperResult + var resp []*types.WebScraperResult err = json.Unmarshal(result.Data, &resp) Expect(err).NotTo(HaveOccurred()) Expect(resp).To(HaveLen(1)) @@ -128,14 +126,14 @@ var _ = Describe("WebScraper", func() { It("should handle errors from the web client", func() { job.Arguments = map[string]any{ - "type": teetypes.WebScraper, + "type": types.WebScraper, "url": "https://example.com", "max_depth": 0, "max_pages": 1, } expectedErr := errors.New("client error") - mockClient.ScrapeFunc = func(args teeargs.WebArguments) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { + mockClient.ScrapeFunc = func(args args.WebArguments) ([]*types.WebScraperResult, string, client.Cursor, error) { return nil, "", client.EmptyCursor, expectedErr } @@ -150,7 +148,7 @@ var _ = 
Describe("WebScraper", func() { return nil, errors.New("client creation failed") } job.Arguments = map[string]any{ - "type": teetypes.WebScraper, + "type": types.WebScraper, "url": "https://example.com", "max_depth": 0, "max_pages": 1, @@ -202,9 +200,9 @@ var _ = Describe("WebScraper", func() { job := types.Job{ UUID: "integration-test-uuid", - Type: teetypes.WebJob, + Type: types.WebJob, Arguments: map[string]any{ - "type": teetypes.WebScraper, + "type": types.WebScraper, "url": "https://docs.learnbittensor.org", "max_depth": maxDepth, "max_pages": maxPages, @@ -216,7 +214,7 @@ var _ = Describe("WebScraper", func() { Expect(result.Error).To(BeEmpty()) Expect(result.Data).NotTo(BeEmpty()) - var resp []*teetypes.WebScraperResult + var resp []*types.WebScraperResult err = json.Unmarshal(result.Data, &resp) Expect(err).NotTo(HaveOccurred()) @@ -241,10 +239,10 @@ var _ = Describe("WebScraper", func() { caps := integrationScraper.GetStructuredCapabilities() if apifyKey != "" && geminiKey != "" { - Expect(caps[teetypes.WebJob]).NotTo(BeEmpty()) + Expect(caps[types.WebJob]).NotTo(BeEmpty()) } else { // Expect no capabilities when either key is missing - _, ok := caps[teetypes.WebJob] + _, ok := caps[types.WebJob] Expect(ok).To(BeFalse()) } }) diff --git a/internal/jobs/webapify/client.go b/internal/jobs/webapify/client.go index 7ed29cf..866238f 100644 --- a/internal/jobs/webapify/client.go +++ b/internal/jobs/webapify/client.go @@ -4,8 +4,6 @@ import ( "encoding/json" "fmt" - teeargs "github.com/masa-finance/tee-worker/api/args" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" @@ -41,7 +39,7 @@ func (c *ApifyClient) ValidateApiKey() error { return c.client.ValidateApiKey() } -func (c *ApifyClient) Scrape(workerID string, args teeargs.WebArguments, cursor client.Cursor) ([]*teetypes.WebScraperResult, string, client.Cursor, error) { +func (c *ApifyClient) Scrape(workerID string, args args.WebArguments, cursor client.Cursor) ([]*types.WebScraperResult, string, client.Cursor, error) { if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.WebQueries, 1) } @@ -57,10 +55,10 @@ func (c *ApifyClient) Scrape(workerID string, args teeargs.WebArguments, cursor return nil, "", client.EmptyCursor, err } - response := make([]*teetypes.WebScraperResult, 0, len(dataset.Data.Items)) + response := make([]*types.WebScraperResult, 0, len(dataset.Data.Items)) for i, item := range dataset.Data.Items { - var resp teetypes.WebScraperResult + var resp types.WebScraperResult if err := json.Unmarshal(item, &resp); err != nil { logrus.Warnf("Failed to unmarshal scrape result at index %d: %v", i, err) continue diff --git a/internal/jobs/webapify/client_test.go b/internal/jobs/webapify/client_test.go index aaefcb6..65c7f19 100644 --- a/internal/jobs/webapify/client_test.go +++ b/internal/jobs/webapify/client_test.go @@ -12,7 +12,6 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" - teeargs "github.com/masa-finance/tee-worker/api/args" ) // MockApifyClient is a mock implementation of the ApifyClient. 
@@ -66,7 +65,7 @@ var _ = Describe("WebApifyClient", func() { Describe("Scrape", func() { It("should construct the correct actor input", func() { - args := teeargs.WebArguments{ + args := args.WebArguments{ URL: "https://example.com", MaxDepth: 1, MaxPages: 2, @@ -88,7 +87,7 @@ var _ = Describe("WebApifyClient", func() { return nil, "", expectedErr } - args := teeargs.WebArguments{ + args := args.WebArguments{ URL: "https://example.com", MaxDepth: 0, MaxPages: 1, @@ -108,7 +107,7 @@ var _ = Describe("WebApifyClient", func() { return dataset, "next", nil } - args := teeargs.WebArguments{ + args := args.WebArguments{ URL: "https://example.com", MaxDepth: 0, MaxPages: 1, @@ -133,7 +132,7 @@ var _ = Describe("WebApifyClient", func() { return dataset, "next", nil } - args := teeargs.WebArguments{ + args := args.WebArguments{ URL: "https://example.com", MaxDepth: 0, MaxPages: 1, @@ -194,7 +193,7 @@ var _ = Describe("WebApifyClient", func() { realClient, err := webapify.NewClient(apifyKey, nil) Expect(err).NotTo(HaveOccurred()) - args := teeargs.WebArguments{ + args := args.WebArguments{ URL: "https://example.com", MaxDepth: 0, MaxPages: 1, diff --git a/internal/jobserver/jobserver.go b/internal/jobserver/jobserver.go index e9af738..17de567 100644 --- a/internal/jobserver/jobserver.go +++ b/internal/jobserver/jobserver.go @@ -11,7 +11,6 @@ import ( "golang.org/x/exp/maps" "github.com/google/uuid" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs" @@ -28,7 +27,7 @@ type JobServer struct { results *ResultCache jobConfiguration config.JobConfiguration - jobWorkers map[teetypes.JobType]*jobWorkerEntry + jobWorkers map[types.JobType]*jobWorkerEntry executedJobs map[string]bool } @@ -80,32 +79,32 @@ func NewJobServer(workers int, jc config.JobConfiguration) *JobServer { // Initialize job workers logrus.Info("Setting up job workers...") - jobworkers := map[teetypes.JobType]*jobWorkerEntry{ - teetypes.WebJob: { + jobworkers := map[types.JobType]*jobWorkerEntry{ + types.WebJob: { w: jobs.NewWebScraper(jc, s), }, - teetypes.TwitterJob: { + types.TwitterJob: { w: jobs.NewTwitterScraper(jc, s), }, - teetypes.TwitterCredentialJob: { + types.TwitterCredentialJob: { w: jobs.NewTwitterScraper(jc, s), // Uses the same implementation as standard Twitter scraper }, - teetypes.TwitterApiJob: { + types.TwitterApiJob: { w: jobs.NewTwitterScraper(jc, s), // Uses the same implementation as standard Twitter scraper }, - teetypes.TwitterApifyJob: { + types.TwitterApifyJob: { w: jobs.NewTwitterScraper(jc, s), // Register Apify job type with Twitter scraper }, - teetypes.TiktokJob: { + types.TiktokJob: { w: jobs.NewTikTokScraper(jc, s), }, - teetypes.RedditJob: { + types.RedditJob: { w: jobs.NewRedditScraper(jc, s), }, - teetypes.LinkedInJob: { + types.LinkedInJob: { w: jobs.NewLinkedInScraper(jc, s), }, - teetypes.TelemetryJob: { + types.TelemetryJob: { w: jobs.NewTelemetryJob(jc, s), }, } @@ -155,15 +154,15 @@ func NewJobServer(workers int, jc config.JobConfiguration) *JobServer { } // GetWorkerCapabilities returns the structured capabilities for all registered workers -func (js *JobServer) GetWorkerCapabilities() teetypes.WorkerCapabilities { +func (js *JobServer) GetWorkerCapabilities() types.WorkerCapabilities { // Use a map to deduplicate capabilities by job type - jobTypeCapMap := make(map[teetypes.JobType]map[teetypes.Capability]struct{}) + jobTypeCapMap := 
make(map[types.JobType]map[types.Capability]struct{}) for _, workerEntry := range js.jobWorkers { workerCapabilities := workerEntry.w.GetStructuredCapabilities() for jobType, capabilities := range workerCapabilities { if _, exists := jobTypeCapMap[jobType]; !exists { - jobTypeCapMap[jobType] = make(map[teetypes.Capability]struct{}) + jobTypeCapMap[jobType] = make(map[types.Capability]struct{}) } for _, capability := range capabilities { jobTypeCapMap[jobType][capability] = struct{}{} @@ -172,7 +171,7 @@ func (js *JobServer) GetWorkerCapabilities() teetypes.WorkerCapabilities { } // Convert to final map format - allCapabilities := make(teetypes.WorkerCapabilities) + allCapabilities := make(types.WorkerCapabilities) for jobType, capabilitySet := range jobTypeCapMap { capabilities := maps.Keys(capabilitySet) allCapabilities[jobType] = capabilities @@ -203,7 +202,7 @@ func (js *JobServer) AddJob(j types.Job) (string, error) { return "", errors.New("this job is not for this worker") } - if j.Type != teetypes.TelemetryJob && config.MinersWhiteList != "" { + if j.Type != types.TelemetryJob && config.MinersWhiteList != "" { var miners []string // In standalone mode, we just whitelist ourselves diff --git a/internal/jobserver/jobserver_test.go b/internal/jobserver/jobserver_test.go index b908a37..adce7e9 100644 --- a/internal/jobserver/jobserver_test.go +++ b/internal/jobserver/jobserver_test.go @@ -8,7 +8,6 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" @@ -24,7 +23,7 @@ var _ = Describe("Jobserver", func() { jobserver := NewJobServer(2, config.JobConfiguration{}) uuid, err := jobserver.AddJob(types.Job{ - Type: teetypes.WebJob, + Type: types.WebJob, Arguments: map[string]any{ "url": "google", }, @@ -51,7 +50,7 @@ var _ = Describe("Jobserver", func() { jobserver := NewJobServer(2, config.JobConfiguration{}) uuid, err := jobserver.AddJob(types.Job{ - Type: teetypes.WebJob, + Type: types.WebJob, Arguments: map[string]any{ "url": "google", }, @@ -64,7 +63,7 @@ var _ = Describe("Jobserver", func() { Expect(err.Error()).To(ContainSubstring("this job is not from a whitelisted miner")) uuid, err = jobserver.AddJob(types.Job{ - Type: teetypes.WebJob, + Type: types.WebJob, WorkerID: "miner1", Arguments: map[string]any{ "url": "google", @@ -81,7 +80,7 @@ var _ = Describe("Jobserver", func() { jobserver := NewJobServer(2, config.JobConfiguration{}) uuid, err := jobserver.AddJob(types.Job{ - Type: teetypes.WebJob, + Type: types.WebJob, Arguments: map[string]any{ "url": "google", }, @@ -96,7 +95,7 @@ var _ = Describe("Jobserver", func() { Expect(exists).ToNot(BeTrue()) uuid, err = jobserver.AddJob(types.Job{ - Type: teetypes.WebJob, + Type: types.WebJob, Arguments: map[string]any{ "url": "google", }, diff --git a/internal/jobserver/worker.go b/internal/jobserver/worker.go index 939b199..cca9fa4 100644 --- a/internal/jobserver/worker.go +++ b/internal/jobserver/worker.go @@ -4,7 +4,6 @@ import ( "context" "fmt" - teetypes "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/sirupsen/logrus" ) @@ -26,7 +25,7 @@ func (js *JobServer) worker(c context.Context) { } type worker interface { - GetStructuredCapabilities() teetypes.WorkerCapabilities + GetStructuredCapabilities() types.WorkerCapabilities ExecuteJob(j types.Job) (types.JobResult, error) } From 
1dd1e0f7c58c450b93b43c1b1dd8cd48f61ac7fa Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 13 Oct 2025 21:33:40 +0200 Subject: [PATCH 112/136] feat: migrate tee types to worker --- api/types/jobs.go | 11 ++++++ internal/api/api_test.go | 1 + internal/api/routes.go | 3 +- internal/capabilities/detector.go | 3 +- internal/capabilities/detector_test.go | 1 + internal/jobs/linkedin.go | 1 + internal/jobs/llmapify/client.go | 2 ++ internal/jobs/llmapify/client_test.go | 41 +++++++++++----------- internal/jobs/reddit.go | 13 ++++--- internal/jobs/reddit_test.go | 44 +++++++++++------------- internal/jobs/redditapify/client.go | 15 ++++---- internal/jobs/redditapify/client_test.go | 3 +- internal/jobs/stats/stats.go | 3 +- internal/jobs/tiktok.go | 1 + internal/jobs/tiktokapify/client.go | 2 ++ internal/jobs/twitter.go | 2 +- internal/jobs/twitterapify/client.go | 1 + internal/jobs/web.go | 1 + internal/jobs/web_test.go | 2 +- internal/jobs/webapify/client.go | 2 ++ internal/jobs/webapify/client_test.go | 2 +- 21 files changed, 89 insertions(+), 65 deletions(-) diff --git a/api/types/jobs.go b/api/types/jobs.go index d9a49b0..a48dfa2 100644 --- a/api/types/jobs.go +++ b/api/types/jobs.go @@ -230,3 +230,14 @@ type JobRequest struct { type JobError struct { Error string `json:"error"` } + +// Key represents a key request +type Key struct { + Key string `json:"key"` + Signature string `json:"signature"` +} + +// KeyResponse represents a response to a key operation +type KeyResponse struct { + Status string `json:"status"` +} diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 3eeeedf..a946535 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -10,6 +10,7 @@ import ( . "github.com/onsi/gomega" "github.com/sirupsen/logrus" + "github.com/masa-finance/tee-worker/api/types" . 
"github.com/masa-finance/tee-worker/internal/api" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/pkg/client" diff --git a/internal/api/routes.go b/internal/api/routes.go index f8efcb9..7fe1b66 100644 --- a/internal/api/routes.go +++ b/internal/api/routes.go @@ -9,6 +9,7 @@ import ( "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/jobserver" "github.com/masa-finance/tee-worker/pkg/tee" + "github.com/sirupsen/logrus" ) @@ -85,7 +86,7 @@ func status(jobServer *jobserver.JobServer) func(c echo.Context) error { return c.JSON(http.StatusInternalServerError, types.JobError{Error: res.Error}) } - sealedData, err := res.Seal() + sealedData, err := teejob.SealJobResult(&res) if err != nil { logrus.Errorf("Error while sealing status response for job %s: %s", res.Job.UUID, err) return c.JSON(http.StatusInternalServerError, types.JobError{Error: err.Error()}) diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index 3d97de8..b1fa4e9 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -6,11 +6,12 @@ import ( "maps" - util "github.com/masa-finance/tee-worker/pkg/util" + "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/twitter" "github.com/masa-finance/tee-worker/pkg/client" + util "github.com/masa-finance/tee-worker/pkg/util" "github.com/sirupsen/logrus" ) diff --git a/internal/capabilities/detector_test.go b/internal/capabilities/detector_test.go index 376fc7d..877892e 100644 --- a/internal/capabilities/detector_test.go +++ b/internal/capabilities/detector_test.go @@ -7,6 +7,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/masa-finance/tee-worker/api/types" . "github.com/masa-finance/tee-worker/internal/capabilities" "github.com/masa-finance/tee-worker/internal/config" ) diff --git a/internal/jobs/linkedin.go b/internal/jobs/linkedin.go index 83d30ea..4175fff 100644 --- a/internal/jobs/linkedin.go +++ b/internal/jobs/linkedin.go @@ -7,6 +7,7 @@ import ( "github.com/sirupsen/logrus" + "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/linkedinapify" diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go index 364a507..c367e2b 100644 --- a/internal/jobs/llmapify/client.go +++ b/internal/jobs/llmapify/client.go @@ -5,6 +5,8 @@ import ( "errors" "fmt" + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index 9a85f8a..bdd601d 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -10,11 +10,12 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/llmapify" "github.com/masa-finance/tee-worker/pkg/client" - ) // MockApifyClient is a mock implementation of the ApifyClient. @@ -66,15 +67,15 @@ var _ = Describe("LLMApifyClient", func() { Describe("Process", func() { It("should construct the correct actor input", func() { - args := args.LLMProcessorArguments{ + llmArgs := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } // Marshal and unmarshal to apply defaults - jsonData, err := json.Marshal(args) + jsonData, err := json.Marshal(llmArgs) Expect(err).ToNot(HaveOccurred()) - err = json.Unmarshal(jsonData, &args) + err = json.Unmarshal(jsonData, &llmArgs) Expect(err).ToNot(HaveOccurred()) mockClient.RunActorAndGetResponseFunc = func(actorID apify.ActorId, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { @@ -86,7 +87,7 @@ var _ = Describe("LLMApifyClient", func() { Expect(ok).To(BeTrue()) Expect(request.InputDatasetId).To(Equal("test-dataset-id")) Expect(request.Prompt).To(Equal("test-prompt")) - Expect(request.LLMProviderApiKey).To(Equal("test-claude-llm-key")) // should be set from constructor + Expect(request.LLMProviderApiKey).To(Equal("test-claude-llm-key")) // should be set from constructor Expect(request.Model).To(Equal(args.LLMDefaultClaudeModel)) // default model Expect(request.MultipleColumns).To(Equal(args.LLMDefaultMultipleColumns)) // default value Expect(request.MaxTokens).To(Equal(args.LLMDefaultMaxTokens)) // default value @@ -95,7 +96,7 @@ var _ = Describe("LLMApifyClient", func() { return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil } - _, _, processErr := llmClient.Process("test-worker", args, client.EmptyCursor) + _, _, processErr := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(processErr).NotTo(HaveOccurred()) }) @@ -105,11 +106,11 @@ var _ = Describe("LLMApifyClient", func() { return nil, "", expectedErr } - args := args.LLMProcessorArguments{ + llmArgs := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } - _, _, err := llmClient.Process("test-worker", args, client.EmptyCursor) + _, _, err := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).To(MatchError(expectedErr)) }) @@ -124,11 +125,11 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - args := args.LLMProcessorArguments{ + llmArgs := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } - results, _, err := llmClient.Process("test-worker", args, client.EmptyCursor) + results, _, err := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(results).To(BeEmpty()) // The invalid item should be skipped }) @@ -146,11 +147,11 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - args := args.LLMProcessorArguments{ + llmArgs := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } - results, cursor, err := llmClient.Process("test-worker", args, client.EmptyCursor) + results, cursor, err := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(cursor).To(Equal(client.Cursor("next"))) 
Expect(results).To(HaveLen(1)) @@ -173,11 +174,11 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - args := args.LLMProcessorArguments{ + llmArgs := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } - results, _, err := llmClient.Process("test-worker", args, client.EmptyCursor) + results, _, err := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(results).To(HaveLen(2)) Expect(results[0].LLMResponse).To(Equal("First summary.")) @@ -185,7 +186,7 @@ var _ = Describe("LLMApifyClient", func() { }) It("should use custom values when provided", func() { - args := args.LLMProcessorArguments{ + llmArgs := args.LLMProcessorArguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", MaxTokens: 500, @@ -202,7 +203,7 @@ var _ = Describe("LLMApifyClient", func() { return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil } - _, _, err := llmClient.Process("test-worker", args, client.EmptyCursor) + _, _, err := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) }) }) @@ -256,17 +257,17 @@ var _ = Describe("LLMApifyClient", func() { realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: config.LlmApiKey(geminiKey)}, nil) Expect(err).NotTo(HaveOccurred()) - args := args.LLMProcessorArguments{ + llmArgs := args.LLMProcessorArguments{ DatasetId: "V6tyuuZIgfiETl1cl", Prompt: "summarize the content of this webpage ${markdown}", } // Marshal and unmarshal to apply defaults - jsonData, err := json.Marshal(args) + jsonData, err := json.Marshal(llmArgs) Expect(err).ToNot(HaveOccurred()) - err = json.Unmarshal(jsonData, &args) + err = json.Unmarshal(jsonData, &llmArgs) Expect(err).ToNot(HaveOccurred()) - results, cursor, err := realClient.Process("test-worker", args, client.EmptyCursor) + results, cursor, err := realClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(results).NotTo(BeEmpty()) Expect(results[0]).NotTo(BeNil()) diff --git a/internal/jobs/reddit.go b/internal/jobs/reddit.go index 7427332..a3f9039 100644 --- a/internal/jobs/reddit.go +++ b/internal/jobs/reddit.go @@ -9,22 +9,21 @@ import ( "github.com/sirupsen/logrus" - "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/redditapify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - ) // RedditApifyClient defines the interface for the Reddit Apify client. // This allows for mocking in tests. 
type RedditApifyClient interface { - ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) - SearchPosts(workerID string, queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) - SearchCommunities(workerID string, queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) - SearchUsers(workerID string, queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) + ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) + SearchPosts(workerID string, queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) + SearchCommunities(workerID string, queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) + SearchUsers(workerID string, queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) } // NewRedditApifyClient is a function variable that can be replaced in tests. @@ -103,7 +102,7 @@ func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) { } } -func processRedditResponse(j types.Job, resp []*reddit.Response, cursor client.Cursor, err error) (types.JobResult, error) { +func processRedditResponse(j types.Job, resp []*types.RedditItem, cursor client.Cursor, err error) (types.JobResult, error) { if err != nil { return types.JobResult{Error: fmt.Sprintf("error while scraping Reddit: %s", err.Error())}, fmt.Errorf("error scraping Reddit: %w", err) } diff --git a/internal/jobs/reddit_test.go b/internal/jobs/reddit_test.go index e326ed8..7a572ad 100644 --- a/internal/jobs/reddit_test.go +++ b/internal/jobs/reddit_test.go @@ -9,25 +9,23 @@ import ( . "github.com/onsi/gomega" "github.com/sirupsen/logrus" - "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs" "github.com/masa-finance/tee-worker/internal/jobs/redditapify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - ) // MockRedditApifyClient is a mock implementation of the RedditApifyClient. 
type MockRedditApifyClient struct { - ScrapeUrlsFunc func(urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) - SearchPostsFunc func(queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) - SearchCommunitiesFunc func(queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) - SearchUsersFunc func(queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) + ScrapeUrlsFunc func(urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) + SearchPostsFunc func(queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) + SearchCommunitiesFunc func(queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) + SearchUsersFunc func(queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) } -func (m *MockRedditApifyClient) ScrapeUrls(_ string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { +func (m *MockRedditApifyClient) ScrapeUrls(_ string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { if m != nil && m.ScrapeUrlsFunc != nil { res, cursor, err := m.ScrapeUrlsFunc(urls, after, args, cursor, maxResults) for i, r := range res { @@ -38,21 +36,21 @@ func (m *MockRedditApifyClient) ScrapeUrls(_ string, urls []types.RedditStartURL return nil, "", nil } -func (m *MockRedditApifyClient) SearchPosts(_ string, queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { +func (m *MockRedditApifyClient) SearchPosts(_ string, queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { if m != nil && m.SearchPostsFunc != nil { return m.SearchPostsFunc(queries, after, args, cursor, maxResults) } return nil, "", nil } -func (m *MockRedditApifyClient) SearchCommunities(_ string, queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { +func (m *MockRedditApifyClient) SearchCommunities(_ string, queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { if m != nil && m.SearchCommunitiesFunc != nil { return m.SearchCommunitiesFunc(queries, args, cursor, maxResults) } return nil, "", nil } -func (m *MockRedditApifyClient) SearchUsers(_ string, queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { +func (m *MockRedditApifyClient) SearchUsers(_ string, queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { if m 
!= nil && m.SearchUsersFunc != nil { return m.SearchUsersFunc(queries, skipPosts, args, cursor, maxResults) } @@ -103,16 +101,16 @@ var _ = Describe("RedditScraper", func() { "urls": testUrls, } - mockClient.ScrapeUrlsFunc = func(urls []types.RedditStartURL, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { + mockClient.ScrapeUrlsFunc = func(urls []types.RedditStartURL, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { Expect(urls).To(HaveLen(1)) Expect(urls[0].URL).To(Equal(testUrls[0])) - return []*reddit.Response{{TypeSwitch: &reddit.TypeSwitch{Type: reddit.UserResponse}, User: &reddit.User{ID: "user1", DataType: string(reddit.UserResponse)}}}, "next", nil + return []*types.RedditItem{{TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditUserItem}, User: &types.RedditUser{ID: "user1", DataType: string(types.RedditUserItem)}}}, "next", nil } result, err := scraper.ExecuteJob(job) Expect(err).NotTo(HaveOccurred()) Expect(result.NextCursor).To(Equal("next")) - var resp []*reddit.Response + var resp []*types.RedditItem err = json.Unmarshal(result.Data, &resp) Expect(err).NotTo(HaveOccurred()) Expect(resp).To(HaveLen(1)) @@ -127,15 +125,15 @@ var _ = Describe("RedditScraper", func() { "queries": []string{"user-query"}, } - mockClient.SearchUsersFunc = func(queries []string, skipPosts bool, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { + mockClient.SearchUsersFunc = func(queries []string, skipPosts bool, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { Expect(queries).To(Equal([]string{"user-query"})) - return []*reddit.Response{{TypeSwitch: &reddit.TypeSwitch{Type: reddit.UserResponse}, User: &reddit.User{ID: "user2", DataType: string(reddit.UserResponse)}}}, "next-user", nil + return []*types.RedditItem{{TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditUserItem}, User: &types.RedditUser{ID: "user2", DataType: string(types.RedditUserItem)}}}, "next-user", nil } result, err := scraper.ExecuteJob(job) Expect(err).NotTo(HaveOccurred()) Expect(result.NextCursor).To(Equal("next-user")) - var resp []*reddit.Response + var resp []*types.RedditItem err = json.Unmarshal(result.Data, &resp) Expect(err).NotTo(HaveOccurred()) Expect(resp).To(HaveLen(1)) @@ -150,15 +148,15 @@ var _ = Describe("RedditScraper", func() { "queries": []string{"post-query"}, } - mockClient.SearchPostsFunc = func(queries []string, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { + mockClient.SearchPostsFunc = func(queries []string, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { Expect(queries).To(Equal([]string{"post-query"})) - return []*reddit.Response{{TypeSwitch: &reddit.TypeSwitch{Type: reddit.PostResponse}, Post: &reddit.Post{ID: "post1", DataType: string(reddit.PostResponse)}}}, "next-post", nil + return []*types.RedditItem{{TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditPostItem}, Post: &types.RedditPost{ID: "post1", DataType: string(types.RedditPostItem)}}}, "next-post", nil } result, err := scraper.ExecuteJob(job) Expect(err).NotTo(HaveOccurred()) Expect(result.NextCursor).To(Equal("next-post")) - var resp []*reddit.Response + var resp 
[]*types.RedditItem err = json.Unmarshal(result.Data, &resp) Expect(err).NotTo(HaveOccurred()) Expect(resp).To(HaveLen(1)) @@ -173,15 +171,15 @@ var _ = Describe("RedditScraper", func() { "queries": []string{"community-query"}, } - mockClient.SearchCommunitiesFunc = func(queries []string, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { + mockClient.SearchCommunitiesFunc = func(queries []string, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { Expect(queries).To(Equal([]string{"community-query"})) - return []*reddit.Response{{TypeSwitch: &reddit.TypeSwitch{Type: reddit.CommunityResponse}, Community: &reddit.Community{ID: "comm1", DataType: string(reddit.CommunityResponse)}}}, "next-comm", nil + return []*types.RedditItem{{TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditCommunityItem}, Community: &types.RedditCommunity{ID: "comm1", DataType: string(types.RedditCommunityItem)}}}, "next-comm", nil } result, err := scraper.ExecuteJob(job) Expect(err).NotTo(HaveOccurred()) Expect(result.NextCursor).To(Equal("next-comm")) - var resp []*reddit.Response + var resp []*types.RedditItem err = json.Unmarshal(result.Data, &resp) Expect(err).NotTo(HaveOccurred()) Expect(resp).To(HaveLen(1)) @@ -208,7 +206,7 @@ var _ = Describe("RedditScraper", func() { } expectedErr := errors.New("client error") - mockClient.SearchPostsFunc = func(queries []string, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { + mockClient.SearchPostsFunc = func(queries []string, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { return nil, "", expectedErr } diff --git a/internal/jobs/redditapify/client.go b/internal/jobs/redditapify/client.go index 07dc090..4996062 100644 --- a/internal/jobs/redditapify/client.go +++ b/internal/jobs/redditapify/client.go @@ -9,7 +9,6 @@ import ( "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/api/types" - "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" @@ -101,7 +100,7 @@ func (c *RedditApifyClient) ValidateApiKey() error { } // ScrapeUrls scrapes Reddit URLs -func (c *RedditApifyClient) ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { +func (c *RedditApifyClient) ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { input := args.ToActorRequest() input.StartUrls = urls input.Searches = nil @@ -118,7 +117,7 @@ func (c *RedditApifyClient) ScrapeUrls(workerID string, urls []types.RedditStart } // SearchPosts searches Reddit posts -func (c *RedditApifyClient) SearchPosts(workerID string, queries []string, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) { +func (c *RedditApifyClient) SearchPosts(workerID string, queries []string, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) { input := args.ToActorRequest() input.Searches = queries 
input.StartUrls = nil
@@ -135,7 +134,7 @@ func (c *RedditApifyClient) SearchPosts(workerID string, queries []string, after
 }
 
 // SearchCommunities searches Reddit communities
-func (c *RedditApifyClient) SearchCommunities(workerID string, queries []string, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) {
+func (c *RedditApifyClient) SearchCommunities(workerID string, queries []string, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
 	input := args.ToActorRequest()
 	input.Searches = queries
 	input.StartUrls = nil
@@ -146,7 +145,7 @@ func (c *RedditApifyClient) SearchCommunities(workerID string, queries []string,
 }
 
 // SearchUsers searches Reddit users
-func (c *RedditApifyClient) SearchUsers(workerID string, queries []string, skipPosts bool, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*reddit.Response, client.Cursor, error) {
+func (c *RedditApifyClient) SearchUsers(workerID string, queries []string, skipPosts bool, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
 	input := args.ToActorRequest()
 	input.Searches = queries
 	input.StartUrls = nil
@@ -158,7 +157,7 @@ func (c *RedditApifyClient) SearchUsers(workerID string, queries []string, skipP
 }
 
 // queryReddit runs the actor and retrieves Reddit items from the dataset
-func (c *RedditApifyClient) queryReddit(workerID string, input RedditActorRequest, cursor client.Cursor, limit uint) ([]*reddit.Response, client.Cursor, error) {
+func (c *RedditApifyClient) queryReddit(workerID string, input RedditActorRequest, cursor client.Cursor, limit uint) ([]*types.RedditItem, client.Cursor, error) {
 	if c.statsCollector != nil {
 		c.statsCollector.Add(workerID, stats.RedditQueries, 1)
 	}
@@ -171,9 +170,9 @@ func (c *RedditApifyClient) queryReddit(workerID string, input RedditActorReques
 		return nil, client.EmptyCursor, err
 	}
 
-	response := make([]*reddit.Response, 0, len(dataset.Data.Items))
+	response := make([]*types.RedditItem, 0, len(dataset.Data.Items))
 	for i, item := range dataset.Data.Items {
-		var resp reddit.Response
+		var resp types.RedditItem
 		if err := json.Unmarshal(item, &resp); err != nil {
 			logrus.Warnf("Failed to unmarshal Reddit item at index %d: %v", i, err)
 			continue
diff --git a/internal/jobs/redditapify/client_test.go b/internal/jobs/redditapify/client_test.go
index c68577b..d75a804 100644
--- a/internal/jobs/redditapify/client_test.go
+++ b/internal/jobs/redditapify/client_test.go
@@ -8,10 +8,11 @@ import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 
+	"github.com/masa-finance/tee-worker/api/args"
+	"github.com/masa-finance/tee-worker/api/types"
 	"github.com/masa-finance/tee-worker/internal/apify"
 	"github.com/masa-finance/tee-worker/internal/jobs/redditapify"
 	"github.com/masa-finance/tee-worker/pkg/client"
-
 )
 
 // MockApifyClient is a mock implementation of the ApifyClient. 
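
The queryReddit hunk above decodes each raw dataset item individually and skips records that fail to unmarshal instead of failing the whole page. A minimal sketch of that decode-and-skip loop, with a hypothetical Item type standing in for types.RedditItem:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Item is a hypothetical stand-in for types.RedditItem; the field set
// here is illustrative only.
type Item struct {
	ID       string `json:"id"`
	DataType string `json:"dataType"`
}

func main() {
	// Simulated dataset.Data.Items: raw JSON blobs from the actor run,
	// one of which is malformed and should be skipped, not fatal.
	items := []json.RawMessage{
		json.RawMessage(`{"id":"post1","dataType":"post"}`),
		json.RawMessage(`not-json`),
		json.RawMessage(`{"id":"user1","dataType":"user"}`),
	}

	out := make([]*Item, 0, len(items))
	for i, raw := range items {
		var it Item
		if err := json.Unmarshal(raw, &it); err != nil {
			// The real code logs via logrus.Warnf and continues.
			fmt.Printf("skipping item %d: %v\n", i, err)
			continue
		}
		out = append(out, &it)
	}
	fmt.Printf("decoded %d of %d items\n", len(out), len(items)) // decoded 2 of 3 items
}
```

One malformed actor item then costs a warning rather than the entire batch.
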
diff --git a/internal/jobs/stats/stats.go b/internal/jobs/stats/stats.go index 27cd6cb..85f1bd5 100644 --- a/internal/jobs/stats/stats.go +++ b/internal/jobs/stats/stats.go @@ -5,6 +5,7 @@ import ( "sync" "time" + "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/capabilities" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/versioning" @@ -65,7 +66,7 @@ type Stats struct { CurrentTimeUnix int64 `json:"current_time"` WorkerID string `json:"worker_id"` Stats map[string]map[StatType]uint `json:"stats"` - ReportedCapabilities types.WorkerCapabilities `json:"reported_capabilities"` + ReportedCapabilities types.WorkerCapabilities `json:"reported_capabilities"` WorkerVersion string `json:"worker_version"` ApplicationVersion string `json:"application_version"` sync.Mutex diff --git a/internal/jobs/tiktok.go b/internal/jobs/tiktok.go index 4c9002c..f3fe2f2 100644 --- a/internal/jobs/tiktok.go +++ b/internal/jobs/tiktok.go @@ -10,6 +10,7 @@ import ( "strings" "time" + "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" diff --git a/internal/jobs/tiktokapify/client.go b/internal/jobs/tiktokapify/client.go index ff531a5..75139a5 100644 --- a/internal/jobs/tiktokapify/client.go +++ b/internal/jobs/tiktokapify/client.go @@ -4,6 +4,8 @@ import ( "encoding/json" "fmt" + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/pkg/client" ) diff --git a/internal/jobs/twitter.go b/internal/jobs/twitter.go index 13e2ea1..e3a1991 100644 --- a/internal/jobs/twitter.go +++ b/internal/jobs/twitter.go @@ -8,10 +8,10 @@ import ( "strings" "time" - "github.com/masa-finance/tee-worker/internal/jobs/twitterx" "github.com/masa-finance/tee-worker/pkg/client" + "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" diff --git a/internal/jobs/twitterapify/client.go b/internal/jobs/twitterapify/client.go index ccc411b..27356dd 100644 --- a/internal/jobs/twitterapify/client.go +++ b/internal/jobs/twitterapify/client.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" + "github.com/masa-finance/tee-worker/api/types" util "github.com/masa-finance/tee-worker/pkg/util" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/pkg/client" diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 582196a..9cbc47e 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -7,6 +7,7 @@ import ( "github.com/sirupsen/logrus" + "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/llmapify" diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index e3ceda5..fc8ab79 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -8,6 +8,7 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs" @@ -15,7 +16,6 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" - ) // MockWebApifyClient is a mock implementation of the WebApifyClient. diff --git a/internal/jobs/webapify/client.go b/internal/jobs/webapify/client.go index 866238f..4d61666 100644 --- a/internal/jobs/webapify/client.go +++ b/internal/jobs/webapify/client.go @@ -4,6 +4,8 @@ import ( "encoding/json" "fmt" + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" diff --git a/internal/jobs/webapify/client_test.go b/internal/jobs/webapify/client_test.go index 65c7f19..32d5ff0 100644 --- a/internal/jobs/webapify/client_test.go +++ b/internal/jobs/webapify/client_test.go @@ -8,10 +8,10 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" - ) // MockApifyClient is a mock implementation of the ApifyClient. From 1e4212b489be34959aa81209b6a9dbd91b662f62 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 13 Oct 2025 21:39:05 +0200 Subject: [PATCH 113/136] fix: duplicate gh test workflows --- .github/workflows/tests.yaml | 67 ------------------------------------ 1 file changed, 67 deletions(-) delete mode 100644 .github/workflows/tests.yaml diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml deleted file mode 100644 index 3e6b924..0000000 --- a/.github/workflows/tests.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: Run Go Tests - -on: - push: - branches: - - '**' - pull_request: - branches: - - '**' - -jobs: - lint: - runs-on: ubuntu-latest - - steps: - - name: Install golangci-lint - run: sudo snap install golangci-lint --classic - - - name: Checkout code - uses: actions/checkout@v2 - - - name: Setup Golang with cache - uses: magnetikonline/action-golang-cache@v5 - with: - go-version-file: go.mod - - - name: Install dependencies - run: | - go mod tidy - - - name: Run tests - run: | - go mod tidy && git diff --exit-code - go mod download - go mod verify - gofmt -s -w . && git diff --exit-code - go vet ./... - golangci-lint run - - test: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v2 - - - name: Setup Golang with cache - uses: magnetikonline/action-golang-cache@v5 - with: - go-version-file: go.mod - - - name: Run unit tests - run: | - go test ./... 
-
-  ready-to-merge:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-
-      - name: Verify that merging is OK
-        run: |
-          if grep -rE 'DO[ ]NOT[ ]MERGE|[ ]FIXME' .; then
-            exit 1
-          fi

From a5a50666442ba5c0ed8dc650373bb712bfa5791d Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Mon, 13 Oct 2025 21:55:04 +0200
Subject: [PATCH 114/136] chore: cleanup reddit after refactor

---
 Makefile                            |  4 +-
 api/types/reddit.go                 | 86 +++++++++++++++++------
 api/types/reddit_test.go            | 18 +++---
 internal/jobs/reddit.go             | 10 ++--
 internal/jobs/reddit_test.go        | 42 +++++++------
 internal/jobs/redditapify/client.go | 14 ++---
 6 files changed, 96 insertions(+), 78 deletions(-)

diff --git a/Makefile b/Makefile
index 99f57d4..978ef8a 100644
--- a/Makefile
+++ b/Makefile
@@ -86,7 +86,9 @@ test-tiktok: docker-build-test
 	@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/tiktok_test.go ./internal/jobs/jobs_suite_test.go

 test-reddit: docker-build-test
-	@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/reddit_test.go ./internal/jobs/redditapify/client_test.go ./api/types/reddit/reddit_suite_test.go
+	@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) sh -c "cd /app && go test -v ./internal/jobs/reddit_test.go ./internal/jobs/jobs_suite_test.go"
+	@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) sh -c "cd /app && go test -v ./internal/jobs/redditapify"
+	@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) sh -c "cd /app && go test -v ./api/types/reddit_test.go"

 test-web: docker-build-test
 	@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) sh -c "cd /app && go test -v ./internal/jobs/web_test.go ./internal/jobs/jobs_suite_test.go"
diff --git a/api/types/reddit.go b/api/types/reddit.go
index f9664fa..f342b42 100644
--- a/api/types/reddit.go
+++ b/api/types/reddit.go
@@ -123,68 +123,84 @@ type RedditCommunity struct {
 	DataType string `json:"dataType"`
 }

-type RedditTypeSwitch struct {
-	Type RedditItemType `json:"type"`
+// RedditResponse represents a Reddit API response that can be any of the Reddit item types
+type RedditResponse struct {
+	Type      RedditItemType   `json:"type"`
+	User      *RedditUser      `json:"user,omitempty"`
+	Post      *RedditPost      `json:"post,omitempty"`
+	Comment   *RedditComment   `json:"comment,omitempty"`
+	Community *RedditCommunity `json:"community,omitempty"`
 }

-type RedditItem struct {
-	TypeSwitch *RedditTypeSwitch
-	User       *RedditUser
-	Post       *RedditPost
-	Comment    *RedditComment
-	Community  *RedditCommunity
-}
+// UnmarshalJSON implements custom JSON unmarshaling for RedditResponse
+func (r *RedditResponse) UnmarshalJSON(data []byte) error {
+	// First, unmarshal into a map to get the type
+	var raw map[string]json.RawMessage
+	if err := json.Unmarshal(data, &raw); err != nil {
+		return err
+	}

-func (t *RedditItem) UnmarshalJSON(data []byte) error {
-	t.TypeSwitch = &RedditTypeSwitch{}
-	if err := json.Unmarshal(data, &t.TypeSwitch); err != nil {
-		return fmt.Errorf("failed to unmarshal reddit response type: %w", err)
+	// Get the type field (check both 'type' and 'dataType' for compatibility)
+	var itemType RedditItemType
+	if typeData, exists := raw["type"]; exists {
+		if err := json.Unmarshal(typeData, &itemType); err != nil {
+			return fmt.Errorf("failed to unmarshal reddit response type: %w", err)
+		}
+	} else if typeData, exists := raw["dataType"]; exists {
+		if err := json.Unmarshal(typeData, &itemType); err != nil {
+			return fmt.Errorf("failed to unmarshal reddit response dataType: %w", err)
+		}
+	} else {
+		return fmt.Errorf("missing 'type' or 'dataType' field in reddit response")
 	}

-	switch t.TypeSwitch.Type {
+	r.Type = itemType
+
+	// Unmarshal the appropriate struct based on type
+	switch itemType {
 	case RedditUserItem:
-		t.User = &RedditUser{}
-		if err := json.Unmarshal(data, t.User); err != nil {
+		r.User = &RedditUser{}
+		if err := json.Unmarshal(data, r.User); err != nil {
 			return fmt.Errorf("failed to unmarshal reddit user: %w", err)
 		}
 	case RedditPostItem:
-		t.Post = &RedditPost{}
-		if err := json.Unmarshal(data, t.Post); err != nil {
+		r.Post = &RedditPost{}
+		if err := json.Unmarshal(data, r.Post); err != nil {
 			return fmt.Errorf("failed to unmarshal reddit post: %w", err)
 		}
 	case RedditCommentItem:
-		t.Comment = &RedditComment{}
-		if err := json.Unmarshal(data, t.Comment); err != nil {
+		r.Comment = &RedditComment{}
+		if err := json.Unmarshal(data, r.Comment); err != nil {
 			return fmt.Errorf("failed to unmarshal reddit comment: %w", err)
 		}
 	case RedditCommunityItem:
-		t.Community = &RedditCommunity{}
-		if err := json.Unmarshal(data, t.Community); err != nil {
+		r.Community = &RedditCommunity{}
+		if err := json.Unmarshal(data, r.Community); err != nil {
 			return fmt.Errorf("failed to unmarshal reddit community: %w", err)
 		}
 	default:
-		return fmt.Errorf("unknown Reddit response type: %s", t.TypeSwitch.Type)
+		return fmt.Errorf("unknown Reddit response type: %s", itemType)
 	}
+
 	return nil
 }

 // MarshalJSON implements the json.Marshaler interface for RedditResponse.
 // It unwraps the inner struct (User, Post, Comment, or Community) and marshals it directly.
-func (t *RedditItem) MarshalJSON() ([]byte, error) {
-	if t.TypeSwitch == nil {
-		return []byte("null"), nil
-	}
-
-	switch t.TypeSwitch.Type {
+func (r *RedditResponse) MarshalJSON() ([]byte, error) {
+	switch r.Type {
 	case RedditUserItem:
-		return json.Marshal(t.User)
+		return json.Marshal(r.User)
 	case RedditPostItem:
-		return json.Marshal(t.Post)
+		return json.Marshal(r.Post)
 	case RedditCommentItem:
-		return json.Marshal(t.Comment)
+		return json.Marshal(r.Comment)
 	case RedditCommunityItem:
-		return json.Marshal(t.Community)
+		return json.Marshal(r.Community)
 	default:
-		return nil, fmt.Errorf("unknown Reddit response type: %s", t.TypeSwitch.Type)
+		return nil, fmt.Errorf("unknown Reddit response type: %s", r.Type)
 	}
 }
+
+// RedditItem is an alias for RedditResponse for backward compatibility
+type RedditItem = RedditResponse
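The discriminated-union behaviour above is easiest to see end to end. A minimal sketch, assuming the `RedditResponse` type exactly as added in this hunk; the JSON payload is invented for illustration:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/masa-finance/tee-worker/api/types"
)

func main() {
	// The "type" (or "dataType") field selects which pointer field is populated.
	payload := []byte(`{"type": "post", "id": "post123", "title": "Test Post"}`)

	var resp types.RedditResponse
	if err := json.Unmarshal(payload, &resp); err != nil {
		panic(err)
	}
	fmt.Println(resp.Type, resp.Post != nil) // "post" true

	// Marshalling unwraps the inner struct again rather than re-emitting the wrapper.
	out, _ := json.Marshal(&resp)
	fmt.Println(string(out))
}
```
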
diff --git a/api/types/reddit_test.go b/api/types/reddit_test.go
index c7adc53..d31934a 100644
--- a/api/types/reddit_test.go
+++ b/api/types/reddit_test.go
@@ -14,7 +14,7 @@ var _ = Describe("RedditResponse", func() {
 	Describe("Unmarshalling", func() {
 		It("should unmarshal a user response", func() {
 			jsonData := `{"type": "user", "id": "user123", "username": "testuser"}`
-			var resp types.RedditItem
+			var resp types.RedditResponse
 			err := json.Unmarshal([]byte(jsonData), &resp)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(resp.User).ToNot(BeNil())
@@ -25,7 +25,7 @@ var _ = Describe("RedditResponse", func() {

 		It("should unmarshal a post response", func() {
 			jsonData := `{"type": "post", "id": "post123", "title": "Test Post"}`
-			var resp types.RedditItem
+			var resp types.RedditResponse
 			err := json.Unmarshal([]byte(jsonData), &resp)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(resp.Post).ToNot(BeNil())
@@ -36,7 +36,7 @@ var _ = Describe("RedditResponse", func() {

 		It("should return an error for an unknown type", func() {
 			jsonData := `{"type": "unknown", "id": "123"}`
-			var resp types.RedditItem
+			var resp types.RedditResponse
 			err := json.Unmarshal([]byte(jsonData), &resp)
 			Expect(err).To(MatchError("unknown Reddit response type: unknown"))
 		})
@@ -45,8 +45,8 @@ var _ = Describe("RedditResponse", func() {
 	Describe("Marshalling", func() {
 		It("should marshal a user response", func() {
 			now := time.Now()
-			resp := types.RedditItem{
-				TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditUserItem},
+			resp := types.RedditResponse{
+				Type: types.RedditUserItem,
 				User: &types.RedditUser{
 					ID:       "user123",
 					Username: "testuser",
@@ -65,8 +65,8 @@ var _ = Describe("RedditResponse", func() {
 		})

 		It("should marshal a post response", func() {
-			resp := types.RedditItem{
-				TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditPostItem},
+			resp := types.RedditResponse{
+				Type: types.RedditPostItem,
 				Post: &types.RedditPost{
 					ID:    "post123",
 					Title: "Test Post",
@@ -83,8 +83,8 @@ var _ = Describe("RedditResponse", func() {
 		})

 		It("should return an error for an unknown type", func() {
-			resp := types.RedditItem{
-				TypeSwitch: &types.RedditTypeSwitch{Type: "unknown"},
+			resp := types.RedditResponse{
+				Type: "unknown",
 			}
 			_, err := json.Marshal(&resp)
 			Expect(err).To(HaveOccurred())
diff --git a/internal/jobs/reddit.go b/internal/jobs/reddit.go
index a3f9039..5180d73 100644
--- a/internal/jobs/reddit.go
+++ b/internal/jobs/reddit.go
@@ -20,10 +20,10 @@ import (
 // RedditApifyClient defines the interface for the Reddit Apify client.
 // This allows for mocking in tests.
 type RedditApifyClient interface {
-	ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error)
-	SearchPosts(workerID string, queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error)
-	SearchCommunities(workerID string, queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error)
-	SearchUsers(workerID string, queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error)
+	ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error)
+	SearchPosts(workerID string, queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error)
+	SearchCommunities(workerID string, queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error)
+	SearchUsers(workerID string, queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error)
 }

 // NewRedditApifyClient is a function variable that can be replaced in tests.
@@ -102,7 +102,7 @@ func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) {
 	}
 }

-func processRedditResponse(j types.Job, resp []*types.RedditItem, cursor client.Cursor, err error) (types.JobResult, error) {
+func processRedditResponse(j types.Job, resp []*types.RedditResponse, cursor client.Cursor, err error) (types.JobResult, error) {
 	if err != nil {
 		return types.JobResult{Error: fmt.Sprintf("error while scraping Reddit: %s", err.Error())}, fmt.Errorf("error scraping Reddit: %w", err)
 	}
diff --git a/internal/jobs/reddit_test.go b/internal/jobs/reddit_test.go
index 7a572ad..7de1d02 100644
--- a/internal/jobs/reddit_test.go
+++ b/internal/jobs/reddit_test.go
@@ -19,13 +19,13 @@ import (
 // MockRedditApifyClient is a mock implementation of the RedditApifyClient.
 type MockRedditApifyClient struct {
-	ScrapeUrlsFunc        func(urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error)
-	SearchPostsFunc       func(queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error)
-	SearchCommunitiesFunc func(queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error)
-	SearchUsersFunc       func(queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error)
+	ScrapeUrlsFunc        func(urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error)
+	SearchPostsFunc       func(queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error)
+	SearchCommunitiesFunc func(queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error)
+	SearchUsersFunc       func(queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error)
 }

-func (m *MockRedditApifyClient) ScrapeUrls(_ string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+func (m *MockRedditApifyClient) ScrapeUrls(_ string, urls []types.RedditStartURL, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 	if m != nil && m.ScrapeUrlsFunc != nil {
 		res, cursor, err := m.ScrapeUrlsFunc(urls, after, args, cursor, maxResults)
 		for i, r := range res {
@@ -36,21 +36,21 @@ func (m *MockRedditApifyClient) ScrapeUrls(_ string, urls []types.RedditStartURL
 	return nil, "", nil
 }

-func (m *MockRedditApifyClient) SearchPosts(_ string, queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+func (m *MockRedditApifyClient) SearchPosts(_ string, queries []string, after time.Time, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 	if m != nil && m.SearchPostsFunc != nil {
 		return m.SearchPostsFunc(queries, after, args, cursor, maxResults)
 	}
 	return nil, "", nil
 }

-func (m *MockRedditApifyClient) SearchCommunities(_ string, queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+func (m *MockRedditApifyClient) SearchCommunities(_ string, queries []string, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 	if m != nil && m.SearchCommunitiesFunc != nil {
 		return m.SearchCommunitiesFunc(queries, args, cursor, maxResults)
 	}
 	return nil, "", nil
 }

-func (m *MockRedditApifyClient) SearchUsers(_ string, queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+func (m *MockRedditApifyClient) SearchUsers(_ string, queries []string, skipPosts bool, args redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 	if m != nil && m.SearchUsersFunc != nil {
 		return m.SearchUsersFunc(queries, skipPosts, args, cursor, maxResults)
 	}
@@ -101,16 +101,16 @@ var _ = Describe("RedditScraper", func() {
 				"urls": testUrls,
 			}

-			mockClient.ScrapeUrlsFunc = func(urls []types.RedditStartURL, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+			mockClient.ScrapeUrlsFunc = func(urls []types.RedditStartURL, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 				Expect(urls).To(HaveLen(1))
 				Expect(urls[0].URL).To(Equal(testUrls[0]))
-				return []*types.RedditItem{{TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditUserItem}, User: &types.RedditUser{ID: "user1", DataType: string(types.RedditUserItem)}}}, "next", nil
+				return []*types.RedditResponse{{Type: types.RedditUserItem, User: &types.RedditUser{ID: "user1", DataType: string(types.RedditUserItem)}}}, "next", nil
 			}

 			result, err := scraper.ExecuteJob(job)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(result.NextCursor).To(Equal("next"))
-			var resp []*types.RedditItem
+			var resp []*types.RedditResponse
 			err = json.Unmarshal(result.Data, &resp)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(resp).To(HaveLen(1))
@@ -125,15 +125,15 @@ var _ = Describe("RedditScraper", func() {
 				"queries": []string{"user-query"},
 			}

-			mockClient.SearchUsersFunc = func(queries []string, skipPosts bool, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+			mockClient.SearchUsersFunc = func(queries []string, skipPosts bool, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 				Expect(queries).To(Equal([]string{"user-query"}))
-				return []*types.RedditItem{{TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditUserItem}, User: &types.RedditUser{ID: "user2", DataType: string(types.RedditUserItem)}}}, "next-user", nil
+				return []*types.RedditResponse{{Type: types.RedditUserItem, User: &types.RedditUser{ID: "user2", DataType: string(types.RedditUserItem)}}}, "next-user", nil
 			}

 			result, err := scraper.ExecuteJob(job)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(result.NextCursor).To(Equal("next-user"))
-			var resp []*types.RedditItem
+			var resp []*types.RedditResponse
 			err = json.Unmarshal(result.Data, &resp)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(resp).To(HaveLen(1))
@@ -148,15 +148,15 @@ var _ = Describe("RedditScraper", func() {
 				"queries": []string{"post-query"},
 			}

-			mockClient.SearchPostsFunc = func(queries []string, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+			mockClient.SearchPostsFunc = func(queries []string, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 				Expect(queries).To(Equal([]string{"post-query"}))
-				return []*types.RedditItem{{TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditPostItem}, Post: &types.RedditPost{ID: "post1", DataType: string(types.RedditPostItem)}}}, "next-post", nil
+				return []*types.RedditResponse{{Type: types.RedditPostItem, Post: &types.RedditPost{ID: "post1", DataType: string(types.RedditPostItem)}}}, "next-post", nil
 			}

 			result, err := scraper.ExecuteJob(job)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(result.NextCursor).To(Equal("next-post"))
-			var resp []*types.RedditItem
+			var resp []*types.RedditResponse
 			err = json.Unmarshal(result.Data, &resp)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(resp).To(HaveLen(1))
@@ -171,15 +171,15 @@ var _ = Describe("RedditScraper", func() {
 				"queries": []string{"community-query"},
 			}

-			mockClient.SearchCommunitiesFunc = func(queries []string, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+			mockClient.SearchCommunitiesFunc = func(queries []string, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 				Expect(queries).To(Equal([]string{"community-query"}))
-				return []*types.RedditItem{{TypeSwitch: &types.RedditTypeSwitch{Type: types.RedditCommunityItem}, Community: &types.RedditCommunity{ID: "comm1", DataType: string(types.RedditCommunityItem)}}}, "next-comm", nil
+				return []*types.RedditResponse{{Type: types.RedditCommunityItem, Community: &types.RedditCommunity{ID: "comm1", DataType: string(types.RedditCommunityItem)}}}, "next-comm", nil
 			}

 			result, err := scraper.ExecuteJob(job)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(result.NextCursor).To(Equal("next-comm"))
-			var resp []*types.RedditItem
+			var resp []*types.RedditResponse
 			err = json.Unmarshal(result.Data, &resp)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(resp).To(HaveLen(1))
@@ -206,7 +206,7 @@ var _ = Describe("RedditScraper", func() {
 			}

 			expectedErr := errors.New("client error")
-			mockClient.SearchPostsFunc = func(queries []string, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+			mockClient.SearchPostsFunc = func(queries []string, after time.Time, cArgs redditapify.CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 				return nil, "", expectedErr
 			}
diff --git a/internal/jobs/redditapify/client.go b/internal/jobs/redditapify/client.go
index 4996062..16e5190 100644
--- a/internal/jobs/redditapify/client.go
+++ b/internal/jobs/redditapify/client.go
@@ -100,7 +100,7 @@ func (c *RedditApifyClient) ValidateApiKey() error {
 }

 // ScrapeUrls scrapes Reddit URLs
-func (c *RedditApifyClient) ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+func (c *RedditApifyClient) ScrapeUrls(workerID string, urls []types.RedditStartURL, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 	input := args.ToActorRequest()
 	input.StartUrls = urls
 	input.Searches = nil
@@ -117,7 +117,7 @@ func (c *RedditApifyClient) ScrapeUrls(workerID string, urls []types.RedditStart
 }

 // SearchPosts searches Reddit posts
-func (c *RedditApifyClient) SearchPosts(workerID string, queries []string, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+func (c *RedditApifyClient) SearchPosts(workerID string, queries []string, after time.Time, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 	input := args.ToActorRequest()
 	input.Searches = queries
 	input.StartUrls = nil
@@ -134,7 +134,7 @@ func (c *RedditApifyClient) SearchPosts(workerID string, queries []string, after
 }

 // SearchCommunities searches Reddit communities
-func (c *RedditApifyClient) SearchCommunities(workerID string, queries []string, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+func (c *RedditApifyClient) SearchCommunities(workerID string, queries []string, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 	input := args.ToActorRequest()
 	input.Searches = queries
 	input.StartUrls = nil
@@ -145,7 +145,7 @@ func (c *RedditApifyClient) SearchCommunities(workerID string, queries []string,
 }

 // SearchUsers searches Reddit users
-func (c *RedditApifyClient) SearchUsers(workerID string, queries []string, skipPosts bool, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditItem, client.Cursor, error) {
+func (c *RedditApifyClient) SearchUsers(workerID string, queries []string, skipPosts bool, args CommonArgs, cursor client.Cursor, maxResults uint) ([]*types.RedditResponse, client.Cursor, error) {
 	input := args.ToActorRequest()
 	input.Searches = queries
 	input.StartUrls = nil
@@ -157,7 +157,7 @@ func (c *RedditApifyClient) SearchUsers(workerID string, queries []string, skipP
 }

 // queryReddit runs the actor and retrieves Reddit items from the dataset
-func (c *RedditApifyClient) queryReddit(workerID string, input RedditActorRequest, cursor client.Cursor, limit uint) ([]*types.RedditItem, client.Cursor, error) {
+func (c *RedditApifyClient) queryReddit(workerID string, input RedditActorRequest, cursor client.Cursor, limit uint) ([]*types.RedditResponse, client.Cursor, error) {
 	if c.statsCollector != nil {
 		c.statsCollector.Add(workerID, stats.RedditQueries, 1)
 	}
@@ -170,9 +170,9 @@ func (c *RedditApifyClient) queryReddit(workerID string, input RedditActorReques
 		return nil, client.EmptyCursor, err
 	}

-	response := make([]*types.RedditItem, 0, len(dataset.Data.Items))
+	response := make([]*types.RedditResponse, 0, len(dataset.Data.Items))
 	for i, item := range dataset.Data.Items {
-		var resp types.RedditItem
+		var resp types.RedditResponse
 		if err := json.Unmarshal(item, &resp); err != nil {
 			logrus.Warnf("Failed to unmarshal profile at index %d: %v", i, err)
 			continue

From dcfe3df98cb89c1c5360fdd65571ea9d062a3d88 Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Mon, 13 Oct 2025 22:01:35 +0200
Subject: [PATCH 115/136] chore: remove .idea

---
 .idea/.gitignore    | 8 --------
 .idea/modules.xml   | 8 --------
 .idea/tee-types.iml | 9 ---------
 .idea/vcs.xml       | 6 ------
 4 files changed, 31 deletions(-)
 delete mode 100644 .idea/.gitignore
 delete mode 100644 .idea/modules.xml
 delete mode 100644 .idea/tee-types.iml
 delete mode 100644 .idea/vcs.xml

diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 13566b8..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Editor-based HTTP Client requests
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 66531a7..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/tee-types.iml b/.idea/tee-types.iml
deleted file mode 100644
index 5e764c4..0000000
--- a/.idea/tee-types.iml
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
From 18bd93ff23ca47daf3ef0e8e31ea8c7e5fe1bfce Mon Sep 17 00:00:00 2001
From: Grant Foster
Date: Mon, 13 Oct 2025 13:02:59 -0700
Subject: [PATCH 116/136] Update pkg/util/set.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 pkg/util/set.go | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pkg/util/set.go b/pkg/util/set.go
index 33b907d..f86bc31 100644
--- a/pkg/util/set.go
+++ b/pkg/util/set.go
@@ -60,7 +60,6 @@ func (s *Set[T]) Union(sets ...*Set[T]) *Set[T] {
 	sum := s.Length()
 	for _, ss := range sets {
 		sum = sum + ss.Length()
-
 	}

 	ret := make(map[T]struct{}, sum)

From b43ad44ffc75134483860139e3cd7ff78d6deb57 Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Tue, 14 Oct 2025 18:26:33 +0200
Subject: [PATCH 117/136] fix: remove tee dependencies in client

---
 pkg/client/apify_client.go |  4 ++--
 pkg/client/http.go         | 10 ++++++++--
 pkg/client/http_test.go    |  3 +--
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/pkg/client/apify_client.go b/pkg/client/apify_client.go
index fafad74..589ecb3 100644
--- a/pkg/client/apify_client.go
+++ b/pkg/client/apify_client.go
@@ -331,8 +331,8 @@ func (c *ApifyClient) ValidateApiKey() error {
 }

 var (
-	ErrActorFailed  = errors.New("Actor run failed")
-	ErrActorAborted = errors.New("Actor run aborted")
+	ErrActorFailed  = errors.New("actor run failed")
+	ErrActorAborted = errors.New("actor run aborted")
 )

 // runActorAndGetProfiles runs the actor and retrieves profiles from the dataset
diff --git a/pkg/client/http.go b/pkg/client/http.go
index 0809b73..50ace98 100644
--- a/pkg/client/http.go
+++ b/pkg/client/http.go
@@ -8,7 +8,6 @@ import (
 	"net/http"
 	"time"

-	"github.com/masa-finance/tee-worker/api/tee"
 	"github.com/masa-finance/tee-worker/api/types"
 )

@@ -19,6 +18,13 @@ type Client struct {
 	HTTPClient *http.Client
 }

+// EncryptedRequest represents an encrypted request/response pair.
+// Note: this is copied from api/tee/encrypted.go to avoid TEE dependencies in client code.
+type EncryptedRequest struct {
+	EncryptedResult  string `json:"encrypted_result"`
+	EncryptedRequest string `json:"encrypted_request"`
+}
+
 // setAPIKeyHeader sets the API key on the request if configured.
 func (c *Client) setAPIKeyHeader(req *http.Request) {
 	if c.options != nil && c.options.APIKey != "" {
@@ -115,7 +121,7 @@ func (c *Client) SubmitJob(JobSignature JobSignature) (*JobResult, error) {

 // Decrypt sends the encrypted result to the server to decrypt it.
 func (c *Client) Decrypt(JobSignature JobSignature, encryptedResult string) (string, error) {
-	decryptReq := tee.EncryptedRequest{
+	decryptReq := EncryptedRequest{
 		EncryptedResult:  encryptedResult,
 		EncryptedRequest: string(JobSignature),
 	}
diff --git a/pkg/client/http_test.go b/pkg/client/http_test.go
index 0a015e8..e49bc3a 100644
--- a/pkg/client/http_test.go
+++ b/pkg/client/http_test.go
@@ -5,7 +5,6 @@ import (
 	"net/http"
 	"net/http/httptest"

-	teetypes "github.com/masa-finance/tee-worker/api/types"
 	"github.com/masa-finance/tee-worker/api/types"
 	. "github.com/masa-finance/tee-worker/pkg/client"
 	. "github.com/onsi/ginkgo/v2"
@@ -59,7 +58,7 @@ var _ = Describe("Client", func() {

 	Describe("CreateJobSignature", func() {
 		It("should create a job signature successfully", func() {
-			job := teetypes.Job{Type: "test-job"}
+			job := types.Job{Type: "test-job"}
 			signature, err := client.CreateJobSignature(job)
 			Expect(err).NotTo(HaveOccurred())
 			Expect(signature).To(Equal(JobSignature("mock-signature")))
From 07f299b7830168ced36e4794e695fa674816535 Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Wed, 15 Oct 2025 20:37:57 +0200
Subject: [PATCH 118/136] feat: args refactor to be consistent

---
 ...laboration-rules.mdc => collaboration.mdc} |   0
 .cursor/rules/testing.mdc                     |  15 +
 api/args/args.go                              |  29 -
 api/args/base/base.go                         |  49 ++
 api/args/linkedin/linkedin.go                 |   2 +-
 api/args/linkedin/profile/profile.go          |  53 +-
 api/args/linkedin/profile/profile_test.go     | 180 ++---
 api/args/llm.go                               |  95 ---
 api/args/llm/llm.go                           |   7 +
 api/args/llm/process/process.go               | 108 +++
 api/args/llm/process/process_suite_test.go    |  13 +
 .../process/process_test.go}                  |  82 +-
 api/args/reddit.go                            | 169 ----
 api/args/reddit/reddit.go                     |   7 +
 api/args/reddit/search/search.go              | 192 +++++
 api/args/reddit/search/search_suite_test.go   |  13 +
 .../search/search_test.go}                    | 141 ++--
 api/args/telemetry.go                         |  16 -
 api/args/telemetry/telemetry.go               |  63 ++
 api/args/telemetry/telemetry_suite_test.go    |  13 +
 api/args/telemetry/telemetry_test.go          | 113 +++
 api/args/tiktok.go                            | 243 ------
 api/args/tiktok/query/query.go                |  74 ++
 api/args/tiktok/query/query_suite_test.go     |  13 +
 api/args/tiktok/query/query_test.go           | 205 +++++
 api/args/tiktok/tiktok.go                     |  11 +
 .../tiktok/transcription/transcription.go     | 127 +++
 .../transcription/transcription_suite_test.go |  13 +
 .../transcription/transcription_test.go       | 242 ++++++
 api/args/tiktok/trending/trending.go          | 122 +++
 .../tiktok/trending/trending_suite_test.go    |  13 +
 api/args/tiktok/trending/trending_test.go     | 422 ++++++++++
 api/args/twitter.go                           | 121 ---
 api/args/twitter/search/search.go             | 125 +++
 api/args/twitter/search/search_suite_test.go  |  13 +
 api/args/twitter/search/search_test.go        | 297 +++++++
 api/args/twitter/twitter.go                   |   7 +
 api/args/unmarshaller.go                      | 165 ++--
 api/args/unmarshaller_test.go                 |  31 +-
 api/args/web.go                               | 112 ---
 api/args/web/page/page.go                     | 113 +++
 .../page/page_suite_test.go}                  |   2 +-
 .../{web_test.go => web/page/page_test.go}    | 122 ++-
 api/args/web/web.go                           |   7 +
 api/types/jobs.go                             | 134 ++--
 api/types/linkedin/linkedin.go                |   4 +-
 api/types/reddit.go                           |  11 +-
 internal/apify/actors.go                      |   4 +-
 internal/capabilities/detector.go             |  67 +-
 internal/capabilities/detector_test.go        |  30 +-
 internal/config/config.go                     |   6 +-
 internal/jobs/linkedin.go                     |  21 +-
 internal/jobs/linkedin_test.go                |  60 +-
 internal/jobs/linkedinapify/client_test.go    |   2 +-
 internal/jobs/llmapify/client.go              |   6 +-
 internal/jobs/llmapify/client_test.go         |  24 +-
 internal/jobs/reddit.go                       |  29 +-
 internal/jobs/reddit_test.go                  |  12 +-
 internal/jobs/redditapify/client.go           |   6 +-
 internal/jobs/redditapify/client_test.go      |  10 +-
 internal/jobs/telemetry.go                    |   7 -
 internal/jobs/telemetry_test.go               |  11 +-
 internal/jobs/tiktok.go                       |  29 +-
 internal/jobs/tiktok_test.go                  |   8 +-
 internal/jobs/tiktokapify/client.go           |   7 +-
 internal/jobs/twitter.go                      | 752 ++----------------
 internal/jobs/twitter_test.go                 | 107 +--
 internal/jobs/web.go                          |  27 +-
 internal/jobs/web_test.go                     |  35 +-
 internal/jobs/webapify/client.go              |   6 +-
 internal/jobs/webapify/client_test.go         |  12 +-
 internal/jobserver/jobserver.go               |  39 +-
 internal/jobserver/worker.go                  |   1 -
 73 files changed, 3023 insertions(+), 2404 deletions(-)
 rename .cursor/rules/{collaboration-rules.mdc => collaboration.mdc} (100%)
 create mode 100644 .cursor/rules/testing.mdc
 delete mode 100644 api/args/args.go
 create mode 100644 api/args/base/base.go
 delete mode 100644 api/args/llm.go
 create mode 100644 api/args/llm/llm.go
 create mode 100644 api/args/llm/process/process.go
 create mode 100644 api/args/llm/process/process_suite_test.go
 rename api/args/{llm_test.go => llm/process/process_test.go} (59%)
 delete mode 100644 api/args/reddit.go
 create mode 100644 api/args/reddit/reddit.go
 create mode 100644 api/args/reddit/search/search.go
 create mode 100644 api/args/reddit/search/search_suite_test.go
 rename api/args/{reddit_test.go => reddit/search/search_test.go} (50%)
 delete mode 100644 api/args/telemetry.go
 create mode 100644 api/args/telemetry/telemetry.go
 create mode 100644 api/args/telemetry/telemetry_suite_test.go
 create mode 100644 api/args/telemetry/telemetry_test.go
 delete mode 100644 api/args/tiktok.go
 create mode 100644 api/args/tiktok/query/query.go
 create mode 100644 api/args/tiktok/query/query_suite_test.go
 create mode 100644 api/args/tiktok/query/query_test.go
 create mode 100644 api/args/tiktok/tiktok.go
 create mode 100644 api/args/tiktok/transcription/transcription.go
 create mode 100644 api/args/tiktok/transcription/transcription_suite_test.go
 create mode 100644 api/args/tiktok/transcription/transcription_test.go
 create mode 100644 api/args/tiktok/trending/trending.go
 create mode 100644 api/args/tiktok/trending/trending_suite_test.go
 create mode 100644 api/args/tiktok/trending/trending_test.go
 delete mode 100644 api/args/twitter.go
 create mode 100644 api/args/twitter/search/search.go
 create mode 100644 api/args/twitter/search/search_suite_test.go
 create mode 100644 api/args/twitter/search/search_test.go
 create mode 100644 api/args/twitter/twitter.go
 delete mode 100644 api/args/web.go
 create mode 100644 api/args/web/page/page.go
 rename api/args/{args_suite_test.go => web/page/page_suite_test.go} (90%)
 rename api/args/{web_test.go => web/page/page_test.go} (54%)
 create mode 100644 api/args/web/web.go

diff --git a/.cursor/rules/collaboration-rules.mdc b/.cursor/rules/collaboration.mdc
similarity index 100%
rename from .cursor/rules/collaboration-rules.mdc
rename to .cursor/rules/collaboration.mdc
diff --git a/.cursor/rules/testing.mdc b/.cursor/rules/testing.mdc
new file mode 100644
index 0000000..4bacbbf
--- /dev/null
+++ b/.cursor/rules/testing.mdc
@@ -0,0 +1,15 @@
+---
+alwaysApply: true
+---
+
+## Testing Rule
+
+**Before writing any test code:**
+
+1. **Create a Test Plan**: Always develop a clear, detailed plan that outlines:
+   - What tests will be written
+   - Which files will be modified or created
+   - The approach and methodology
+   - Expected outcomes and impacts
+
+2. **Prefer Ginkgo & Gomega**: When writing tests, use the Ginkgo and Gomega frameworks (BDD style) where possible to structure and assert tests in Go. Only use the built-in `testing` package for compatibility or legacy reasons, or if instructed.
\ No newline at end of file
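The many `*_suite_test.go` files created in this patch all follow the same Ginkgo bootstrap that this rule implies. A minimal sketch of such a suite (the package and suite names here are illustrative):

```go
package example_test

import (
	"testing"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

// Bootstraps Ginkgo so a plain `go test` run executes the BDD specs in this package.
func TestExample(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "Example Suite")
}

var _ = Describe("Example", func() {
	It("asserts with Gomega matchers", func() {
		Expect(1 + 1).To(Equal(2))
	})
})
```
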
diff --git a/api/args/args.go b/api/args/args.go
deleted file mode 100644
index 71b2b31..0000000
--- a/api/args/args.go
+++ /dev/null
@@ -1,29 +0,0 @@
-package args
-
-import (
-	"encoding/json"
-	"fmt"
-
-	"github.com/masa-finance/tee-worker/api/args/linkedin"
-	teetypes "github.com/masa-finance/tee-worker/api/types"
-)
-
-type LinkedInProfileArguments = linkedin.ProfileArguments
-
-// QueryTypeArgument provides a minimal structure to extract the QueryType (json "type")
-// This is used across different job types to determine the specific capability being requested
-type QueryTypeArgument struct {
-	QueryType teetypes.Capability `json:"type"`
-}
-
-// UnmarshalJSON implements custom JSON unmarshaling with normalization
-func (q *QueryTypeArgument) UnmarshalJSON(data []byte) error {
-	// Prevent infinite recursion
-	type Alias QueryTypeArgument
-	aux := &struct{ *Alias }{Alias: (*Alias)(q)}
-	if err := json.Unmarshal(data, aux); err != nil {
-		return fmt.Errorf("failed to unmarshal QueryType arguments: %w", err)
-	}
-	q.QueryType = aux.QueryType
-	return nil
-}
diff --git a/api/args/base/base.go b/api/args/base/base.go
new file mode 100644
index 0000000..01e3a22
--- /dev/null
+++ b/api/args/base/base.go
@@ -0,0 +1,49 @@
+package base
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/masa-finance/tee-worker/api/types"
+)
+
+// JobArgument defines the interface that all job arguments must implement
+type JobArgument interface {
+	UnmarshalJSON([]byte) error
+	GetCapability() types.Capability
+	ValidateCapability(jobType types.JobType) error
+	SetDefaultValues()
+	Validate() error
+}
+
+// Verify interface implementation
+var _ JobArgument = (*Arguments)(nil)
+
+type Arguments struct {
+	Type types.Capability `json:"type"`
+}
+
+func (t *Arguments) UnmarshalJSON(data []byte) error {
+	type Alias Arguments
+	aux := &struct{ *Alias }{Alias: (*Alias)(t)}
+	if err := json.Unmarshal(data, aux); err != nil {
+		return fmt.Errorf("failed to unmarshal arguments: %w", err)
+	}
+	t.SetDefaultValues()
+	return t.Validate()
+}
+
+func (a *Arguments) GetCapability() types.Capability {
+	return a.Type
+}
+
+func (a *Arguments) ValidateCapability(jobType types.JobType) error {
+	return jobType.ValidateCapability(&a.Type)
+}
+
+func (a *Arguments) SetDefaultValues() {
+}
+
+func (a *Arguments) Validate() error {
+	return nil
+}
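Concrete argument types satisfy this contract by defining their own fields and hooks rather than embedding `base.Arguments` (embedding would promote the base `UnmarshalJSON` and skip the outer fields). A minimal sketch of a hypothetical implementation following the same Alias pattern used throughout this patch; the `example` package, field names, and default are invented for illustration:

```go
package example

import (
	"encoding/json"
	"errors"
	"fmt"

	"github.com/masa-finance/tee-worker/api/args/base"
	"github.com/masa-finance/tee-worker/api/types"
)

// Arguments is a hypothetical job-argument type satisfying base.JobArgument.
type Arguments struct {
	Type     types.Capability `json:"type"`
	Query    string           `json:"query"`
	MaxItems uint             `json:"max_items"`
}

var _ base.JobArgument = (*Arguments)(nil)

// UnmarshalJSON decodes via an alias to avoid recursion, then applies
// defaults and validates in one step, as the other args packages do.
func (t *Arguments) UnmarshalJSON(data []byte) error {
	type Alias Arguments
	aux := &struct{ *Alias }{Alias: (*Alias)(t)}
	if err := json.Unmarshal(data, aux); err != nil {
		return fmt.Errorf("failed to unmarshal example arguments: %w", err)
	}
	t.SetDefaultValues()
	return t.Validate()
}

func (t *Arguments) GetCapability() types.Capability { return t.Type }

func (t *Arguments) ValidateCapability(jobType types.JobType) error {
	return jobType.ValidateCapability(&t.Type)
}

func (t *Arguments) SetDefaultValues() {
	if t.MaxItems == 0 {
		t.MaxItems = 10 // illustrative default
	}
}

func (t *Arguments) Validate() error {
	if t.Query == "" {
		return errors.New("query is required")
	}
	return nil
}
```
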
supported") + ErrUnmarshalling = errors.New("failed to unmarshal LinkedIn profile arguments") ) const ( @@ -28,9 +30,12 @@ const ( MaxItems = 1000 // 2500 on the actor, but we will run over 1MB memory limit on responses ) +// Verify interface implementation +var _ base.JobArgument = (*Arguments)(nil) + // Arguments defines args for LinkedIn profile operations type Arguments struct { - QueryType teetypes.Capability `json:"type"` + Type types.Capability `json:"type"` ScraperMode profile.ScraperMode `json:"profileScraperMode"` Query string `json:"searchQuery"` MaxItems uint `json:"maxItems"` @@ -51,24 +56,17 @@ type Arguments struct { StartPage uint `json:"startPage,omitempty"` } -func (a *Arguments) UnmarshalJSON(data []byte) error { +func (t *Arguments) UnmarshalJSON(data []byte) error { type Alias Arguments - aux := &struct { - *Alias - }{ - Alias: (*Alias)(a), - } - + aux := &struct{ *Alias }{Alias: (*Alias)(t)} if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal LinkedIn profile arguments: %w", err) + return fmt.Errorf("%w: %w", ErrUnmarshalling, err) } - - a.setDefaultValues() - - return a.Validate() + t.SetDefaultValues() + return t.Validate() } -func (a *Arguments) setDefaultValues() { +func (a *Arguments) SetDefaultValues() { if a.MaxItems == 0 { a.MaxItems = DefaultMaxItems } @@ -83,9 +81,16 @@ func (a *Arguments) Validate() error { if a.MaxItems > MaxItems { errs = append(errs, ErrMaxItemsTooLarge) } + + err := a.ValidateCapability(types.LinkedInJob) + if err != nil { + errs = append(errs, err) + } + if !profile.AllScraperModes.Contains(a.ScraperMode) { errs = append(errs, ErrScraperModeNotSupported) } + for _, yoe := range a.YearsOfExperience { if !experiences.All.Contains(yoe) { errs = append(errs, fmt.Errorf("%w: %v", ErrExperienceNotSupported, yoe)) @@ -119,14 +124,18 @@ func (a *Arguments) Validate() error { return nil } -func (a *Arguments) GetCapability() teetypes.Capability { - return a.QueryType +func (a *Arguments) GetCapability() types.Capability { + return a.Type } -func (a *Arguments) ValidateForJobType(jobType teetypes.JobType) error { - if err := a.Validate(); err != nil { - return err - } +func (a *Arguments) ValidateCapability(jobType types.JobType) error { + return jobType.ValidateCapability(&a.Type) +} - return jobType.ValidateCapability(a.QueryType) +// NewArguments creates a new Arguments instance and applies default values immediately +func NewArguments() Arguments { + args := Arguments{} + args.SetDefaultValues() + args.Validate() // This will set the default capability via ValidateCapability + return args } diff --git a/api/args/linkedin/profile/profile_test.go b/api/args/linkedin/profile/profile_test.go index 5947989..9871880 100644 --- a/api/args/linkedin/profile/profile_test.go +++ b/api/args/linkedin/profile/profile_test.go @@ -7,95 +7,82 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-worker/api/args" "github.com/masa-finance/tee-worker/api/args/linkedin/profile" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/api/types/linkedin/experiences" "github.com/masa-finance/tee-worker/api/types/linkedin/functions" "github.com/masa-finance/tee-worker/api/types/linkedin/industries" - profiletypes "github.com/masa-finance/tee-worker/api/types/linkedin/profile" + ptypes "github.com/masa-finance/tee-worker/api/types/linkedin/profile" "github.com/masa-finance/tee-worker/api/types/linkedin/seniorities" ) var _ = Describe("LinkedIn Profile Arguments", func() { Describe("Marshalling and unmarshalling", func() { It("should set default values", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - } + args := profile.NewArguments() + args.Query = "software engineer" jsonData, err := json.Marshal(args) Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &args) Expect(err).ToNot(HaveOccurred()) Expect(args.MaxItems).To(Equal(uint(10))) - Expect(args.ScraperMode).To(Equal(profiletypes.ScraperModeShort)) + Expect(args.ScraperMode).To(Equal(ptypes.ScraperModeShort)) }) It("should override default values", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - MaxItems: 50, - ScraperMode: profiletypes.ScraperModeFull, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.MaxItems = 50 + args.ScraperMode = ptypes.ScraperModeFull jsonData, err := json.Marshal(args) Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &args) Expect(err).ToNot(HaveOccurred()) Expect(args.MaxItems).To(Equal(uint(50))) - Expect(args.ScraperMode).To(Equal(profiletypes.ScraperModeFull)) + Expect(args.ScraperMode).To(Equal(ptypes.ScraperModeFull)) }) }) Describe("Validation", func() { It("should succeed with valid arguments", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: profiletypes.ScraperModeShort, - MaxItems: 10, - YearsOfExperience: []experiences.Id{experiences.ThreeToFiveYears}, - SeniorityLevels: []seniorities.Id{seniorities.Senior}, - Functions: []functions.Id{functions.Engineering}, - Industries: []industries.Id{industries.SoftwareDevelopment}, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = ptypes.ScraperModeShort + args.MaxItems = 10 + args.YearsOfExperience = []experiences.Id{experiences.ThreeToFiveYears} + args.SeniorityLevels = []seniorities.Id{seniorities.Senior} + args.Functions = []functions.Id{functions.Engineering} + args.Industries = []industries.Id{industries.SoftwareDevelopment} err := args.Validate() Expect(err).ToNot(HaveOccurred()) }) It("should fail with max items too large", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: profiletypes.ScraperModeShort, - MaxItems: 1500, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = ptypes.ScraperModeShort + args.MaxItems = 1500 err := args.Validate() Expect(err).To(HaveOccurred()) Expect(errors.Is(err, profile.ErrMaxItemsTooLarge)).To(BeTrue()) }) It("should fail with invalid scraper mode", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - 
ScraperMode: "InvalidMode", - MaxItems: 10, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = "InvalidMode" + args.MaxItems = 10 err := args.Validate() Expect(err).To(HaveOccurred()) Expect(errors.Is(err, profile.ErrScraperModeNotSupported)).To(BeTrue()) }) It("should fail with invalid years of experience", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: profiletypes.ScraperModeShort, - MaxItems: 10, - YearsOfExperience: []experiences.Id{"invalid"}, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = ptypes.ScraperModeShort + args.MaxItems = 10 + args.YearsOfExperience = []experiences.Id{"invalid"} err := args.Validate() Expect(err).To(HaveOccurred()) Expect(errors.Is(err, profile.ErrExperienceNotSupported)).To(BeTrue()) @@ -103,13 +90,11 @@ var _ = Describe("LinkedIn Profile Arguments", func() { }) It("should fail with invalid years at current company", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: profiletypes.ScraperModeShort, - MaxItems: 10, - YearsAtCurrentCompany: []experiences.Id{"invalid"}, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = ptypes.ScraperModeShort + args.MaxItems = 10 + args.YearsAtCurrentCompany = []experiences.Id{"invalid"} err := args.Validate() Expect(err).To(HaveOccurred()) Expect(errors.Is(err, profile.ErrExperienceNotSupported)).To(BeTrue()) @@ -117,26 +102,22 @@ var _ = Describe("LinkedIn Profile Arguments", func() { }) It("should fail with invalid seniority level", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: profiletypes.ScraperModeShort, - MaxItems: 10, - SeniorityLevels: []seniorities.Id{"invalid"}, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = ptypes.ScraperModeShort + args.MaxItems = 10 + args.SeniorityLevels = []seniorities.Id{"invalid"} err := args.Validate() Expect(err).To(HaveOccurred()) Expect(errors.Is(err, profile.ErrSeniorityNotSupported)).To(BeTrue()) }) It("should fail with invalid function", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: profiletypes.ScraperModeShort, - MaxItems: 10, - Functions: []functions.Id{"invalid"}, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = ptypes.ScraperModeShort + args.MaxItems = 10 + args.Functions = []functions.Id{"invalid"} err := args.Validate() Expect(err).To(HaveOccurred()) Expect(errors.Is(err, profile.ErrFunctionNotSupported)).To(BeTrue()) @@ -144,13 +125,11 @@ var _ = Describe("LinkedIn Profile Arguments", func() { }) It("should fail with invalid industry", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: profiletypes.ScraperModeShort, - MaxItems: 10, - Industries: []industries.Id{"invalid"}, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = ptypes.ScraperModeShort + args.MaxItems = 10 + args.Industries = []industries.Id{"invalid"} err := args.Validate() Expect(err).To(HaveOccurred()) Expect(errors.Is(err, profile.ErrIndustryNotSupported)).To(BeTrue()) @@ -158,14 +137,12 @@ var _ = Describe("LinkedIn Profile 
Arguments", func() { }) It("should handle multiple validation errors", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: "InvalidMode", - MaxItems: 1500, - YearsOfExperience: []experiences.Id{"invalid"}, - SeniorityLevels: []seniorities.Id{"invalid"}, - } + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = "InvalidMode" + args.MaxItems = 1500 + args.YearsOfExperience = []experiences.Id{"invalid"} + args.SeniorityLevels = []seniorities.Id{"invalid"} err := args.Validate() Expect(err).To(HaveOccurred()) // Should contain multiple error messages @@ -178,44 +155,37 @@ var _ = Describe("LinkedIn Profile Arguments", func() { Describe("GetCapability", func() { It("should return the query type", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - } + args := profile.NewArguments() Expect(args.GetCapability()).To(Equal(types.CapSearchByProfile)) }) }) - Describe("ValidateForJobType", func() { + Describe("ValidateCapability", func() { It("should succeed with valid job type and capability", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: profiletypes.ScraperModeShort, - MaxItems: 10, - } - err := args.ValidateForJobType(types.LinkedInJob) + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = ptypes.ScraperModeShort + args.MaxItems = 10 + err := args.ValidateCapability(types.LinkedInJob) Expect(err).ToNot(HaveOccurred()) }) It("should fail with invalid job type", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByQuery, - Query: "software engineer", - ScraperMode: profiletypes.ScraperModeShort, - MaxItems: 10, - } - err := args.ValidateForJobType(types.LinkedInJob) + args := profile.NewArguments() + args.Type = types.CapSearchByQuery // Override the default + args.Query = "software engineer" + args.ScraperMode = ptypes.ScraperModeShort + args.MaxItems = 10 + err := args.ValidateCapability(types.LinkedInJob) Expect(err).To(HaveOccurred()) }) - It("should fail if base validation fails", func() { - args := args.LinkedInProfileArguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - ScraperMode: "InvalidMode", - MaxItems: 10, - } - err := args.ValidateForJobType(types.LinkedInJob) + It("should fail if profile validation fails", func() { + args := profile.NewArguments() + args.Query = "software engineer" + args.ScraperMode = "InvalidMode" + args.MaxItems = 10 + err := args.Validate() Expect(err).To(HaveOccurred()) }) }) diff --git a/api/args/llm.go b/api/args/llm.go deleted file mode 100644 index eaf02ad..0000000 --- a/api/args/llm.go +++ /dev/null @@ -1,95 +0,0 @@ -package args - -import ( - "encoding/json" - "errors" - "fmt" - "strconv" - - "github.com/masa-finance/tee-worker/pkg/util" - teetypes "github.com/masa-finance/tee-worker/api/types" -) - -var ( - ErrLLMDatasetIdRequired = errors.New("dataset id is required") - ErrLLMPromptRequired = errors.New("prompt is required") -) - -const ( - LLMDefaultMaxTokens uint = 300 - LLMDefaultTemperature float64 = 0.1 - LLMDefaultMultipleColumns bool = false - LLMDefaultGeminiModel string = "gemini-1.5-flash-8b" - LLMDefaultClaudeModel string = "claude-3-5-haiku-latest" - LLMDefaultItems uint = 1 -) - -var SupportedModels = util.NewSet(LLMDefaultGeminiModel, LLMDefaultClaudeModel) - -type LLMProcessorArguments struct { - DatasetId 
-	DatasetId   string  `json:"dataset_id"`
-	Prompt      string  `json:"prompt"`
-	MaxTokens   uint    `json:"max_tokens"`
-	Temperature float64 `json:"temperature"`
-	Items       uint    `json:"items"`
-}
-
-// UnmarshalJSON implements custom JSON unmarshaling with validation
-func (l *LLMProcessorArguments) UnmarshalJSON(data []byte) error {
-	// Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...)
-	type Alias LLMProcessorArguments
-	aux := &struct {
-		*Alias
-	}{
-		Alias: (*Alias)(l),
-	}
-
-	if err := json.Unmarshal(data, aux); err != nil {
-		return fmt.Errorf("failed to unmarshal llm arguments: %w", err)
-	}
-
-	l.setDefaultValues()
-
-	return l.Validate()
-}
-
-func (l *LLMProcessorArguments) setDefaultValues() {
-	if l.Temperature == 0 {
-		l.Temperature = LLMDefaultTemperature
-	}
-	if l.MaxTokens == 0 {
-		l.MaxTokens = LLMDefaultMaxTokens
-	}
-	if l.Items == 0 {
-		l.Items = LLMDefaultItems
-	}
-}
-
-func (l *LLMProcessorArguments) Validate() error {
-	if l.DatasetId == "" {
-		return ErrLLMDatasetIdRequired
-	}
-	if l.Prompt == "" {
-		return ErrLLMPromptRequired
-	}
-	return nil
-}
-
-func (l LLMProcessorArguments) ToLLMProcessorRequest(model string, key string) (teetypes.LLMProcessorRequest, error) {
-	if !SupportedModels.Contains(model) {
-		return teetypes.LLMProcessorRequest{}, fmt.Errorf("model %s is not supported", model)
-	}
-	if key == "" {
-		return teetypes.LLMProcessorRequest{}, fmt.Errorf("key is required")
-	}
-
-	return teetypes.LLMProcessorRequest{
-		InputDatasetId:    l.DatasetId,
-		LLMProviderApiKey: key,
-		Prompt:            l.Prompt,
-		MaxTokens:         l.MaxTokens,
-		Temperature:       strconv.FormatFloat(l.Temperature, 'f', -1, 64),
-		MultipleColumns:   LLMDefaultMultipleColumns, // overrides default in actor API
-		Model:             model,                     // overrides default in actor API
-	}, nil
-}
diff --git a/api/args/llm/llm.go b/api/args/llm/llm.go
new file mode 100644
index 0000000..6dae929
--- /dev/null
+++ b/api/args/llm/llm.go
@@ -0,0 +1,7 @@
+package llm
+
+import (
+	"github.com/masa-finance/tee-worker/api/args/llm/process"
+)
+
+type Process = process.Arguments
diff --git a/api/args/llm/process/process.go b/api/args/llm/process/process.go
new file mode 100644
index 0000000..7d08d0d
--- /dev/null
+++ b/api/args/llm/process/process.go
@@ -0,0 +1,108 @@
+package process
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strconv"
+
+	"github.com/masa-finance/tee-worker/api/args/base"
+	"github.com/masa-finance/tee-worker/api/types"
+	"github.com/masa-finance/tee-worker/pkg/util"
+)
+
+var (
+	ErrDatasetIdRequired = errors.New("dataset id is required")
+	ErrPromptRequired    = errors.New("prompt is required")
+	ErrUnmarshalling     = errors.New("failed to unmarshal arguments")
+)
+
+const (
+	DefaultMaxTokens       uint    = 300
+	DefaultTemperature     float64 = 0.1
+	DefaultMultipleColumns bool    = false
+	DefaultGeminiModel     string  = "gemini-1.5-flash-8b"
+	DefaultClaudeModel     string  = "claude-3-5-haiku-latest"
+	DefaultItems           uint    = 1
+)
+
+var SupportedModels = util.NewSet(DefaultGeminiModel, DefaultClaudeModel)
+
+// Verify interface implementation
+var _ base.JobArgument = (*Arguments)(nil)
+
+type Arguments struct {
+	Type        types.Capability `json:"type"`
+	DatasetId   string           `json:"dataset_id"`
+	Prompt      string           `json:"prompt"`
+	MaxTokens   uint             `json:"max_tokens"`
+	Temperature float64          `json:"temperature"`
+	Items       uint             `json:"items"`
+}
+
+func (t *Arguments) UnmarshalJSON(data []byte) error {
+	type Alias Arguments
+	aux := &struct{ *Alias }{Alias: (*Alias)(t)}
+	if err := json.Unmarshal(data, aux); err != nil {
+		return fmt.Errorf("%w: %w", ErrUnmarshalling, err)
+	}
+	t.SetDefaultValues()
+	return t.Validate()
+}
+
+func (l *Arguments) SetDefaultValues() {
+	if l.Temperature == 0 {
+		l.Temperature = DefaultTemperature
+	}
+	if l.MaxTokens == 0 {
+		l.MaxTokens = DefaultMaxTokens
+	}
+	if l.Items == 0 {
+		l.Items = DefaultItems
+	}
+}
+
+func (l *Arguments) Validate() error {
+	if l.DatasetId == "" {
+		return ErrDatasetIdRequired
+	}
+	if l.Prompt == "" {
+		return ErrPromptRequired
+	}
+	return nil
+}
+
+func (l *Arguments) GetCapability() types.Capability {
+	return l.Type
+}
+
+func (l *Arguments) ValidateCapability(jobType types.JobType) error {
+	return nil // is not yet a standalone job type
+}
+
+// NewArguments creates a new Arguments instance and applies default values immediately
+func NewArguments() Arguments {
+	args := Arguments{}
+	args.SetDefaultValues()
+	args.Validate() // errors ignored here; arguments are re-validated on unmarshal
+	return args
+}
+
+func (l Arguments) ToProcessorRequest(model string, key string) (types.LLMProcessorRequest, error) {
+	if !SupportedModels.Contains(model) {
+		return types.LLMProcessorRequest{}, fmt.Errorf("model %s is not supported", model)
+	}
+	if key == "" {
+		return types.LLMProcessorRequest{}, fmt.Errorf("key is required")
+	}
+
+	return types.LLMProcessorRequest{
+		InputDatasetId:    l.DatasetId,
+		LLMProviderApiKey: key,
+		Prompt:            l.Prompt,
+		MaxTokens:         l.MaxTokens,
+		Temperature:       strconv.FormatFloat(l.Temperature, 'f', -1, 64),
+		MultipleColumns:   DefaultMultipleColumns, // overrides default in actor API
+		Model:             model,                  // overrides default in actor API
+	}, nil
+}
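A quick sketch of how a caller might drive the relocated package end to end, assuming the layout introduced in this patch; the dataset id and API key are placeholders:

```go
package main

import (
	"fmt"

	"github.com/masa-finance/tee-worker/api/args/llm/process"
)

func main() {
	// Defaults (temperature 0.1, 300 max tokens, 1 item) are applied up front.
	args := process.NewArguments()
	args.DatasetId = "example-dataset-id" // placeholder
	args.Prompt = "summarize: ${markdown}"

	req, err := args.ToProcessorRequest(process.DefaultGeminiModel, "example-api-key")
	if err != nil {
		fmt.Println("build request:", err)
		return
	}
	fmt.Println(req.Model, req.Temperature) // "gemini-1.5-flash-8b" "0.1"
}
```
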
"github.com/onsi/gomega" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/llm/process" ) var _ = Describe("LLMProcessorArguments", func() { Describe("Marshalling and unmarshalling", func() { It("should set default values", func() { - llmArgs := args.LLMProcessorArguments{ - DatasetId: "ds1", - Prompt: "summarize: ${markdown}", - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "ds1" + llmArgs.Prompt = "summarize: ${markdown}" jsonData, err := json.Marshal(llmArgs) Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &llmArgs) @@ -27,13 +26,12 @@ var _ = Describe("LLMProcessorArguments", func() { }) It("should override default values", func() { - llmArgs := args.LLMProcessorArguments{ - DatasetId: "ds1", - Prompt: "summarize: ${markdown}", - MaxTokens: 123, - Temperature: 0.7, - Items: 3, - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "ds1" + llmArgs.Prompt = "summarize: ${markdown}" + llmArgs.MaxTokens = 123 + llmArgs.Temperature = 0.7 + llmArgs.Items = 3 jsonData, err := json.Marshal(llmArgs) Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &llmArgs) @@ -44,70 +42,66 @@ var _ = Describe("LLMProcessorArguments", func() { }) It("should fail unmarshal when dataset_id is missing", func() { - var llmArgs args.LLMProcessorArguments + var llmArgs process.Arguments jsonData := []byte(`{"type":"datasetprocessor","prompt":"p"}`) err := json.Unmarshal(jsonData, &llmArgs) - Expect(errors.Is(err, args.ErrLLMDatasetIdRequired)).To(BeTrue()) + Expect(errors.Is(err, process.ErrDatasetIdRequired)).To(BeTrue()) }) It("should fail unmarshal when prompt is missing", func() { - var llmArgs args.LLMProcessorArguments + var llmArgs process.Arguments jsonData := []byte(`{"type":"datasetprocessor","dataset_id":"ds1"}`) err := json.Unmarshal(jsonData, &llmArgs) - Expect(errors.Is(err, args.ErrLLMPromptRequired)).To(BeTrue()) + Expect(errors.Is(err, process.ErrPromptRequired)).To(BeTrue()) }) }) Describe("Validation", func() { It("should succeed with valid arguments", func() { - llmArgs := &args.LLMProcessorArguments{ - DatasetId: "ds1", - Prompt: "p", - MaxTokens: 10, - Temperature: 0.2, - Items: 1, - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "ds1" + llmArgs.Prompt = "p" + llmArgs.MaxTokens = 10 + llmArgs.Temperature = 0.2 + llmArgs.Items = 1 err := llmArgs.Validate() Expect(err).ToNot(HaveOccurred()) }) It("should fail when dataset_id is missing", func() { - llmArgs := &args.LLMProcessorArguments{ - Prompt: "p", - MaxTokens: 10, - Temperature: 0.2, - } + llmArgs := process.NewArguments() + llmArgs.Prompt = "p" + llmArgs.MaxTokens = 10 + llmArgs.Temperature = 0.2 err := llmArgs.Validate() - Expect(errors.Is(err, args.ErrLLMDatasetIdRequired)).To(BeTrue()) + Expect(errors.Is(err, process.ErrDatasetIdRequired)).To(BeTrue()) }) It("should fail when prompt is missing", func() { - llmArgs := &args.LLMProcessorArguments{ - DatasetId: "ds1", - MaxTokens: 10, - Temperature: 0.2, - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "ds1" + llmArgs.MaxTokens = 10 + llmArgs.Temperature = 0.2 err := llmArgs.Validate() - Expect(errors.Is(err, args.ErrLLMPromptRequired)).To(BeTrue()) + Expect(errors.Is(err, process.ErrPromptRequired)).To(BeTrue()) }) }) Describe("ToLLMProcessorRequest", func() { It("should map request fields to actor request fields", func() { - llmArgs := args.LLMProcessorArguments{ - DatasetId: "ds1", - Prompt: "p", - MaxTokens: 42, - Temperature: 0.7, - } - req, err := 
llmArgs.ToLLMProcessorRequest(args.LLMDefaultGeminiModel, "api-key") + llmArgs := process.NewArguments() + llmArgs.DatasetId = "ds1" + llmArgs.Prompt = "p" + llmArgs.MaxTokens = 42 + llmArgs.Temperature = 0.7 + req, err := llmArgs.ToProcessorRequest(process.DefaultGeminiModel, "api-key") Expect(err).ToNot(HaveOccurred()) Expect(req.InputDatasetId).To(Equal("ds1")) Expect(req.Prompt).To(Equal("p")) Expect(req.MaxTokens).To(Equal(uint(42))) Expect(req.Temperature).To(Equal("0.7")) Expect(req.MultipleColumns).To(BeFalse()) - Expect(req.Model).To(Equal(args.LLMDefaultGeminiModel)) + Expect(req.Model).To(Equal(process.DefaultGeminiModel)) Expect(req.LLMProviderApiKey).To(Equal("api-key")) }) }) diff --git a/api/args/reddit.go b/api/args/reddit.go deleted file mode 100644 index cd024f4..0000000 --- a/api/args/reddit.go +++ /dev/null @@ -1,169 +0,0 @@ -package args - -import ( - "encoding/json" - "errors" - "fmt" - "net/url" - "strings" - "time" - - teetypes "github.com/masa-finance/tee-worker/api/types" -) - -var ( - ErrRedditInvalidType = errors.New("invalid type") - ErrRedditInvalidSort = errors.New("invalid sort") - ErrRedditTimeInTheFuture = errors.New("after field is in the future") - ErrRedditNoQueries = errors.New("queries must be provided for all query types except scrapeurls") - ErrRedditNoUrls = errors.New("urls must be provided for scrapeurls query type") - ErrRedditQueriesNotAllowed = errors.New("the scrapeurls query type does not admit queries") - ErrRedditUrlsNotAllowed = errors.New("urls can only be provided for the scrapeurls query type") -) - -const ( - // These reflect the default values in https://apify.com/trudax/reddit-scraper/input-schema - redditDefaultMaxItems = 10 - redditDefaultMaxPosts = 10 - redditDefaultMaxComments = 10 - redditDefaultMaxCommunities = 2 - redditDefaultMaxUsers = 2 - redditDefaultSort = teetypes.RedditSortNew -) - -const redditDomainSuffix = "reddit.com" - -// RedditArguments defines args for Reddit scrapes -// see https://apify.com/trudax/reddit-scraper -type RedditArguments struct { - QueryType teetypes.RedditQueryType `json:"type"` - Queries []string `json:"queries"` - URLs []string `json:"urls"` - Sort teetypes.RedditSortType `json:"sort"` - IncludeNSFW bool `json:"include_nsfw"` - SkipPosts bool `json:"skip_posts"` // Valid only for searchusers - After time.Time `json:"after"` // valid only for scrapeurls and searchposts - MaxItems uint `json:"max_items"` // Max number of items to scrape (total), default 10 - MaxResults uint `json:"max_results"` // Max number of results per page, default MaxItems - MaxPosts uint `json:"max_posts"` // Max number of posts per page, default 10 - MaxComments uint `json:"max_comments"` // Max number of comments per page, default 10 - MaxCommunities uint `json:"max_communities"` // Max number of communities per page, default 2 - MaxUsers uint `json:"max_users"` // Max number of users per page, default 2 - NextCursor string `json:"next_cursor"` -} - -func (r *RedditArguments) UnmarshalJSON(data []byte) error { - // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) 
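The "prevent infinite recursion" comment above names the standard Go trick used throughout this patch: re-declaring the type strips its method set, so the inner `json.Unmarshal` does plain field decoding instead of re-entering `UnmarshalJSON`. A self-contained sketch of the pattern, using a hypothetical type rather than one from this repo:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type Arguments struct {
	Queries []string `json:"queries"`
}

func (a *Arguments) UnmarshalJSON(data []byte) error {
	// alias has the same fields but none of Arguments' methods, so this
	// inner Unmarshal cannot recurse back into UnmarshalJSON.
	type alias Arguments
	if err := json.Unmarshal(data, (*alias)(a)); err != nil {
		return err
	}
	// defaults and validation hook in here in the real argument types
	return nil
}

func main() {
	var a Arguments
	_ = json.Unmarshal([]byte(`{"queries":["go"]}`), &a)
	fmt.Println(a.Queries) // [go]
}
```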
- type Alias RedditArguments - aux := &struct { - *Alias - }{ - Alias: (*Alias)(r), - } - - if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal Reddit arguments: %w", err) - } - - r.setDefaultValues() - - return r.Validate() -} - -// setDefaultValues sets the default values for the parameters that were not provided and canonicalizes the strings for later validation -func (r *RedditArguments) setDefaultValues() { - if r.MaxItems == 0 { - r.MaxItems = redditDefaultMaxItems - } - if r.MaxPosts == 0 { - r.MaxPosts = redditDefaultMaxPosts - } - if r.MaxComments == 0 { - r.MaxComments = redditDefaultMaxComments - } - if r.MaxCommunities == 0 { - r.MaxCommunities = redditDefaultMaxCommunities - } - if r.MaxUsers == 0 { - r.MaxUsers = redditDefaultMaxUsers - } - if r.MaxResults == 0 { - r.MaxResults = r.MaxItems - } - if r.Sort == "" { - r.Sort = redditDefaultSort - } - - r.QueryType = teetypes.RedditQueryType(strings.ToLower(string(r.QueryType))) - r.Sort = teetypes.RedditSortType(strings.ToLower(string(r.Sort))) -} - -func (r *RedditArguments) Validate() error { - var errs []error - - if !teetypes.AllRedditQueryTypes.Contains(r.QueryType) { - errs = append(errs, ErrRedditInvalidType) - } - - if !teetypes.AllRedditSortTypes.Contains(r.Sort) { - errs = append(errs, ErrRedditInvalidSort) - } - - if time.Now().Before(r.After) { - errs = append(errs, ErrRedditTimeInTheFuture) - } - - if len(errs) > 0 { - return errors.Join(errs...) - } - - if r.QueryType == teetypes.RedditScrapeUrls { - if len(r.URLs) == 0 { - errs = append(errs, ErrRedditNoUrls) - } - if len(r.Queries) > 0 { - errs = append(errs, ErrRedditQueriesNotAllowed) - } - - for _, u := range r.URLs { - u, err := url.Parse(u) - if err != nil { - errs = append(errs, fmt.Errorf("%s is not a valid URL", u)) - } else { - if !strings.HasSuffix(strings.ToLower(u.Host), redditDomainSuffix) { - errs = append(errs, fmt.Errorf("invalid Reddit URL %s", u)) - } - if !strings.HasPrefix(u.Path, "/r/") { - errs = append(errs, fmt.Errorf("%s is not a Reddit post or comment URL (missing /r/)", u)) - } - if !strings.Contains(u.Path, "/comments/") { - errs = append(errs, fmt.Errorf("%s is not a Reddit post or comment URL (missing /comments/)", u)) - } - } - } - } else { - if len(r.Queries) == 0 { - errs = append(errs, ErrRedditNoQueries) - } - if len(r.URLs) > 0 { - errs = append(errs, ErrRedditUrlsNotAllowed) - } - } - - return errors.Join(errs...) 
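Both this removed validator and its replacement below accumulate failures with `errors.Join` (Go 1.20+). `errors.Is` walks the joined tree, which is what lets callers, and the specs later in this patch, match a single sentinel inside a combined validation error; `errors.Join` with no non-nil arguments also returns nil, which is why the success path needs no special casing. A standalone illustration with made-up sentinels:

```go
package main

import (
	"errors"
	"fmt"
)

var (
	errNoUrls            = errors.New("urls must be provided")
	errQueriesNotAllowed = errors.New("queries not allowed")
)

func main() {
	err := errors.Join(errNoUrls, errQueriesNotAllowed)
	fmt.Println(errors.Is(err, errNoUrls))            // true
	fmt.Println(errors.Is(err, errQueriesNotAllowed)) // true
	fmt.Println(errors.Join())                        // <nil> when nothing failed
}
```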
-} - -// ValidateForJobType validates Twitter arguments for a specific job type -func (r *RedditArguments) ValidateForJobType(jobType teetypes.JobType) error { - if err := r.Validate(); err != nil { - return err - } - - // Validate QueryType against job-specific capabilities - return jobType.ValidateCapability(teetypes.Capability(r.QueryType)) -} - -// GetCapability returns the QueryType as a typed Capability -func (r *RedditArguments) GetCapability() teetypes.Capability { - return teetypes.Capability(r.QueryType) -} diff --git a/api/args/reddit/reddit.go b/api/args/reddit/reddit.go new file mode 100644 index 0000000..ec34b0d --- /dev/null +++ b/api/args/reddit/reddit.go @@ -0,0 +1,7 @@ +package reddit + +import ( + "github.com/masa-finance/tee-worker/api/args/reddit/search" +) + +type Search = search.Arguments diff --git a/api/args/reddit/search/search.go b/api/args/reddit/search/search.go new file mode 100644 index 0000000..686eb24 --- /dev/null +++ b/api/args/reddit/search/search.go @@ -0,0 +1,192 @@ +package search + +import ( + "encoding/json" + "errors" + "fmt" + "net/url" + "strings" + "time" + + "github.com/masa-finance/tee-worker/api/args/base" + "github.com/masa-finance/tee-worker/api/types" +) + +var ( + ErrInvalidType = errors.New("invalid type") + ErrInvalidSort = errors.New("invalid sort") + ErrTimeInTheFuture = errors.New("after field is in the future") + ErrNoQueries = errors.New("queries must be provided for all query types except scrapeurls") + ErrNoUrls = errors.New("urls must be provided for scrapeurls query type") + ErrQueriesNotAllowed = errors.New("the scrapeurls query type does not admit queries") + ErrUrlsNotAllowed = errors.New("urls can only be provided for the scrapeurls query type") + ErrUnmarshalling = errors.New("failed to unmarshal reddit search arguments") +) + +const ( + // These reflect the default values in https://apify.com/trudax/reddit-scraper/input-schema + DefaultMaxItems = 10 + DefaultMaxPosts = 10 + DefaultMaxComments = 10 + DefaultMaxCommunities = 2 + DefaultMaxUsers = 2 + DefaultSort = types.RedditSortNew +) + +const DomainSuffix = "reddit.com" + +// Verify interface implementation +var _ base.JobArgument = (*Arguments)(nil) + +// Arguments defines args for Reddit scrapes +// see https://apify.com/trudax/reddit-scraper +type Arguments struct { + Type types.Capability `json:"type"` + Queries []string `json:"queries"` + URLs []string `json:"urls"` + Sort types.RedditSortType `json:"sort"` + IncludeNSFW bool `json:"include_nsfw"` + SkipPosts bool `json:"skip_posts"` // Valid only for searchusers + After time.Time `json:"after"` // valid only for scrapeurls and searchposts + MaxItems uint `json:"max_items"` // Max number of items to scrape (total), default 10 + MaxResults uint `json:"max_results"` // Max number of results per page, default MaxItems + MaxPosts uint `json:"max_posts"` // Max number of posts per page, default 10 + MaxComments uint `json:"max_comments"` // Max number of comments per page, default 10 + MaxCommunities uint `json:"max_communities"` // Max number of communities per page, default 2 + MaxUsers uint `json:"max_users"` // Max number of users per page, default 2 + NextCursor string `json:"next_cursor"` +} + +func (t *Arguments) UnmarshalJSON(data []byte) error { + type Alias Arguments + aux := &struct{ *Alias }{Alias: (*Alias)(t)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("%w: %w", ErrUnmarshalling, err) + } + t.SetDefaultValues() + return t.Validate() +} + +// SetDefaultValues sets the default 
values for the parameters that were not provided and canonicalizes the strings for later validation
+func (r *Arguments) SetDefaultValues() {
+	if r.MaxItems == 0 {
+		r.MaxItems = DefaultMaxItems
+	}
+	if r.MaxPosts == 0 {
+		r.MaxPosts = DefaultMaxPosts
+	}
+	if r.MaxComments == 0 {
+		r.MaxComments = DefaultMaxComments
+	}
+	if r.MaxCommunities == 0 {
+		r.MaxCommunities = DefaultMaxCommunities
+	}
+	if r.MaxUsers == 0 {
+		r.MaxUsers = DefaultMaxUsers
+	}
+	// MaxResults defaults to MaxItems (whose own default was applied above);
+	// an explicitly provided MaxResults is preserved.
+	if r.MaxResults == 0 {
+		r.MaxResults = r.MaxItems
+	}
+	if r.Sort == "" {
+		r.Sort = DefaultSort
+	}
+
+	r.Sort = types.RedditSortType(strings.ToLower(string(r.Sort)))
+}
+
+func (r *Arguments) Validate() error {
+	var errs []error
+
+	if !types.AllRedditQueryTypes.Contains(r.Type) {
+		errs = append(errs, ErrInvalidType)
+	}
+
+	if !types.AllRedditSortTypes.Contains(r.Sort) {
+		errs = append(errs, ErrInvalidSort)
+	}
+
+	if time.Now().Before(r.After) {
+		errs = append(errs, ErrTimeInTheFuture)
+	}
+
+	if len(errs) > 0 {
+		return errors.Join(errs...)
+	}
+
+	if r.Type == types.CapScrapeUrls {
+		if len(r.URLs) == 0 {
+			errs = append(errs, ErrNoUrls)
+		}
+		if len(r.Queries) > 0 {
+			errs = append(errs, ErrQueriesNotAllowed)
+		}
+
+		for _, u := range r.URLs {
+			parsed, err := url.Parse(u)
+			if err != nil {
+				errs = append(errs, fmt.Errorf("%s is not a valid URL", u))
+			} else {
+				host := strings.ToLower(parsed.Host)
+				// Require reddit.com itself or a subdomain; a bare suffix check
+				// would also match hosts like "notreddit.com".
+				if host != DomainSuffix && !strings.HasSuffix(host, "."+DomainSuffix) {
+					errs = append(errs, fmt.Errorf("invalid Reddit URL %s", u))
+				}
+				if !strings.HasPrefix(parsed.Path, "/r/") {
+					errs = append(errs, fmt.Errorf("%s is not a Reddit post or comment URL (missing /r/)", u))
+				}
+				if !strings.Contains(parsed.Path, "/comments/") {
+					errs = append(errs, fmt.Errorf("%s is not a Reddit post or comment URL (missing /comments/)", u))
+				}
+			}
+		}
+	} else {
+		if len(r.Queries) == 0 {
+			errs = append(errs, ErrNoQueries)
+		}
+		if len(r.URLs) > 0 {
+			errs = append(errs, ErrUrlsNotAllowed)
+		}
+	}
+
+	return errors.Join(errs...)
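For reviewers, a minimal sketch of how the new search package is meant to be driven, with the constructor applying defaults before validation. The import path, constructors, and fields come from this patch; the query strings are made up:

```go
package main

import (
	"fmt"

	"github.com/masa-finance/tee-worker/api/args/reddit/search"
)

func main() {
	// The constructor sets the capability and applies the documented defaults.
	args := search.NewSearchPostsArguments()
	args.Queries = []string{"golang"}

	fmt.Println(args.MaxItems, args.Sort) // defaults applied by the constructor

	if err := args.Validate(); err != nil {
		fmt.Println("rejected:", err)
		return
	}
	fmt.Println("accepted")
}
```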
+} + +// GetCapability returns the capability of the arguments +func (r *Arguments) GetCapability() types.Capability { + return r.Type +} + +// ValidateCapability validates the capability of the arguments +func (r *Arguments) ValidateCapability(jobType types.JobType) error { + return jobType.ValidateCapability(&r.Type) +} + +// NewArguments creates a new Arguments instance with the specified capability +// and applies default values immediately +func NewArguments(capability types.Capability) Arguments { + args := Arguments{ + Type: capability, + } + args.SetDefaultValues() + return args +} + +// NewSearchPostsArguments creates a new Arguments instance for searching posts +func NewSearchPostsArguments() Arguments { + return NewArguments(types.CapSearchPosts) +} + +// NewSearchUsersArguments creates a new Arguments instance for searching users +func NewSearchUsersArguments() Arguments { + return NewArguments(types.CapSearchUsers) +} + +// NewSearchCommunitiesArguments creates a new Arguments instance for searching communities +func NewSearchCommunitiesArguments() Arguments { + return NewArguments(types.CapSearchCommunities) +} + +// NewScrapeUrlsArguments creates a new Arguments instance for scraping URLs +func NewScrapeUrlsArguments() Arguments { + return NewArguments(types.CapScrapeUrls) +} diff --git a/api/args/reddit/search/search_suite_test.go b/api/args/reddit/search/search_suite_test.go new file mode 100644 index 0000000..688d8ec --- /dev/null +++ b/api/args/reddit/search/search_suite_test.go @@ -0,0 +1,13 @@ +package search_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestArgs(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Args Suite") +} diff --git a/api/args/reddit_test.go b/api/args/reddit/search/search_test.go similarity index 50% rename from api/args/reddit_test.go rename to api/args/reddit/search/search_test.go index f9775cc..6356649 100644 --- a/api/args/reddit_test.go +++ b/api/args/reddit/search/search_test.go @@ -1,4 +1,4 @@ -package args_test +package search_test import ( "encoding/json" @@ -7,17 +7,15 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/reddit/search" "github.com/masa-finance/tee-worker/api/types" ) var _ = Describe("RedditArguments", func() { Describe("Marshalling and unmarshalling", func() { It("should set default values", func() { - redditArgs := args.RedditArguments{ - QueryType: types.RedditSearchPosts, - Queries: []string{"Zaphod", "Ford"}, - } + redditArgs := search.NewSearchPostsArguments() + redditArgs.Queries = []string{"Zaphod", "Ford"} jsonData, err := json.Marshal(redditArgs) Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &redditArgs) @@ -32,16 +30,14 @@ var _ = Describe("RedditArguments", func() { }) It("should override default values", func() { - redditArgs := args.RedditArguments{ - QueryType: types.RedditSearchPosts, - Queries: []string{"Zaphod", "Ford"}, - MaxItems: 20, - MaxPosts: 21, - MaxComments: 22, - MaxCommunities: 23, - MaxUsers: 24, - Sort: types.RedditSortTop, - } + redditArgs := search.NewSearchPostsArguments() + redditArgs.Queries = []string{"Zaphod", "Ford"} + redditArgs.MaxItems = 20 + redditArgs.MaxPosts = 21 + redditArgs.MaxComments = 22 + redditArgs.MaxCommunities = 23 + redditArgs.MaxUsers = 24 + redditArgs.Sort = types.RedditSortTop jsonData, err := json.Marshal(redditArgs) Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &redditArgs) @@ -59,101 +55,86 @@ var _ = Describe("RedditArguments", func() { Describe("Validation", func() { It("should succeed with valid arguments", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditSearchPosts, - Queries: []string{"test"}, - Sort: types.RedditSortNew, - } + redditArgs := search.NewSearchPostsArguments() + redditArgs.Queries = []string{"test"} + redditArgs.Sort = types.RedditSortNew err := redditArgs.Validate() Expect(err).ToNot(HaveOccurred()) }) It("should succeed with valid scrapeurls arguments", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditScrapeUrls, - URLs: []string{"https://www.reddit.com/r/golang/comments/foo/bar"}, - Sort: types.RedditSortNew, - } + redditArgs := search.NewScrapeUrlsArguments() + redditArgs.URLs = []string{"https://www.reddit.com/r/golang/comments/foo/bar"} + redditArgs.Sort = types.RedditSortNew err := redditArgs.Validate() Expect(err).ToNot(HaveOccurred()) }) It("should fail with an invalid type", func() { - redditArgs := &args.RedditArguments{ - QueryType: "invalidtype", - Queries: []string{"test"}, - Sort: types.RedditSortNew, - } + redditArgs := search.NewSearchPostsArguments() + redditArgs.Type = "invalidtype" // Override the default + redditArgs.Queries = []string{"test"} + redditArgs.Sort = types.RedditSortNew err := redditArgs.Validate() - Expect(err).To(MatchError(args.ErrRedditInvalidType)) + Expect(err).To(MatchError(search.ErrInvalidType)) }) It("should fail with an invalid sort", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditSearchPosts, - Queries: []string{"test"}, - Sort: "invalidsort", - } + redditArgs := search.NewSearchPostsArguments() + redditArgs.Queries = []string{"test"} + redditArgs.Sort = "invalidsort" err := redditArgs.Validate() - Expect(err).To(MatchError(args.ErrRedditInvalidSort)) + Expect(err).To(MatchError(search.ErrInvalidSort)) }) It("should fail if the after time is in the future", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditSearchPosts, - Queries: []string{"test"}, - Sort: types.RedditSortNew, - After: 
time.Now().Add(24 * time.Hour), + redditArgs := &search.Arguments{ + Type: types.CapSearchPosts, + Queries: []string{"test"}, + Sort: types.RedditSortNew, + After: time.Now().Add(24 * time.Hour), } err := redditArgs.Validate() - Expect(err).To(MatchError(args.ErrRedditTimeInTheFuture)) + Expect(err).To(MatchError(search.ErrTimeInTheFuture)) }) It("should fail if queries are not provided for searchposts", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditSearchPosts, - Sort: types.RedditSortNew, - } + redditArgs := search.NewSearchPostsArguments() + redditArgs.Sort = types.RedditSortNew err := redditArgs.Validate() - Expect(err).To(MatchError(args.ErrRedditNoQueries)) + Expect(err).To(MatchError(search.ErrNoQueries)) }) It("should fail if urls are not provided for scrapeurls", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditScrapeUrls, - Sort: types.RedditSortNew, - } + redditArgs := search.NewScrapeUrlsArguments() + redditArgs.Sort = types.RedditSortNew err := redditArgs.Validate() - Expect(err).To(MatchError(args.ErrRedditNoUrls)) + Expect(err).To(MatchError(search.ErrNoUrls)) }) It("should fail if queries are provided for scrapeurls", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditScrapeUrls, - Queries: []string{"test"}, - URLs: []string{"https://www.reddit.com/r/golang/comments/foo/bar/"}, - Sort: types.RedditSortNew, - } + redditArgs := search.NewScrapeUrlsArguments() + redditArgs.Queries = []string{"test"} + redditArgs.URLs = []string{"https://www.reddit.com/r/golang/comments/foo/bar/"} + redditArgs.Sort = types.RedditSortNew err := redditArgs.Validate() - Expect(err).To(MatchError(args.ErrRedditQueriesNotAllowed)) + Expect(err).To(MatchError(search.ErrQueriesNotAllowed)) }) It("should fail if urls are provided for searchposts", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditSearchPosts, - Queries: []string{"test"}, - URLs: []string{"https://www.reddit.com/r/golang/comments/foo/bar"}, - Sort: types.RedditSortNew, - } + redditArgs := search.NewSearchPostsArguments() + redditArgs.Queries = []string{"test"} + redditArgs.URLs = []string{"https://www.reddit.com/r/golang/comments/foo/bar"} + redditArgs.Sort = types.RedditSortNew err := redditArgs.Validate() - Expect(err).To(MatchError(args.ErrRedditUrlsNotAllowed)) + Expect(err).To(MatchError(search.ErrUrlsNotAllowed)) }) It("should fail with an invalid URL", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditScrapeUrls, - URLs: []string{"ht tp://invalid-url.com"}, - Sort: types.RedditSortNew, + redditArgs := &search.Arguments{ + Type: types.CapScrapeUrls, + URLs: []string{"ht tp://invalid-url.com"}, + Sort: types.RedditSortNew, } err := redditArgs.Validate() Expect(err).To(HaveOccurred()) @@ -161,22 +142,18 @@ var _ = Describe("RedditArguments", func() { }) It("should fail with an invalid domain", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditScrapeUrls, - URLs: []string{"https://www.google.com"}, - Sort: types.RedditSortNew, - } + redditArgs := search.NewScrapeUrlsArguments() + redditArgs.URLs = []string{"https://www.google.com"} + redditArgs.Sort = types.RedditSortNew err := redditArgs.Validate() Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("invalid Reddit URL")) }) It("should fail if the URL is not a post or comment", func() { - redditArgs := &args.RedditArguments{ - QueryType: types.RedditScrapeUrls, - URLs: []string{"https://www.reddit.com/r/golang/"}, - Sort: 
types.RedditSortNew, - } + redditArgs := search.NewScrapeUrlsArguments() + redditArgs.URLs = []string{"https://www.reddit.com/r/golang/"} + redditArgs.Sort = types.RedditSortNew err := redditArgs.Validate() Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("not a Reddit post or comment URL")) diff --git a/api/args/telemetry.go b/api/args/telemetry.go deleted file mode 100644 index b947204..0000000 --- a/api/args/telemetry.go +++ /dev/null @@ -1,16 +0,0 @@ -package args - -import ( - "github.com/masa-finance/tee-worker/api/types" -) - -// TelemetryJobArguments for telemetry jobs (simple case) -type TelemetryJobArguments struct{} - -func (t *TelemetryJobArguments) Validate() error { - return nil -} - -func (t *TelemetryJobArguments) GetCapability() types.Capability { - return types.CapTelemetry -} diff --git a/api/args/telemetry/telemetry.go b/api/args/telemetry/telemetry.go new file mode 100644 index 0000000..4cc2eb6 --- /dev/null +++ b/api/args/telemetry/telemetry.go @@ -0,0 +1,63 @@ +package telemetry + +import ( + "encoding/json" + "errors" + "fmt" + + "github.com/masa-finance/tee-worker/api/args/base" + "github.com/masa-finance/tee-worker/api/types" +) + +var ( + ErrUnmarshalling = errors.New("failed to unmarshal telemetry arguments") +) + +type Telemetry = Arguments + +// Verify interface implementation +var _ base.JobArgument = (*Arguments)(nil) + +// Arguments defines args for Telemetry jobs +type Arguments struct { + Type types.Capability `json:"type"` +} + +func (t *Arguments) UnmarshalJSON(data []byte) error { + type Alias Arguments + aux := &struct{ *Alias }{Alias: (*Alias)(t)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("%w: %w", ErrUnmarshalling, err) + } + t.SetDefaultValues() + return t.Validate() +} + +func (t *Arguments) SetDefaultValues() { +} + +func (t *Arguments) Validate() error { + err := t.ValidateCapability(types.TelemetryJob) + if err != nil { + return err + } + return nil +} + +// GetCapability returns the capability of the arguments +func (t *Arguments) GetCapability() types.Capability { + return t.Type +} + +// ValidateCapability validates the capability of the arguments +func (t *Arguments) ValidateCapability(jobType types.JobType) error { + return jobType.ValidateCapability(&t.Type) +} + +// NewArguments creates a new Arguments instance and applies default values immediately +func NewArguments() Arguments { + args := Arguments{} + args.SetDefaultValues() + args.Validate() // This will set the default capability via ValidateCapability + return args +} diff --git a/api/args/telemetry/telemetry_suite_test.go b/api/args/telemetry/telemetry_suite_test.go new file mode 100644 index 0000000..9daa685 --- /dev/null +++ b/api/args/telemetry/telemetry_suite_test.go @@ -0,0 +1,13 @@ +package telemetry_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestArgs(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Args Suite") +} diff --git a/api/args/telemetry/telemetry_test.go b/api/args/telemetry/telemetry_test.go new file mode 100644 index 0000000..28dd580 --- /dev/null +++ b/api/args/telemetry/telemetry_test.go @@ -0,0 +1,113 @@ +package telemetry_test + +import ( + "encoding/json" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-worker/api/args/telemetry" + "github.com/masa-finance/tee-worker/api/types" +) + +var _ = Describe("Telemetry Arguments", func() { + Describe("Marshalling and unmarshalling", func() { + It("should set default values", func() { + args := &telemetry.Arguments{ + Type: types.CapTelemetry, + } + jsonData, err := json.Marshal(args) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.Type).To(Equal(types.CapTelemetry)) + }) + + It("should preserve custom values", func() { + args := &telemetry.Arguments{ + Type: types.CapTelemetry, + } + jsonData, err := json.Marshal(args) + Expect(err).ToNot(HaveOccurred()) + err = json.Unmarshal([]byte(jsonData), &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.Type).To(Equal(types.CapTelemetry)) + }) + + It("should handle invalid JSON", func() { + args := &telemetry.Arguments{} + invalidJSON := `{"type": "telemetry", "invalid": }` + err := json.Unmarshal([]byte(invalidJSON), args) + Expect(err).To(HaveOccurred()) + // The error should be a JSON syntax error, not our custom error + Expect(err).To(BeAssignableToTypeOf(&json.SyntaxError{})) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid arguments", func() { + args := &telemetry.Arguments{ + Type: types.CapTelemetry, + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should succeed with empty arguments", func() { + args := &telemetry.Arguments{} + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + }) + + Describe("GetCapability", func() { + It("should return the telemetry capability", func() { + args := &telemetry.Arguments{ + Type: types.CapTelemetry, + } + Expect(args.GetCapability()).To(Equal(types.CapTelemetry)) + }) + + It("should return empty capability for uninitialized arguments", func() { + args := &telemetry.Arguments{} + Expect(args.GetCapability()).To(Equal(types.Capability(""))) + }) + }) + + Describe("ValidateCapability", func() { + It("should succeed with valid job type and capability", func() { + args := &telemetry.Arguments{ + Type: types.CapTelemetry, + } + err := args.ValidateCapability(types.TelemetryJob) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail with invalid job type", func() { + args := &telemetry.Arguments{ + Type: types.CapTelemetry, + } + err := args.ValidateCapability(types.LinkedInJob) + Expect(err).To(HaveOccurred()) + }) + + It("should fail with invalid capability", func() { + args := &telemetry.Arguments{ + Type: types.CapSearchPosts, // Wrong capability + } + err := args.ValidateCapability(types.TelemetryJob) + Expect(err).To(HaveOccurred()) + }) + }) + + Describe("SetDefaultValues", func() { + It("should not modify arguments", func() { + args := &telemetry.Arguments{ + Type: types.CapTelemetry, + } + originalType := args.Type + args.SetDefaultValues() + Expect(args.Type).To(Equal(originalType)) + }) + }) +}) diff --git a/api/args/tiktok.go b/api/args/tiktok.go deleted file mode 100644 index 37c4aaa..0000000 --- a/api/args/tiktok.go +++ /dev/null @@ -1,243 +0,0 @@ -package args - -import ( - "encoding/json" - "errors" - "fmt" - "net/url" - "strings" - - teetypes "github.com/masa-finance/tee-worker/api/types" -) - -// Period constants for TikTok trending search -const ( - periodWeek string = "7" - periodMonth string = "30" -) - -const ( - sortTrending string = "vv" - sortLike string = "like" - sortComment string = "comment" - sortRepost string = 
"repost" -) - -// TikTokTranscriptionArguments defines args for TikTok transcriptions -type TikTokTranscriptionArguments struct { - VideoURL string `json:"video_url"` - Language string `json:"language,omitempty"` // e.g., "eng-US" -} - -// UnmarshalJSON implements custom JSON unmarshaling with validation -func (t *TikTokTranscriptionArguments) UnmarshalJSON(data []byte) error { - // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) - type Alias TikTokTranscriptionArguments - aux := &struct { - *Alias - }{ - Alias: (*Alias)(t), - } - - if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) - } - - return t.Validate() -} - -// Validate validates the TikTok arguments -func (t *TikTokTranscriptionArguments) Validate() error { - if t.VideoURL == "" { - return errors.New("video_url is required") - } - - // Validate URL format - parsedURL, err := url.Parse(t.VideoURL) - if err != nil { - return fmt.Errorf("invalid video_url format: %w", err) - } - - // Basic TikTok URL validation - if !t.IsTikTokURL(parsedURL) { - return errors.New("URL must be a valid TikTok video URL") - } - - // Validate language format if provided - if t.Language != "" { - if err := t.validateLanguageCode(); err != nil { - return err - } - } - - return nil -} - -// GetCapability returns the capability for TikTok operations (always transcription) -func (t *TikTokTranscriptionArguments) GetCapability() teetypes.Capability { - return teetypes.CapTranscription -} - -// IsTikTokURL validates if the URL is a TikTok URL -func (t *TikTokTranscriptionArguments) IsTikTokURL(parsedURL *url.URL) bool { - host := strings.ToLower(parsedURL.Host) - return host == "tiktok.com" || strings.HasSuffix(host, ".tiktok.com") -} - -// HasLanguagePreference returns true if a language preference is specified -func (t *TikTokTranscriptionArguments) HasLanguagePreference() bool { - return t.Language != "" -} - -// GetVideoURL returns the source video URL -func (t *TikTokTranscriptionArguments) GetVideoURL() string { - return t.VideoURL -} - -// GetLanguageCode returns the language code, defaulting to "en-us" if not specified -func (t *TikTokTranscriptionArguments) GetLanguageCode() string { - if t.Language == "" { - return "eng-US" - } - return t.Language -} - -// ValidateForJobType validates TikTok arguments for a specific job type -func (t *TikTokTranscriptionArguments) ValidateForJobType(jobType teetypes.JobType) error { - if err := t.Validate(); err != nil { - return err - } - - // Validate capability against job-specific capabilities - return jobType.ValidateCapability(t.GetCapability()) -} - -// validateLanguageCode validates the language code format -func (t *TikTokTranscriptionArguments) validateLanguageCode() error { - // Basic validation for language codes like "en-us", "eng-us", "es-es", etc. 
- parts := strings.Split(t.Language, "-") - if len(parts) != 2 { - return fmt.Errorf("invalid language format '%s', expected format: 'lang-region' (e.g., 'en-us' or 'eng-us')", t.Language) - } - - // Language code can be 2 or 3 letters, region must be 2 letters - if (len(parts[0]) != 2 && len(parts[0]) != 3) || len(parts[1]) != 2 { - return fmt.Errorf("invalid language format '%s', expected 2-3 letter language code and 2-letter region code", t.Language) - } - - return nil -} - -// TikTokSearchByQueryArguments defines args for epctex/tiktok-search-scraper -type TikTokSearchByQueryArguments struct { - QueryType string `json:"type"` - Search []string `json:"search,omitempty"` - StartUrls []string `json:"start_urls,omitempty"` - MaxItems uint `json:"max_items,omitempty"` - EndPage uint `json:"end_page,omitempty"` -} - -func (t *TikTokSearchByQueryArguments) UnmarshalJSON(data []byte) error { - // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) - type Alias TikTokSearchByQueryArguments - aux := &struct{ *Alias }{Alias: (*Alias)(t)} - if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal TikTok searchbyquery arguments: %w", err) - } - t.QueryType = strings.ToLower(t.QueryType) - return t.Validate() -} - -func (t *TikTokSearchByQueryArguments) Validate() error { - if len(t.Search) == 0 && len(t.StartUrls) == 0 { - return errors.New("either 'search' or 'start_urls' is required for searchbyquery") - } - return nil -} - -func (t *TikTokSearchByQueryArguments) ValidateForJobType(jobType teetypes.JobType) error { - if err := jobType.ValidateCapability(teetypes.CapSearchByQuery); err != nil { - return err - } - return t.Validate() -} - -func (t *TikTokSearchByQueryArguments) GetCapability() teetypes.Capability { - return teetypes.CapSearchByQuery -} - -// TikTokSearchByTrendingArguments defines args for lexis-solutions/tiktok-trending-videos-scraper -type TikTokSearchByTrendingArguments struct { - QueryType string `json:"type"` - CountryCode string `json:"country_code,omitempty"` - SortBy string `json:"sort_by,omitempty"` - MaxItems int `json:"max_items,omitempty"` - Period string `json:"period,omitempty"` -} - -func (t *TikTokSearchByTrendingArguments) UnmarshalJSON(data []byte) error { - // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) 
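The trending arguments handled below default country_code to "US", sort_by to "vv", and period to "7"; the replacement package introduced later in this patch keeps those defaults and applies them during unmarshalling. A hypothetical round trip, assuming the capability check accepts "searchbytrending" for TikTok jobs (payload values are made up):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/masa-finance/tee-worker/api/args/tiktok/trending"
)

func main() {
	var args trending.Arguments
	payload := []byte(`{"type":"searchbytrending","country_code":"GB"}`)
	if err := json.Unmarshal(payload, &args); err != nil {
		fmt.Println("invalid payload:", err)
		return
	}
	// Omitted fields picked up the defaults during UnmarshalJSON.
	fmt.Println(args.CountryCode, args.SortBy, args.Period) // GB vv 7
}
```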
- type Alias TikTokSearchByTrendingArguments - aux := &struct{ *Alias }{Alias: (*Alias)(t)} - if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal TikTok searchbytrending arguments: %w", err) - } - t.QueryType = strings.ToLower(t.QueryType) - if t.CountryCode == "" { - t.CountryCode = "US" - } - if t.SortBy == "" { - t.SortBy = sortTrending - } - if t.Period == "" { - t.Period = periodWeek - } - return t.Validate() -} - -func (t *TikTokSearchByTrendingArguments) Validate() error { - allowedSorts := map[string]struct{}{ - sortTrending: {}, sortLike: {}, sortComment: {}, sortRepost: {}, - } - - allowedPeriods := map[string]struct{}{ - periodWeek: {}, - periodMonth: {}, - } - - allowedCountries := map[string]struct{}{ - "AU": {}, "BR": {}, "CA": {}, "EG": {}, "FR": {}, "DE": {}, "ID": {}, "IL": {}, "IT": {}, "JP": {}, - "MY": {}, "PH": {}, "RU": {}, "SA": {}, "SG": {}, "KR": {}, "ES": {}, "TW": {}, "TH": {}, "TR": {}, - "AE": {}, "GB": {}, "US": {}, "VN": {}, - } - - if _, ok := allowedCountries[strings.ToUpper(t.CountryCode)]; !ok { - return fmt.Errorf("invalid country_code '%s'", t.CountryCode) - } - if _, ok := allowedSorts[strings.ToLower(t.SortBy)]; !ok { - return fmt.Errorf("invalid sort_by '%s'", t.SortBy) - } - if _, ok := allowedPeriods[t.Period]; !ok { - // Extract keys for error message - var validKeys []string - for key := range allowedPeriods { - validKeys = append(validKeys, key) - } - return fmt.Errorf("invalid period '%s' (allowed: %s)", t.Period, strings.Join(validKeys, ", ")) - } - if t.MaxItems < 0 { - return fmt.Errorf("max_items must be non-negative, got: %d", t.MaxItems) - } - return nil -} - -func (t *TikTokSearchByTrendingArguments) ValidateForJobType(jobType teetypes.JobType) error { - if err := jobType.ValidateCapability(teetypes.CapSearchByTrending); err != nil { - return err - } - return t.Validate() -} - -func (t *TikTokSearchByTrendingArguments) GetCapability() teetypes.Capability { - return teetypes.CapSearchByTrending -} diff --git a/api/args/tiktok/query/query.go b/api/args/tiktok/query/query.go new file mode 100644 index 0000000..3733997 --- /dev/null +++ b/api/args/tiktok/query/query.go @@ -0,0 +1,74 @@ +package query + +import ( + "encoding/json" + "errors" + "fmt" + + "github.com/masa-finance/tee-worker/api/args/base" + "github.com/masa-finance/tee-worker/api/types" +) + +var ( + ErrSearchOrUrlsRequired = errors.New("either 'search' or 'start_urls' are required") + ErrUnmarshalling = errors.New("failed to unmarshal TikTok searchbyquery arguments") +) + +const ( + DefaultMaxItems = 10 + DefaultType = types.CapSearchByQuery +) + +// Verify interface implementation +var _ base.JobArgument = (*Arguments)(nil) + +type Arguments struct { + Type types.Capability `json:"type"` + Search []string `json:"search,omitempty"` + StartUrls []string `json:"start_urls,omitempty"` + MaxItems uint `json:"max_items,omitempty"` + EndPage uint `json:"end_page,omitempty"` +} + +func (t *Arguments) UnmarshalJSON(data []byte) error { + type Alias Arguments + aux := &struct{ *Alias }{Alias: (*Alias)(t)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("%w: %w", ErrUnmarshalling, err) + } + t.SetDefaultValues() + return t.Validate() +} + +func (t *Arguments) SetDefaultValues() { + if t.MaxItems == 0 { + t.MaxItems = DefaultMaxItems + } +} + +func (t *Arguments) GetCapability() types.Capability { + return t.Type +} + +func (t *Arguments) ValidateCapability(jobType types.JobType) error { + return 
jobType.ValidateCapability(&t.Type) +} + +func (t *Arguments) Validate() error { + err := t.ValidateCapability(types.TiktokJob) + if err != nil { + return err + } + if len(t.Search) == 0 && len(t.StartUrls) == 0 { + return ErrSearchOrUrlsRequired + } + return nil +} + +func NewArguments() Arguments { + args := Arguments{ + Type: types.CapSearchByQuery, + } + args.SetDefaultValues() + return args +} diff --git a/api/args/tiktok/query/query_suite_test.go b/api/args/tiktok/query/query_suite_test.go new file mode 100644 index 0000000..484865c --- /dev/null +++ b/api/args/tiktok/query/query_suite_test.go @@ -0,0 +1,13 @@ +package query_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestArgs(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Args Suite") +} diff --git a/api/args/tiktok/query/query_test.go b/api/args/tiktok/query/query_test.go new file mode 100644 index 0000000..454c76f --- /dev/null +++ b/api/args/tiktok/query/query_test.go @@ -0,0 +1,205 @@ +package query_test + +import ( + "encoding/json" + "errors" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/masa-finance/tee-worker/api/args/tiktok/query" + "github.com/masa-finance/tee-worker/api/types" +) + +var _ = Describe("TikTokQueryArguments", func() { + Describe("Marshalling and unmarshalling", func() { + It("should unmarshal valid arguments with search", func() { + var args query.Arguments + jsonData := []byte(`{"type":"searchbyquery","search":["test query","another query"],"max_items":20}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.Search).To(Equal([]string{"test query", "another query"})) + Expect(args.MaxItems).To(Equal(uint(20))) + }) + + It("should unmarshal valid arguments with start_urls", func() { + var args query.Arguments + jsonData := []byte(`{"type":"searchbyquery","start_urls":["https://tiktok.com/@user1","https://tiktok.com/@user2"],"max_items":15}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.StartUrls).To(Equal([]string{"https://tiktok.com/@user1", "https://tiktok.com/@user2"})) + Expect(args.MaxItems).To(Equal(uint(15))) + }) + + It("should unmarshal valid arguments with both search and start_urls", func() { + var args query.Arguments + jsonData := []byte(`{"type":"searchbyquery","search":["test"],"start_urls":["https://tiktok.com/@user"],"max_items":5}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.Search).To(Equal([]string{"test"})) + Expect(args.StartUrls).To(Equal([]string{"https://tiktok.com/@user"})) + Expect(args.MaxItems).To(Equal(uint(5))) + }) + + It("should unmarshal valid arguments without max_items (should use default)", func() { + var args query.Arguments + jsonData := []byte(`{"type":"searchbyquery","search":["test query"]}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.Search).To(Equal([]string{"test query"})) + Expect(args.MaxItems).To(Equal(uint(10))) // Default value + }) + + It("should fail unmarshal with invalid JSON", func() { + var args query.Arguments + jsonData := []byte(`{"type":"searchbyquery","search":["test query"`) + err := json.Unmarshal(jsonData, &args) + Expect(err).To(HaveOccurred()) + }) + + It("should fail unmarshal when neither search nor start_urls are provided", func() { + var args query.Arguments + jsonData := []byte(`{"type":"searchbyquery","max_items":10}`) + err := json.Unmarshal(jsonData, 
&args) + Expect(errors.Is(err, query.ErrSearchOrUrlsRequired)).To(BeTrue()) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid search arguments", func() { + args := query.NewArguments() + args.Search = []string{"test query", "another query"} + args.MaxItems = 20 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should succeed with valid start_urls arguments", func() { + args := query.NewArguments() + args.StartUrls = []string{"https://tiktok.com/@user1", "https://tiktok.com/@user2"} + args.MaxItems = 15 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should succeed with both search and start_urls", func() { + args := query.NewArguments() + args.Search = []string{"test"} + args.StartUrls = []string{"https://tiktok.com/@user"} + args.MaxItems = 5 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail when both search and start_urls are empty", func() { + args := query.NewArguments() + args.MaxItems = 10 + err := args.Validate() + Expect(errors.Is(err, query.ErrSearchOrUrlsRequired)).To(BeTrue()) + }) + + It("should fail when search is empty slice", func() { + args := query.NewArguments() + args.Search = []string{} + args.MaxItems = 10 + err := args.Validate() + Expect(errors.Is(err, query.ErrSearchOrUrlsRequired)).To(BeTrue()) + }) + + It("should fail when start_urls is empty slice", func() { + args := query.NewArguments() + args.StartUrls = []string{} + args.MaxItems = 10 + err := args.Validate() + Expect(errors.Is(err, query.ErrSearchOrUrlsRequired)).To(BeTrue()) + }) + }) + + Describe("Default values", func() { + It("should set default max_items when not provided", func() { + args := query.NewArguments() + args.Search = []string{"test"} + args.SetDefaultValues() + Expect(args.MaxItems).To(Equal(uint(10))) + }) + + It("should not override existing max_items", func() { + args := query.NewArguments() + args.Search = []string{"test"} + args.MaxItems = 25 + args.SetDefaultValues() + Expect(args.MaxItems).To(Equal(uint(25))) + }) + + It("should not override zero max_items if explicitly set", func() { + args := query.NewArguments() + args.Search = []string{"test"} + args.MaxItems = 0 + args.SetDefaultValues() + Expect(args.MaxItems).To(Equal(uint(10))) // Should set default + }) + }) + + Describe("Job capability", func() { + It("should return the searchbyquery capability", func() { + args := query.NewArguments() + Expect(args.GetCapability()).To(Equal(types.CapSearchByQuery)) + }) + + It("should validate capability for TiktokJob", func() { + args := query.NewArguments() + args.Search = []string{"test query"} + args.MaxItems = 10 + err := args.ValidateCapability(types.TiktokJob) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail validation for incompatible job type", func() { + args := query.NewArguments() + args.Search = []string{"test query"} + args.MaxItems = 10 + // Set a different capability first + args.Type = types.CapTranscription + err := args.ValidateCapability(types.TwitterJob) + Expect(err).To(HaveOccurred()) + // The capability should remain unchanged + Expect(args.Type).To(Equal(types.CapTranscription)) + }) + }) + + Describe("Edge cases", func() { + It("should handle empty search strings", func() { + args := query.NewArguments() + args.Search = []string{"", "valid query", ""} + args.MaxItems = 10 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should handle empty start_urls strings", func() { + args := query.NewArguments() + args.StartUrls 
= []string{"", "https://tiktok.com/@user", ""} + args.MaxItems = 10 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should handle large max_items values", func() { + args := query.NewArguments() + args.Search = []string{"test"} + args.MaxItems = 1000 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should handle end_page field", func() { + args := query.NewArguments() + args.Search = []string{"test"} + args.MaxItems = 10 + args.EndPage = 5 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + Expect(args.EndPage).To(Equal(uint(5))) + }) + }) +}) diff --git a/api/args/tiktok/tiktok.go b/api/args/tiktok/tiktok.go new file mode 100644 index 0000000..3779e44 --- /dev/null +++ b/api/args/tiktok/tiktok.go @@ -0,0 +1,11 @@ +package tiktok + +import ( + "github.com/masa-finance/tee-worker/api/args/tiktok/query" + "github.com/masa-finance/tee-worker/api/args/tiktok/transcription" + "github.com/masa-finance/tee-worker/api/args/tiktok/trending" +) + +type Transcription = transcription.Arguments +type Query = query.Arguments +type Trending = trending.Arguments diff --git a/api/args/tiktok/transcription/transcription.go b/api/args/tiktok/transcription/transcription.go new file mode 100644 index 0000000..03356fa --- /dev/null +++ b/api/args/tiktok/transcription/transcription.go @@ -0,0 +1,127 @@ +package transcription + +import ( + "encoding/json" + "errors" + "fmt" + "net/url" + "strings" + + "github.com/masa-finance/tee-worker/api/args/base" + "github.com/masa-finance/tee-worker/api/types" +) + +var ( + ErrVideoURLRequired = errors.New("video_url is required") + ErrInvalidVideoURL = errors.New("invalid video_url format") + ErrInvalidTikTokURL = errors.New("url must be a valid TikTok video URL") + ErrInvalidLanguageCode = errors.New("invalid language code") + ErrUnmarshalling = errors.New("failed to unmarshal TikTok transcription arguments") +) + +const ( + DefaultLanguage = "eng-US" +) + +// Verify interface implementation +var _ base.JobArgument = (*Arguments)(nil) + +// Arguments defines args for TikTok transcriptions +type Arguments struct { + Type types.Capability `json:"type"` + VideoURL string `json:"video_url"` + Language string `json:"language,omitempty"` +} + +func (a *Arguments) UnmarshalJSON(data []byte) error { + type Alias Arguments + aux := &struct{ *Alias }{Alias: (*Alias)(a)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("%w: %w", ErrUnmarshalling, err) + } + a.SetDefaultValues() + return a.Validate() +} + +func (a *Arguments) SetDefaultValues() { + if a.Language == "" { + a.Language = DefaultLanguage + } +} + +// Validate validates the TikTok arguments +func (t *Arguments) Validate() error { + err := t.ValidateCapability(types.TiktokJob) + if err != nil { + return err + } + if t.VideoURL == "" { + return ErrVideoURLRequired + } + + // Validate URL format + parsedURL, err := url.Parse(t.VideoURL) + if err != nil { + return fmt.Errorf("%w: %v", ErrInvalidVideoURL, err) + } + + // Basic TikTok URL validation + if !t.IsTikTokURL(parsedURL) { + return ErrInvalidTikTokURL + } + + // Validate language format if provided + if t.Language != "" { + if err := t.validateLanguageCode(); err != nil { + return err + } + } + + return nil +} + +func (t *Arguments) GetCapability() types.Capability { + return t.Type +} + +func (t *Arguments) ValidateCapability(jobType types.JobType) error { + return jobType.ValidateCapability(&t.Type) +} + +// IsTikTokURL validates if the URL is a TikTok URL +func (t *Arguments) 
IsTikTokURL(parsedURL *url.URL) bool {
+	host := strings.ToLower(parsedURL.Host)
+	return host == "tiktok.com" || strings.HasSuffix(host, ".tiktok.com")
+}
+
+// HasLanguagePreference returns true if a language preference is specified
+func (t *Arguments) HasLanguagePreference() bool {
+	return t.Language != ""
+}
+
+// GetVideoURL returns the source video URL
+func (t *Arguments) GetVideoURL() string {
+	return t.VideoURL
+}
+
+// GetLanguageCode returns the language code; the "eng-US" default has already
+// been applied by SetDefaultValues when none was specified
+func (t *Arguments) GetLanguageCode() string {
+	return t.Language
+}
+
+// validateLanguageCode validates the language code format
+func (t *Arguments) validateLanguageCode() error {
+	parts := strings.Split(t.Language, "-")
+	if len(parts) != 2 || (len(parts[0]) != 2 && len(parts[0]) != 3) || len(parts[1]) != 2 {
+		return fmt.Errorf("%w: %s", ErrInvalidLanguageCode, t.Language)
+	}
+	return nil
+}
+
+func NewArguments() Arguments {
+	args := Arguments{
+		Type: types.CapTranscription,
+	}
+	args.SetDefaultValues()
+	return args
+}
diff --git a/api/args/tiktok/transcription/transcription_suite_test.go b/api/args/tiktok/transcription/transcription_suite_test.go
new file mode 100644
index 0000000..a9d7709
--- /dev/null
+++ b/api/args/tiktok/transcription/transcription_suite_test.go
@@ -0,0 +1,13 @@
+package transcription_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestArgs(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Args Suite")
+}
diff --git a/api/args/tiktok/transcription/transcription_test.go b/api/args/tiktok/transcription/transcription_test.go
new file mode 100644
index 0000000..50f5d6e
--- /dev/null
+++ b/api/args/tiktok/transcription/transcription_test.go
@@ -0,0 +1,242 @@
+package transcription_test
+
+import (
+	"encoding/json"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. 
"github.com/onsi/gomega" + + "github.com/masa-finance/tee-worker/api/args/tiktok/transcription" + "github.com/masa-finance/tee-worker/api/types" +) + +var _ = Describe("TikTokTranscriptionArguments", func() { + Describe("Marshalling and unmarshalling", func() { + It("should unmarshal valid arguments", func() { + var args transcription.Arguments + jsonData := []byte(`{"type":"transcription","video_url":"https://tiktok.com/@user/video/123","language":"en-us"}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.VideoURL).To(Equal("https://tiktok.com/@user/video/123")) + Expect(args.Language).To(Equal("en-us")) + }) + + It("should unmarshal valid arguments without language", func() { + var args transcription.Arguments + jsonData := []byte(`{"type":"transcription","video_url":"https://tiktok.com/@user/video/123"}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.VideoURL).To(Equal("https://tiktok.com/@user/video/123")) + Expect(args.Language).To(Equal("eng-US")) // Default language should be set + }) + + It("should fail unmarshal with invalid JSON", func() { + var args transcription.Arguments + jsonData := []byte(`{"type":"transcription","video_url":"https://tiktok.com/@user/video/123"`) + err := json.Unmarshal(jsonData, &args) + Expect(err).To(HaveOccurred()) + }) + + It("should fail unmarshal when video_url is missing", func() { + var args transcription.Arguments + jsonData := []byte(`{"type":"transcription","language":"en-us"}`) + err := json.Unmarshal(jsonData, &args) + Expect(errors.Is(err, transcription.ErrVideoURLRequired)).To(BeTrue()) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid arguments", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "en-us" + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should succeed with valid arguments without language", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "" + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail when video_url is missing", func() { + args := transcription.NewArguments() + args.Language = "en-us" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrVideoURLRequired)).To(BeTrue()) + }) + + It("should fail with an invalid URL format", func() { + args := transcription.NewArguments() + args.VideoURL = "not-a-url" + args.Language = "en-us" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrInvalidTikTokURL)).To(BeTrue()) + }) + + It("should fail with non-TikTok URL", func() { + args := transcription.NewArguments() + args.VideoURL = "https://youtube.com/watch?v=123" + args.Language = "en-us" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrInvalidTikTokURL)).To(BeTrue()) + }) + + It("should fail with invalid language code format", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "invalid" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrInvalidLanguageCode)).To(BeTrue()) + }) + }) + + Describe("TikTok URL validation", func() { + It("should accept tiktok.com URLs", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should accept www.tiktok.com URLs", func() { + args := 
transcription.NewArguments() + args.VideoURL = "https://www.tiktok.com/@user/video/123" + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should accept m.tiktok.com URLs", func() { + args := transcription.NewArguments() + args.VideoURL = "https://m.tiktok.com/@user/video/123" + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should reject non-TikTok URLs", func() { + args := transcription.NewArguments() + args.VideoURL = "https://youtube.com/watch?v=123" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrInvalidTikTokURL)).To(BeTrue()) + }) + }) + + Describe("Language code validation", func() { + It("should accept valid 2-letter language codes", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "en-us" + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should accept valid 3-letter language codes", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "eng-us" + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should accept mixed case language codes", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "EN-US" + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should reject invalid language format", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "english" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrInvalidLanguageCode)).To(BeTrue()) + }) + + It("should reject too many parts", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "en-us-extra" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrInvalidLanguageCode)).To(BeTrue()) + }) + + It("should reject too few parts", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "en" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrInvalidLanguageCode)).To(BeTrue()) + }) + + It("should reject invalid region length", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "en-usa" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrInvalidLanguageCode)).To(BeTrue()) + }) + + It("should reject invalid language length", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "english-us" + err := args.Validate() + Expect(errors.Is(err, transcription.ErrInvalidLanguageCode)).To(BeTrue()) + }) + }) + + Describe("Job capability", func() { + It("should return the transcription capability", func() { + args := transcription.NewArguments() + Expect(args.GetCapability()).To(Equal(types.CapTranscription)) + }) + + It("should validate capability for TiktokJob", func() { + args := transcription.NewArguments() + args.VideoURL = "https://tiktok.com/@user/video/123" + args.Language = "en-us" + err := args.ValidateCapability(types.TiktokJob) + Expect(err).ToNot(HaveOccurred()) + }) + }) + + Describe("Helper methods", func() { + It("should return true when language preference is set", func() { + args := transcription.NewArguments() + args.Language = "en-us" + Expect(args.HasLanguagePreference()).To(BeTrue()) + }) + + It("should return 
false when language preference is not set", func() { + args := transcription.NewArguments() + args.Language = "" + Expect(args.HasLanguagePreference()).To(BeFalse()) + }) + + It("should return the language code when set", func() { + args := transcription.NewArguments() + args.Language = "en-us" + Expect(args.GetLanguageCode()).To(Equal("en-us")) + }) + + It("should return default language code when not set", func() { + args := transcription.NewArguments() + args.Language = "" + args.SetDefaultValues() + Expect(args.GetLanguageCode()).To(Equal("eng-US")) + }) + + It("should return the video URL", func() { + expected := "https://tiktok.com/@user/video/123" + args := transcription.NewArguments() + args.VideoURL = expected + Expect(args.GetVideoURL()).To(Equal(expected)) + }) + }) +}) diff --git a/api/args/tiktok/trending/trending.go b/api/args/tiktok/trending/trending.go new file mode 100644 index 0000000..00435e7 --- /dev/null +++ b/api/args/tiktok/trending/trending.go @@ -0,0 +1,122 @@ +package trending + +import ( + "encoding/json" + "errors" + "fmt" + "strings" + + "github.com/masa-finance/tee-worker/api/args/base" + "github.com/masa-finance/tee-worker/api/types" +) + +var ( + ErrTrendingCountryCodeRequired = errors.New("country_code is required") + ErrTrendingSortByRequired = errors.New("sort_by is required") + ErrTrendingPeriodRequired = errors.New("period is required") + ErrTrendingMaxItemsNegative = errors.New("max_items must be non-negative") + ErrUnmarshalling = errors.New("failed to unmarshal TikTok searchbytrending arguments") +) + +// Period constants for TikTok trending search +const ( + periodWeek string = "7" + periodMonth string = "30" +) + +const ( + sortTrending string = "vv" + sortLike string = "like" + sortComment string = "comment" + sortRepost string = "repost" +) + +// Verify interface implementation +var _ base.JobArgument = (*Arguments)(nil) + +// Arguments defines args for lexis-solutions/tiktok-trending-videos-scraper +type Arguments struct { + Type types.Capability `json:"type"` + CountryCode string `json:"country_code,omitempty"` + SortBy string `json:"sort_by,omitempty"` + MaxItems int `json:"max_items,omitempty"` + Period string `json:"period,omitempty"` +} + +func (t *Arguments) UnmarshalJSON(data []byte) error { + type Alias Arguments + aux := &struct{ *Alias }{Alias: (*Alias)(t)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("%w: %w", ErrUnmarshalling, err) + } + t.SetDefaultValues() + return t.Validate() +} + +func (a *Arguments) SetDefaultValues() { + if a.CountryCode == "" { + a.CountryCode = "US" + } + if a.SortBy == "" { + a.SortBy = sortTrending + } + if a.Period == "" { + a.Period = periodWeek + } +} + +func (t *Arguments) GetCapability() types.Capability { + return t.Type +} + +func (t *Arguments) ValidateCapability(jobType types.JobType) error { + return jobType.ValidateCapability(&t.Type) +} + +func (t *Arguments) Validate() error { + err := t.ValidateCapability(types.TiktokJob) + if err != nil { + return err + } + allowedSorts := map[string]struct{}{ + sortTrending: {}, sortLike: {}, sortComment: {}, sortRepost: {}, + } + + allowedPeriods := map[string]struct{}{ + periodWeek: {}, + periodMonth: {}, + } + + allowedCountries := map[string]struct{}{ + "AU": {}, "BR": {}, "CA": {}, "EG": {}, "FR": {}, "DE": {}, "ID": {}, "IL": {}, "IT": {}, "JP": {}, + "MY": {}, "PH": {}, "RU": {}, "SA": {}, "SG": {}, "KR": {}, "ES": {}, "TW": {}, "TH": {}, "TR": {}, + "AE": {}, "GB": {}, "US": {}, "VN": {}, + } + + if _, ok := 
allowedCountries[strings.ToUpper(t.CountryCode)]; !ok { + return fmt.Errorf("%w: '%s'", ErrTrendingCountryCodeRequired, t.CountryCode) + } + if _, ok := allowedSorts[strings.ToLower(t.SortBy)]; !ok { + return fmt.Errorf("%w: '%s'", ErrTrendingSortByRequired, t.SortBy) + } + if _, ok := allowedPeriods[t.Period]; !ok { + // Extract keys for error message + var validKeys []string + for key := range allowedPeriods { + validKeys = append(validKeys, key) + } + return fmt.Errorf("%w: '%s' (allowed: %s)", ErrTrendingPeriodRequired, t.Period, strings.Join(validKeys, ", ")) + } + if t.MaxItems < 0 { + return fmt.Errorf("%w, got: %d", ErrTrendingMaxItemsNegative, t.MaxItems) + } + return nil +} + +func NewArguments() Arguments { + args := Arguments{ + Type: types.CapSearchByTrending, + } + args.SetDefaultValues() + return args +} diff --git a/api/args/tiktok/trending/trending_suite_test.go b/api/args/tiktok/trending/trending_suite_test.go new file mode 100644 index 0000000..29eee1e --- /dev/null +++ b/api/args/tiktok/trending/trending_suite_test.go @@ -0,0 +1,13 @@ +package trending_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestArgs(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Args Suite") +} diff --git a/api/args/tiktok/trending/trending_test.go b/api/args/tiktok/trending/trending_test.go new file mode 100644 index 0000000..cf95b07 --- /dev/null +++ b/api/args/tiktok/trending/trending_test.go @@ -0,0 +1,422 @@ +package trending_test + +import ( + "encoding/json" + "errors" + "strings" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/masa-finance/tee-worker/api/args/tiktok/trending" + "github.com/masa-finance/tee-worker/api/types" +) + +var _ = Describe("TikTokTrendingArguments", func() { + Describe("Marshalling and unmarshalling", func() { + It("should unmarshal valid arguments with all fields", func() { + var args trending.Arguments + jsonData := []byte(`{"type":"searchbytrending","country_code":"US","sort_by":"vv","max_items":50,"period":"7"}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.Type).To(Equal(types.CapSearchByTrending)) + Expect(args.CountryCode).To(Equal("US")) + Expect(args.SortBy).To(Equal("vv")) + Expect(args.MaxItems).To(Equal(50)) + Expect(args.Period).To(Equal("7")) + }) + + It("should unmarshal valid arguments with minimal fields", func() { + var args trending.Arguments + jsonData := []byte(`{"type":"searchbytrending"}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.Type).To(Equal(types.CapSearchByTrending)) + Expect(args.CountryCode).To(Equal("US")) // Default + Expect(args.SortBy).To(Equal("vv")) // Default + Expect(args.Period).To(Equal("7")) // Default + Expect(args.MaxItems).To(Equal(0)) // No default for MaxItems + }) + + It("should fail unmarshal with invalid JSON", func() { + var args trending.Arguments + jsonData := []byte(`{"type":"searchbytrending","country_code":"US"`) + err := json.Unmarshal(jsonData, &args) + Expect(err).To(HaveOccurred()) + }) + + It("should fail unmarshal with invalid country code", func() { + var args trending.Arguments + jsonData := []byte(`{"type":"searchbytrending","country_code":"INVALID"}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).To(HaveOccurred()) + Expect(strings.Contains(err.Error(), "country_code is required")).To(BeTrue()) + }) + + It("should fail unmarshal with invalid sort_by", func() { + var args trending.Arguments + jsonData 
:= []byte(`{"type":"searchbytrending","sort_by":"invalid"}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).To(HaveOccurred()) + Expect(strings.Contains(err.Error(), "sort_by is required")).To(BeTrue()) + }) + + It("should fail unmarshal with invalid period", func() { + var args trending.Arguments + jsonData := []byte(`{"type":"searchbytrending","period":"invalid"}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).To(HaveOccurred()) + Expect(strings.Contains(err.Error(), "period is required")).To(BeTrue()) + }) + + It("should fail unmarshal with negative max_items", func() { + var args trending.Arguments + jsonData := []byte(`{"type":"searchbytrending","max_items":-1}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).To(HaveOccurred()) + Expect(strings.Contains(err.Error(), "max_items must be non-negative")).To(BeTrue()) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid arguments", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + MaxItems: 50, + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail with invalid country code", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "INVALID", + SortBy: "vv", + MaxItems: 50, + Period: "7", + } + err := args.Validate() + Expect(errors.Is(err, trending.ErrTrendingCountryCodeRequired)).To(BeTrue()) + }) + + It("should fail with invalid sort_by", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "invalid", + MaxItems: 50, + Period: "7", + } + err := args.Validate() + Expect(errors.Is(err, trending.ErrTrendingSortByRequired)).To(BeTrue()) + }) + + It("should fail with invalid period", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + MaxItems: 50, + Period: "invalid", + } + err := args.Validate() + Expect(errors.Is(err, trending.ErrTrendingPeriodRequired)).To(BeTrue()) + }) + + It("should fail with negative max_items", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + MaxItems: -1, + Period: "7", + } + err := args.Validate() + Expect(errors.Is(err, trending.ErrTrendingMaxItemsNegative)).To(BeTrue()) + }) + }) + + Describe("Default values", func() { + It("should set default country_code when not provided", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + SortBy: "vv", + Period: "7", + } + args.SetDefaultValues() + Expect(args.CountryCode).To(Equal("US")) + }) + + It("should set default sort_by when not provided", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + Period: "7", + } + args.SetDefaultValues() + Expect(args.SortBy).To(Equal("vv")) + }) + + It("should set default period when not provided", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + } + args.SetDefaultValues() + Expect(args.Period).To(Equal("7")) + }) + + It("should not override existing values", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "CA", + SortBy: "like", + Period: "30", + } + args.SetDefaultValues() + Expect(args.CountryCode).To(Equal("CA")) + Expect(args.SortBy).To(Equal("like")) + Expect(args.Period).To(Equal("30")) + }) + }) + + Describe("Country code validation", func() { + It("should accept valid country 
codes", func() { + validCountries := []string{"US", "CA", "GB", "AU", "DE", "FR", "JP", "KR", "BR"} + for _, country := range validCountries { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: country, + SortBy: "vv", + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred(), "Country %s should be valid", country) + } + }) + + It("should accept lowercase country codes", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "us", + SortBy: "vv", + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should reject invalid country codes", func() { + invalidCountries := []string{"INVALID", "XX", "123", ""} + for _, country := range invalidCountries { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: country, + SortBy: "vv", + Period: "7", + } + err := args.Validate() + Expect(err).To(HaveOccurred(), "Country %s should be invalid", country) + } + }) + }) + + Describe("Sort by validation", func() { + It("should accept valid sort options", func() { + validSorts := []string{"vv", "like", "comment", "repost"} + for _, sort := range validSorts { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: sort, + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred(), "Sort %s should be valid", sort) + } + }) + + It("should accept uppercase sort options", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "LIKE", + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should reject invalid sort options", func() { + invalidSorts := []string{"invalid", "views", "likes", ""} + for _, sort := range invalidSorts { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: sort, + Period: "7", + } + err := args.Validate() + Expect(err).To(HaveOccurred(), "Sort %s should be invalid", sort) + } + }) + }) + + Describe("Period validation", func() { + It("should accept valid periods", func() { + validPeriods := []string{"7", "30"} + for _, period := range validPeriods { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + Period: period, + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred(), "Period %s should be valid", period) + } + }) + + It("should reject invalid periods", func() { + invalidPeriods := []string{"1", "14", "60", "invalid", ""} + for _, period := range invalidPeriods { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + Period: period, + } + err := args.Validate() + Expect(err).To(HaveOccurred(), "Period %s should be invalid", period) + } + }) + }) + + Describe("MaxItems validation", func() { + It("should accept zero max_items", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + MaxItems: 0, + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should accept positive max_items", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + MaxItems: 100, + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should reject negative max_items", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + 
MaxItems: -1, + Period: "7", + } + err := args.Validate() + Expect(errors.Is(err, trending.ErrTrendingMaxItemsNegative)).To(BeTrue()) + }) + }) + + Describe("Job capability", func() { + It("should return the searchbytrending capability", func() { + args := trending.NewArguments() + Expect(args.GetCapability()).To(Equal(types.CapSearchByTrending)) + }) + + It("should validate capability for TiktokJob", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + MaxItems: 50, + Period: "7", + } + err := args.ValidateCapability(types.TiktokJob) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail validation for incompatible job type", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + MaxItems: 50, + Period: "7", + } + err := args.ValidateCapability(types.TwitterJob) + Expect(err).To(HaveOccurred()) + }) + }) + + Describe("Edge cases", func() { + It("should handle mixed case country codes", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "us", + SortBy: "vv", + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should handle mixed case sort options", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "LIKE", + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should handle large max_items values", func() { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: "US", + SortBy: "vv", + MaxItems: 10000, + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should handle all supported countries", func() { + supportedCountries := []string{ + "AU", "BR", "CA", "EG", "FR", "DE", "ID", "IL", "IT", "JP", + "MY", "PH", "RU", "SA", "SG", "KR", "ES", "TW", "TH", "TR", + "AE", "GB", "US", "VN", + } + for _, country := range supportedCountries { + args := &trending.Arguments{ + Type: types.CapSearchByTrending, + CountryCode: country, + SortBy: "vv", + Period: "7", + } + err := args.Validate() + Expect(err).ToNot(HaveOccurred(), "Country %s should be supported", country) + } + }) + }) +}) diff --git a/api/args/twitter.go b/api/args/twitter.go deleted file mode 100644 index 467df1b..0000000 --- a/api/args/twitter.go +++ /dev/null @@ -1,121 +0,0 @@ -package args - -import ( - "encoding/json" - "errors" - "fmt" - - teetypes "github.com/masa-finance/tee-worker/api/types" -) - -var ( - ErrTwitterCountNegative = errors.New("count must be non-negative") - ErrTwitterCountTooLarge = errors.New("count must be less than or equal to 1000") - ErrTwitterMaxResultsTooLarge = errors.New("max_results must be less than or equal to 1000") - ErrTwitterMaxResultsNegative = errors.New("max_results must be non-negative") -) - -const ( - TwitterMaxResults = 1000 -) - -// TwitterSearchArguments defines args for Twitter searches -type TwitterSearchArguments struct { - QueryType teetypes.Capability `json:"type"` // Optional, type of search - Query string `json:"query"` // Username or search query - Count int `json:"count"` - StartTime string `json:"start_time"` // Optional ISO timestamp - EndTime string `json:"end_time"` // Optional ISO timestamp - MaxResults int `json:"max_results"` // Optional, max number of results - NextCursor string `json:"next_cursor"` -} - -// UnmarshalJSON implements custom JSON unmarshaling with validation -func (t *TwitterSearchArguments) 
UnmarshalJSON(data []byte) error { - // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) - type Alias TwitterSearchArguments - aux := &struct { - *Alias - }{ - Alias: (*Alias)(t), - } - - if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal Twitter arguments: %w", err) - } - - return t.Validate() -} - -// Validate validates the Twitter arguments (general validation) -func (t *TwitterSearchArguments) Validate() error { - // note, query is not required for all capabilities - if t.Count < 0 { - return fmt.Errorf("%w, got: %d", ErrTwitterCountNegative, t.Count) - } - if t.Count > TwitterMaxResults { - return fmt.Errorf("%w, got: %d", ErrTwitterCountTooLarge, t.Count) - } - if t.MaxResults < 0 { - return fmt.Errorf("%w, got: %d", ErrTwitterMaxResultsNegative, t.MaxResults) - } - if t.MaxResults > TwitterMaxResults { - return fmt.Errorf("%w, got: %d", ErrTwitterMaxResultsTooLarge, t.MaxResults) - } - - return nil -} - -// ValidateForJobType validates Twitter arguments for a specific job type -func (t *TwitterSearchArguments) ValidateForJobType(jobType teetypes.JobType) error { - if err := t.Validate(); err != nil { - return err - } - - // Validate QueryType against job-specific capabilities - return jobType.ValidateCapability(teetypes.Capability(t.QueryType)) -} - -// GetCapability returns the QueryType as a typed Capability -func (t *TwitterSearchArguments) GetCapability() teetypes.Capability { - return teetypes.Capability(t.QueryType) -} - -func (t *TwitterSearchArguments) IsSingleTweetOperation() bool { - capability := t.GetCapability() - return capability == teetypes.CapGetById -} - -func (t *TwitterSearchArguments) IsMultipleTweetOperation() bool { - capability := t.GetCapability() - return capability == teetypes.CapSearchByQuery || - capability == teetypes.CapSearchByFullArchive || - capability == teetypes.CapGetHomeTweets || - capability == teetypes.CapGetForYouTweets || - capability == teetypes.CapGetTweets || - capability == teetypes.CapGetReplies || - capability == teetypes.CapGetMedia -} - -func (t *TwitterSearchArguments) IsSingleProfileOperation() bool { - capability := t.GetCapability() - return capability == teetypes.CapGetProfileById || - capability == teetypes.CapSearchByProfile -} - -func (t *TwitterSearchArguments) IsMultipleProfileOperation() bool { - capability := t.GetCapability() - return capability == teetypes.CapGetFollowing || - capability == teetypes.CapGetFollowers || - capability == teetypes.CapGetRetweeters -} - -func (t *TwitterSearchArguments) IsSingleSpaceOperation() bool { - capability := t.GetCapability() - return capability == teetypes.CapGetSpace -} - -func (t *TwitterSearchArguments) IsTrendsOperation() bool { - capability := t.GetCapability() - return capability == teetypes.CapGetTrends -} diff --git a/api/args/twitter/search/search.go b/api/args/twitter/search/search.go new file mode 100644 index 0000000..ef9643d --- /dev/null +++ b/api/args/twitter/search/search.go @@ -0,0 +1,125 @@ +package search + +import ( + "encoding/json" + "errors" + "fmt" + + "github.com/masa-finance/tee-worker/api/args/base" + "github.com/masa-finance/tee-worker/api/types" +) + +var ( + ErrCountNegative = errors.New("count must be non-negative") + ErrCountTooLarge = errors.New("count must be less than or equal to 1000") + ErrMaxResultsTooLarge = errors.New("max_results must be less than or equal to 1000") + ErrMaxResultsNegative = errors.New("max_results must be 
non-negative") + ErrUnmarshalling = errors.New("failed to unmarshal twitter search arguments") +) + +const ( + MaxResults = 1000 +) + +// Verify interface implementation +var _ base.JobArgument = (*Arguments)(nil) + +// Arguments defines args for Twitter searches +type Arguments struct { + Type types.Capability `json:"type"` + Query string `json:"query"` // Username or search query + Count int `json:"count"` + StartTime string `json:"start_time"` // Optional ISO timestamp + EndTime string `json:"end_time"` // Optional ISO timestamp + MaxResults int `json:"max_results"` // Optional, max number of results + NextCursor string `json:"next_cursor"` +} + +func (t *Arguments) UnmarshalJSON(data []byte) error { + type Alias Arguments + aux := &struct{ *Alias }{Alias: (*Alias)(t)} + if err := json.Unmarshal(data, aux); err != nil { + return fmt.Errorf("%w: %w", ErrUnmarshalling, err) + } + t.SetDefaultValues() + return t.Validate() +} + +// SetDefaultValues sets default values for the arguments +func (t *Arguments) SetDefaultValues() { + if t.MaxResults == 0 { + t.MaxResults = MaxResults + } +} + +// Validate validates the arguments (general validation) +func (t *Arguments) Validate() error { + // note, query is not required for all capabilities + err := t.ValidateCapability(types.TwitterJob) + if err != nil { + return err + } + if t.Count < 0 { + return fmt.Errorf("%w, got: %d", ErrCountNegative, t.Count) + } + if t.Count > MaxResults { + return fmt.Errorf("%w, got: %d", ErrCountTooLarge, t.Count) + } + if t.MaxResults < 0 { + return fmt.Errorf("%w, got: %d", ErrMaxResultsNegative, t.MaxResults) + } + if t.MaxResults > MaxResults { + return fmt.Errorf("%w, got: %d", ErrMaxResultsTooLarge, t.MaxResults) + } + + return nil +} + +func (t *Arguments) GetCapability() types.Capability { + return t.Type +} + +func (t *Arguments) ValidateCapability(jobType types.JobType) error { + return jobType.ValidateCapability(&t.Type) +} + +func (t *Arguments) IsSingleTweetOperation() bool { + return t.GetCapability() == types.CapGetById +} + +func (t *Arguments) IsMultipleTweetOperation() bool { + c := t.GetCapability() + return c == types.CapSearchByQuery || + c == types.CapSearchByFullArchive || + c == types.CapGetTweets || + c == types.CapGetReplies || + c == types.CapGetMedia +} + +func (t *Arguments) IsSingleProfileOperation() bool { + c := t.GetCapability() + return c == types.CapGetProfileById || + c == types.CapSearchByProfile +} + +func (t *Arguments) IsMultipleProfileOperation() bool { + c := t.GetCapability() + return c == types.CapGetFollowing || + c == types.CapGetFollowers || + c == types.CapGetRetweeters +} + +func (t *Arguments) IsSingleSpaceOperation() bool { + return t.GetCapability() == types.CapGetSpace +} + +func (t *Arguments) IsTrendsOperation() bool { + return t.GetCapability() == types.CapGetTrends +} + +func NewArguments() Arguments { + args := Arguments{} + args.SetDefaultValues() + args.Validate() // This will set the default capability via ValidateCapability + return args +} diff --git a/api/args/twitter/search/search_suite_test.go b/api/args/twitter/search/search_suite_test.go new file mode 100644 index 0000000..688d8ec --- /dev/null +++ b/api/args/twitter/search/search_suite_test.go @@ -0,0 +1,13 @@ +package search_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestArgs(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Args Suite") +} diff --git a/api/args/twitter/search/search_test.go b/api/args/twitter/search/search_test.go new file mode 100644 index 0000000..2be7f9b --- /dev/null +++ b/api/args/twitter/search/search_test.go @@ -0,0 +1,297 @@ +package search_test + +import ( + "encoding/json" + "errors" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/masa-finance/tee-worker/api/args/twitter/search" + "github.com/masa-finance/tee-worker/api/types" +) + +var _ = Describe("TwitterSearchArguments", func() { + Describe("Marshalling and unmarshalling", func() { + It("should unmarshal valid arguments with all fields", func() { + var args search.Arguments + jsonData := []byte(`{ + "type": "searchbyquery", + "query": "test query", + "count": 50, + "start_time": "2023-01-01T00:00:00Z", + "end_time": "2023-12-31T23:59:59Z", + "max_results": 100, + "next_cursor": "cursor123" + }`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.Query).To(Equal("test query")) + Expect(args.Count).To(Equal(50)) + Expect(args.StartTime).To(Equal("2023-01-01T00:00:00Z")) + Expect(args.EndTime).To(Equal("2023-12-31T23:59:59Z")) + Expect(args.MaxResults).To(Equal(100)) + Expect(args.NextCursor).To(Equal("cursor123")) + }) + + It("should unmarshal valid arguments with minimal fields", func() { + var args search.Arguments + jsonData := []byte(`{ + "type": "searchbyquery", + "query": "minimal test" + }`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + Expect(args.Query).To(Equal("minimal test")) + Expect(args.Count).To(Equal(0)) + Expect(args.MaxResults).To(Equal(1000)) // SetDefaultValues() sets this to MaxResults + }) + + It("should fail unmarshal with invalid JSON", func() { + var args search.Arguments + jsonData := []byte(`{"type":"searchbyquery","query":"test"`) + err := json.Unmarshal(jsonData, &args) + Expect(err).To(HaveOccurred()) + // The error is a JSON syntax error, not wrapped with ErrUnmarshalling + // since the JSON is malformed before reaching the custom UnmarshalJSON method + }) + + It("should set default values after unmarshalling", func() { + var args search.Arguments + jsonData := []byte(`{"type":"searchbyquery","query":"test"}`) + err := json.Unmarshal(jsonData, &args) + Expect(err).ToNot(HaveOccurred()) + // Default values should be set by SetDefaultValues() + Expect(args.GetCapability()).To(Equal(types.CapSearchByQuery)) + }) + }) + + Describe("Validation", func() { + It("should succeed with valid arguments", func() { + args := search.NewArguments() + args.Query = "test query" + args.Count = 50 + args.MaxResults = 100 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should fail when count is negative", func() { + args := search.NewArguments() + args.Query = "test query" + args.Count = -1 + err := args.Validate() + Expect(errors.Is(err, search.ErrCountNegative)).To(BeTrue()) + Expect(err.Error()).To(ContainSubstring("got: -1")) + }) + + It("should fail when count exceeds maximum", func() { + args := search.NewArguments() + args.Query = "test query" + args.Count = 1001 + err := args.Validate() + Expect(errors.Is(err, search.ErrCountTooLarge)).To(BeTrue()) + Expect(err.Error()).To(ContainSubstring("got: 1001")) + }) + + It("should fail when max_results is negative", func() { + args := search.NewArguments() + args.Query = "test query" + args.MaxResults = -1 + err := args.Validate() + 
Expect(errors.Is(err, search.ErrMaxResultsNegative)).To(BeTrue()) + Expect(err.Error()).To(ContainSubstring("got: -1")) + }) + + It("should fail when max_results exceeds maximum", func() { + args := search.NewArguments() + args.Query = "test query" + args.MaxResults = 1001 + err := args.Validate() + Expect(errors.Is(err, search.ErrMaxResultsTooLarge)).To(BeTrue()) + Expect(err.Error()).To(ContainSubstring("got: 1001")) + }) + + It("should succeed with count at maximum boundary", func() { + args := search.NewArguments() + args.Query = "test query" + args.Count = 1000 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + + It("should succeed with max_results at maximum boundary", func() { + args := search.NewArguments() + args.Query = "test query" + args.MaxResults = 1000 + err := args.Validate() + Expect(err).ToNot(HaveOccurred()) + }) + }) + + Describe("Operation Type Detection", func() { + Context("Single Tweet Operations", func() { + It("should identify getbyid as single tweet operation", func() { + args := search.NewArguments() + args.Type = types.CapGetById + Expect(args.IsSingleTweetOperation()).To(BeTrue()) + }) + + It("should not identify searchbyquery as single tweet operation", func() { + args := search.NewArguments() + // Type is already CapSearchByQuery from NewArguments() + Expect(args.IsSingleTweetOperation()).To(BeFalse()) + }) + }) + + Context("Multiple Tweet Operations", func() { + It("should identify searchbyquery as multiple tweet operation", func() { + args := search.NewArguments() + // Type is already CapSearchByQuery from NewArguments() + Expect(args.IsMultipleTweetOperation()).To(BeTrue()) + }) + + It("should identify searchbyfullarchive as multiple tweet operation", func() { + args := search.NewArguments() + args.Type = types.CapSearchByFullArchive + Expect(args.IsMultipleTweetOperation()).To(BeTrue()) + }) + + It("should identify gettweets as multiple tweet operation", func() { + args := search.NewArguments() + args.Type = types.CapGetTweets + Expect(args.IsMultipleTweetOperation()).To(BeTrue()) + }) + + It("should identify getreplies as multiple tweet operation", func() { + args := search.NewArguments() + args.Type = types.CapGetReplies + Expect(args.IsMultipleTweetOperation()).To(BeTrue()) + }) + + It("should identify getmedia as multiple tweet operation", func() { + args := search.NewArguments() + args.Type = types.CapGetMedia + Expect(args.IsMultipleTweetOperation()).To(BeTrue()) + }) + + It("should not identify getbyid as multiple tweet operation", func() { + args := search.NewArguments() + args.Type = types.CapGetById + Expect(args.IsMultipleTweetOperation()).To(BeFalse()) + }) + }) + + Context("Single Profile Operations", func() { + It("should identify getprofilebyid as single profile operation", func() { + args := search.NewArguments() + args.Type = types.CapGetProfileById + Expect(args.IsSingleProfileOperation()).To(BeTrue()) + }) + + It("should identify searchbyprofile as single profile operation", func() { + args := search.NewArguments() + args.Type = types.CapSearchByProfile + Expect(args.IsSingleProfileOperation()).To(BeTrue()) + }) + + It("should not identify getfollowers as single profile operation", func() { + args := search.NewArguments() + args.Type = types.CapGetFollowers + Expect(args.IsSingleProfileOperation()).To(BeFalse()) + }) + }) + + Context("Multiple Profile Operations", func() { + It("should identify getfollowing as multiple profile operation", func() { + args := search.NewArguments() + args.Type = types.CapGetFollowing + 
Expect(args.IsMultipleProfileOperation()).To(BeTrue()) + }) + + It("should identify getfollowers as multiple profile operation", func() { + args := search.NewArguments() + args.Type = types.CapGetFollowers + Expect(args.IsMultipleProfileOperation()).To(BeTrue()) + }) + + It("should identify getretweeters as multiple profile operation", func() { + args := search.NewArguments() + args.Type = types.CapGetRetweeters + Expect(args.IsMultipleProfileOperation()).To(BeTrue()) + }) + + It("should not identify getprofilebyid as multiple profile operation", func() { + args := search.NewArguments() + args.Type = types.CapGetProfileById + Expect(args.IsMultipleProfileOperation()).To(BeFalse()) + }) + }) + + Context("Single Space Operations", func() { + It("should identify getspace as single space operation", func() { + args := search.NewArguments() + args.Type = types.CapGetSpace + Expect(args.IsSingleSpaceOperation()).To(BeTrue()) + }) + + It("should not identify searchbyquery as single space operation", func() { + args := search.NewArguments() + // Type is already CapSearchByQuery from NewArguments() + Expect(args.IsSingleSpaceOperation()).To(BeFalse()) + }) + }) + + Context("Trends Operations", func() { + It("should identify gettrends as trends operation", func() { + args := search.NewArguments() + args.Type = types.CapGetTrends + Expect(args.IsTrendsOperation()).To(BeTrue()) + }) + + It("should not identify searchbyquery as trends operation", func() { + args := search.NewArguments() + // Type is already CapSearchByQuery from NewArguments() + Expect(args.IsTrendsOperation()).To(BeFalse()) + }) + }) + }) + + Describe("Constants and Error Values", func() { + It("should have correct MaxResults constant", func() { + Expect(search.MaxResults).To(Equal(1000)) + }) + + It("should have correct error messages", func() { + Expect(search.ErrCountNegative.Error()).To(Equal("count must be non-negative")) + Expect(search.ErrCountTooLarge.Error()).To(Equal("count must be less than or equal to 1000")) + Expect(search.ErrMaxResultsTooLarge.Error()).To(Equal("max_results must be less than or equal to 1000")) + Expect(search.ErrMaxResultsNegative.Error()).To(Equal("max_results must be non-negative")) + Expect(search.ErrUnmarshalling.Error()).To(Equal("failed to unmarshal twitter search arguments")) + }) + }) + + Describe("JSON Marshalling", func() { + It("should marshal arguments correctly", func() { + args := search.NewArguments() + args.Query = "test query" + args.Count = 50 + args.StartTime = "2023-01-01T00:00:00Z" + args.EndTime = "2023-12-31T23:59:59Z" + args.MaxResults = 100 + args.NextCursor = "cursor123" + jsonData, err := json.Marshal(args) + Expect(err).ToNot(HaveOccurred()) + + var unmarshalled search.Arguments + err = json.Unmarshal(jsonData, &unmarshalled) + Expect(err).ToNot(HaveOccurred()) + Expect(unmarshalled.Query).To(Equal(args.Query)) + Expect(unmarshalled.Count).To(Equal(args.Count)) + Expect(unmarshalled.StartTime).To(Equal(args.StartTime)) + Expect(unmarshalled.EndTime).To(Equal(args.EndTime)) + Expect(unmarshalled.MaxResults).To(Equal(args.MaxResults)) + Expect(unmarshalled.NextCursor).To(Equal(args.NextCursor)) + }) + }) +}) diff --git a/api/args/twitter/twitter.go b/api/args/twitter/twitter.go new file mode 100644 index 0000000..819c7da --- /dev/null +++ b/api/args/twitter/twitter.go @@ -0,0 +1,7 @@ +package twitter + +import ( + "github.com/masa-finance/tee-worker/api/args/twitter/search" +) + +type Search = search.Arguments diff --git a/api/args/unmarshaller.go b/api/args/unmarshaller.go 
index 737826e..a8bf65f 100644 --- a/api/args/unmarshaller.go +++ b/api/args/unmarshaller.go @@ -2,19 +2,31 @@ package args import ( "encoding/json" + "errors" "fmt" + "github.com/masa-finance/tee-worker/api/args/base" + "github.com/masa-finance/tee-worker/api/args/linkedin" + "github.com/masa-finance/tee-worker/api/args/reddit" + "github.com/masa-finance/tee-worker/api/args/telemetry" + "github.com/masa-finance/tee-worker/api/args/tiktok" + "github.com/masa-finance/tee-worker/api/args/twitter" + "github.com/masa-finance/tee-worker/api/args/web" "github.com/masa-finance/tee-worker/api/types" ) -// JobArguments defines the interface that all job arguments must implement -type JobArguments interface { - GetCapability() types.Capability -} +var ( + ErrUnknownJobType = errors.New("unknown job type") + ErrUnknownCapability = errors.New("unknown capability") + ErrFailedToUnmarshal = errors.New("failed to unmarshal job arguments") + ErrFailedToMarshal = errors.New("failed to marshal job arguments") +) + +type Args = map[string]any // UnmarshalJobArguments unmarshals job arguments from a generic map into the appropriate typed struct -// This works with both tee-indexer and tee-worker JobArguments types -func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArguments, error) { +// This works with both tee-indexer and tee-worker JobArgument types +func UnmarshalJobArguments(jobType types.JobType, args Args) (base.JobArgument, error) { switch jobType { case types.WebJob: return unmarshalWebArguments(args) @@ -22,159 +34,104 @@ func UnmarshalJobArguments(jobType types.JobType, args map[string]any) (JobArgum case types.TiktokJob: return unmarshalTikTokArguments(args) - case types.TwitterJob, types.TwitterCredentialJob, types.TwitterApiJob, types.TwitterApifyJob: - return unmarshalTwitterArguments(jobType, args) + case types.TwitterJob: + return unmarshalTwitterArguments(args) case types.LinkedInJob: - return unmarshalLinkedInArguments(jobType, args) + return unmarshalLinkedInArguments(args) case types.RedditJob: - return unmarshalRedditArguments(jobType, args) + return unmarshalRedditArguments(args) case types.TelemetryJob: - return &TelemetryJobArguments{}, nil + return unmarshalTelemetryArguments(args) default: - return nil, fmt.Errorf("unknown job type: %s", jobType) + return nil, fmt.Errorf("%w: %s", ErrUnknownJobType, jobType) } } // Helper functions for unmarshaling specific argument types -func unmarshalWebArguments(args map[string]any) (*WebArguments, error) { - webArgs := &WebArguments{} +func unmarshalWebArguments(args Args) (*web.Page, error) { + webArgs := &web.Page{} if err := unmarshalToStruct(args, webArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal web job arguments: %w", err) + return nil, fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } return webArgs, nil } -func unmarshalTikTokArguments(args map[string]any) (JobArguments, error) { - // Unmarshal minimally to read QueryType like we do for Twitter - minimal := &QueryTypeArgument{} - if err := unmarshalToStruct(args, minimal); err != nil { - return nil, fmt.Errorf("failed to unmarshal TikTok arguments: %w", err) - } - if minimal.QueryType == types.CapEmpty { - defaultCap, exists := types.JobDefaultCapabilityMap[types.TiktokJob] - if !exists { - return nil, fmt.Errorf("no default capability configured for job type: %s", types.TiktokJob) - } - minimal.QueryType = defaultCap +func unmarshalTikTokArguments(args Args) (base.JobArgument, error) { + minimal := base.Arguments{} + if err := 
unmarshalToStruct(args, &minimal); err != nil { + return nil, fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } - - switch minimal.QueryType { + switch minimal.Type { case types.CapSearchByQuery: - searchArgs := &TikTokSearchByQueryArguments{} + searchArgs := &tiktok.Query{} if err := unmarshalToStruct(args, searchArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal TikTok searchbyquery arguments: %w", err) - } - if err := searchArgs.ValidateForJobType(types.TiktokJob); err != nil { - return nil, fmt.Errorf("tiktok job validation failed: %w", err) + return nil, fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } return searchArgs, nil case types.CapSearchByTrending: - searchArgs := &TikTokSearchByTrendingArguments{} + searchArgs := &tiktok.Trending{} if err := unmarshalToStruct(args, searchArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal TikTok searchbytrending arguments: %w", err) - } - if err := searchArgs.ValidateForJobType(types.TiktokJob); err != nil { - return nil, fmt.Errorf("tiktok job validation failed: %w", err) + return nil, fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } return searchArgs, nil case types.CapTranscription: - transcriptionArgs := &TikTokTranscriptionArguments{} + transcriptionArgs := &tiktok.Transcription{} if err := unmarshalToStruct(args, transcriptionArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal TikTok transcription arguments: %w", err) - } - if err := transcriptionArgs.ValidateForJobType(types.TiktokJob); err != nil { - return nil, fmt.Errorf("tiktok job validation failed: %w", err) + return nil, fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } return transcriptionArgs, nil default: - return nil, fmt.Errorf("unknown tiktok type: %s", minimal.QueryType) + return nil, fmt.Errorf("%w: %s", ErrUnknownCapability, minimal.Type) } } -func unmarshalTwitterArguments(jobType types.JobType, args map[string]any) (*TwitterSearchArguments, error) { - twitterArgs := &TwitterSearchArguments{} +func unmarshalTwitterArguments(args Args) (*twitter.Search, error) { + twitterArgs := &twitter.Search{} if err := unmarshalToStruct(args, twitterArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal Twitter job arguments: %w", err) - } - - // If no QueryType is specified, use the default capability for this job type - if twitterArgs.QueryType == "" { - if defaultCap, exists := types.JobDefaultCapabilityMap[jobType]; exists { - twitterArgs.QueryType = defaultCap - } + return nil, fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } - - // Perform job-type-specific validation for Twitter - if err := twitterArgs.ValidateForJobType(jobType); err != nil { - return nil, fmt.Errorf("twitter job validation failed: %w", err) - } - return twitterArgs, nil } -func unmarshalLinkedInArguments(jobType types.JobType, args map[string]any) (JobArguments, error) { - minimal := &QueryTypeArgument{} - if err := unmarshalToStruct(args, minimal); err != nil { - return nil, fmt.Errorf("failed to unmarshal LinkedIn arguments: %w", err) - } - - if minimal.QueryType == types.CapEmpty { - if defaultCap, exists := types.JobDefaultCapabilityMap[jobType]; exists { - minimal.QueryType = defaultCap - } - } - - switch minimal.QueryType { - case types.CapSearchByProfile: - linkedInArgs := &LinkedInProfileArguments{} - if err := unmarshalToStruct(args, linkedInArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal LinkedIn job arguments: %w", err) - } - if err := linkedInArgs.ValidateForJobType(jobType); err != nil { - return nil, 
fmt.Errorf("linkedin job validation failed: %w", err) - } - return linkedInArgs, nil - default: - return nil, fmt.Errorf("unknown linkedin type: %s", minimal.QueryType) +func unmarshalLinkedInArguments(args Args) (*linkedin.Profile, error) { + linkedInArgs := &linkedin.Profile{} + if err := unmarshalToStruct(args, linkedInArgs); err != nil { + return nil, fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } + return linkedInArgs, nil } -func unmarshalRedditArguments(jobType types.JobType, args map[string]any) (*RedditArguments, error) { - redditArgs := &RedditArguments{} +func unmarshalRedditArguments(args Args) (*reddit.Search, error) { + redditArgs := &reddit.Search{} if err := unmarshalToStruct(args, redditArgs); err != nil { - return nil, fmt.Errorf("failed to unmarshal Reddit job arguments: %w", err) - } - - // If no QueryType is specified, use the default capability for this job type - if redditArgs.QueryType == "" { - if defaultCap, exists := types.JobDefaultCapabilityMap[jobType]; exists { - redditArgs.QueryType = types.RedditQueryType(defaultCap) - } + return nil, fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } + return redditArgs, nil +} - // Perform job-type-specific validation for Reddit - if err := redditArgs.ValidateForJobType(jobType); err != nil { - return nil, fmt.Errorf("reddit job validation failed: %w", err) +func unmarshalTelemetryArguments(args Args) (*telemetry.Telemetry, error) { + telemetryArgs := &telemetry.Telemetry{} + if err := unmarshalToStruct(args, telemetryArgs); err != nil { + return nil, fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } - - return redditArgs, nil + return telemetryArgs, nil } // unmarshalToStruct converts a map[string]any to a struct using JSON marshal/unmarshal -// This provides the same functionality as the existing JobArguments.Unmarshal methods -func unmarshalToStruct(args map[string]any, target any) error { +// This provides the same functionality as the existing JobArgument.Unmarshal methods +func unmarshalToStruct(args Args, target any) error { // Use JSON marshal/unmarshal for conversion - this triggers our custom UnmarshalJSON methods data, err := json.Marshal(args) if err != nil { - return fmt.Errorf("failed to marshal arguments: %w", err) + return fmt.Errorf("%w: %w", ErrFailedToMarshal, err) } if err := json.Unmarshal(data, target); err != nil { - return fmt.Errorf("failed to unmarshal arguments: %w", err) + return fmt.Errorf("%w: %w", ErrFailedToUnmarshal, err) } return nil diff --git a/api/args/unmarshaller_test.go b/api/args/unmarshaller_test.go index d9d168f..b250cc7 100644 --- a/api/args/unmarshaller_test.go +++ b/api/args/unmarshaller_test.go @@ -5,6 +5,11 @@ import ( . 
"github.com/onsi/gomega" "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/reddit" + "github.com/masa-finance/tee-worker/api/args/telemetry" + "github.com/masa-finance/tee-worker/api/args/tiktok" + "github.com/masa-finance/tee-worker/api/args/twitter" + "github.com/masa-finance/tee-worker/api/args/web" "github.com/masa-finance/tee-worker/api/types" ) @@ -18,7 +23,7 @@ var _ = Describe("Unmarshaller", func() { } jobArgs, err := args.UnmarshalJobArguments(types.WebJob, argsMap) Expect(err).ToNot(HaveOccurred()) - webArgs, ok := jobArgs.(*args.WebArguments) + webArgs, ok := jobArgs.(*web.Page) Expect(ok).To(BeTrue()) Expect(webArgs.URL).To(Equal("https://example.com")) Expect(webArgs.MaxDepth).To(Equal(2)) @@ -28,12 +33,13 @@ var _ = Describe("Unmarshaller", func() { Context("with a TiktokJob", func() { It("should unmarshal the arguments correctly", func() { argsMap := map[string]any{ + "type": "transcription", "video_url": "https://www.tiktok.com/@user/video/123", "language": "en-us", } jobArgs, err := args.UnmarshalJobArguments(types.TiktokJob, argsMap) Expect(err).ToNot(HaveOccurred()) - tiktokArgs, ok := jobArgs.(*args.TikTokTranscriptionArguments) + tiktokArgs, ok := jobArgs.(*tiktok.Transcription) Expect(ok).To(BeTrue()) Expect(tiktokArgs.VideoURL).To(Equal("https://www.tiktok.com/@user/video/123")) Expect(tiktokArgs.Language).To(Equal("en-us")) @@ -49,21 +55,12 @@ var _ = Describe("Unmarshaller", func() { } jobArgs, err := args.UnmarshalJobArguments(types.TwitterJob, argsMap) Expect(err).ToNot(HaveOccurred()) - twitterArgs, ok := jobArgs.(*args.TwitterSearchArguments) + twitterArgs, ok := jobArgs.(*twitter.Search) Expect(ok).To(BeTrue()) - Expect(twitterArgs.QueryType).To(Equal(types.CapSearchByQuery)) + Expect(twitterArgs.Type).To(Equal(types.CapSearchByQuery)) Expect(twitterArgs.Query).To(Equal("golang")) Expect(twitterArgs.Count).To(Equal(10)) }) - - It("should set the default capability for TwitterApifyJob", func() { - argsMap := map[string]any{"query": "masa-finance"} - jobArgs, err := args.UnmarshalJobArguments(types.TwitterApifyJob, argsMap) - Expect(err).ToNot(HaveOccurred()) - twitterArgs, ok := jobArgs.(*args.TwitterSearchArguments) - Expect(ok).To(BeTrue()) - Expect(twitterArgs.GetCapability()).To(Equal(types.CapGetFollowers)) - }) }) Context("with a RedditJob", func() { @@ -75,18 +72,18 @@ var _ = Describe("Unmarshaller", func() { } jobArgs, err := args.UnmarshalJobArguments(types.RedditJob, argsMap) Expect(err).ToNot(HaveOccurred()) - redditArgs, ok := jobArgs.(*args.RedditArguments) + redditArgs, ok := jobArgs.(*reddit.Search) Expect(ok).To(BeTrue()) - Expect(redditArgs.QueryType).To(Equal(types.RedditQueryType("searchposts"))) + Expect(redditArgs.Type).To(Equal(types.CapSearchPosts)) }) }) Context("with a TelemetryJob", func() { - It("should return a TelemetryJobArguments struct", func() { + It("should return a TelemetryArguments struct", func() { argsMap := map[string]any{} jobArgs, err := args.UnmarshalJobArguments(types.TelemetryJob, argsMap) Expect(err).ToNot(HaveOccurred()) - _, ok := jobArgs.(*args.TelemetryJobArguments) + _, ok := jobArgs.(*telemetry.Arguments) Expect(ok).To(BeTrue()) }) }) diff --git a/api/args/web.go b/api/args/web.go deleted file mode 100644 index dac4642..0000000 --- a/api/args/web.go +++ /dev/null @@ -1,112 +0,0 @@ -package args - -import ( - "encoding/json" - "errors" - "fmt" - "net/url" - - teetypes "github.com/masa-finance/tee-worker/api/types" -) - -var ( - ErrWebURLRequired = errors.New("url is 
required") - ErrWebURLInvalid = errors.New("invalid URL format") - ErrWebURLSchemeMissing = errors.New("url must include a scheme (http:// or https://)") - ErrWebMaxDepth = errors.New("max depth must be non-negative") - ErrWebMaxPages = errors.New("max pages must be at least 1") -) - -const ( - WebDefaultMaxPages = 1 - WebDefaultMethod = "GET" - WebDefaultRespectRobotsTxtFile = false - WebDefaultSaveMarkdown = true -) - -type WebArguments struct { - QueryType teetypes.WebQueryType `json:"type"` - URL string `json:"url"` - MaxDepth int `json:"max_depth"` - MaxPages int `json:"max_pages"` -} - -// UnmarshalJSON implements custom JSON unmarshaling with validation -func (w *WebArguments) UnmarshalJSON(data []byte) error { - // Prevent infinite recursion (you call json.Unmarshal which then calls `UnmarshalJSON`, which then calls `json.Unmarshal`...) - type Alias WebArguments - aux := &struct { - *Alias - }{ - Alias: (*Alias)(w), - } - - if err := json.Unmarshal(data, aux); err != nil { - return fmt.Errorf("failed to unmarshal web arguments: %w", err) - } - - w.setDefaultValues() - - return w.Validate() -} - -func (w *WebArguments) setDefaultValues() { - if w.MaxPages == 0 { - w.MaxPages = WebDefaultMaxPages - } -} - -// Validate validates the Web arguments -func (w *WebArguments) Validate() error { - if w.URL == "" { - return ErrWebURLRequired - } - - // Validate URL format - parsedURL, err := url.Parse(w.URL) - if err != nil { - return fmt.Errorf("%w: %v", ErrWebURLInvalid, err) - } - - // Ensure URL has a scheme - if parsedURL.Scheme == "" { - return ErrWebURLSchemeMissing - } - - if w.MaxDepth < 0 { - return fmt.Errorf("%w: got %v", ErrWebMaxDepth, w.MaxDepth) - } - - if w.MaxPages < 1 { - return fmt.Errorf("%w: got %v", ErrWebMaxPages, w.MaxPages) - } - - return nil -} - -// ValidateForJobType validates Web arguments for a specific job type -func (w *WebArguments) ValidateForJobType(jobType teetypes.JobType) error { - if err := w.Validate(); err != nil { - return err - } - - // Validate capability against job-specific capabilities - return jobType.ValidateCapability(w.GetCapability()) -} - -// GetCapability returns the capability for web operations (always scraper) -func (w *WebArguments) GetCapability() teetypes.Capability { - return teetypes.CapScraper -} - -func (w WebArguments) ToWebScraperRequest() teetypes.WebScraperRequest { - return teetypes.WebScraperRequest{ - StartUrls: []teetypes.WebStartURL{ - {URL: w.URL, Method: WebDefaultMethod}, - }, - MaxCrawlDepth: w.MaxDepth, - MaxCrawlPages: w.MaxPages, - RespectRobotsTxtFile: WebDefaultRespectRobotsTxtFile, - SaveMarkdown: WebDefaultSaveMarkdown, - } -} diff --git a/api/args/web/page/page.go b/api/args/web/page/page.go new file mode 100644 index 0000000..8356a28 --- /dev/null +++ b/api/args/web/page/page.go @@ -0,0 +1,113 @@ +package page + +import ( + "encoding/json" + "errors" + "fmt" + "net/url" + + "github.com/masa-finance/tee-worker/api/args/base" + "github.com/masa-finance/tee-worker/api/types" +) + +var ( + ErrURLRequired = errors.New("url is required") + ErrURLInvalid = errors.New("invalid URL format") + ErrURLSchemeMissing = errors.New("url must include a scheme (http:// or https://)") + ErrMaxDepth = errors.New("max depth must be non-negative") + ErrMaxPages = errors.New("max pages must be at least 1") + ErrUnmarshalling = errors.New("failed to unmarshal web page arguments") +) + +const ( + DefaultMaxPages = 1 + DefaultMethod = "GET" + DefaultRespectRobotsTxtFile = false + DefaultSaveMarkdown = true +) + +// Verify interface 
implementation
+var _ base.JobArgument = (*Arguments)(nil)
+
+type Arguments struct {
+ Type types.Capability `json:"type"`
+ URL string `json:"url"`
+ MaxDepth int `json:"max_depth"`
+ MaxPages int `json:"max_pages"`
+}
+
+func (w *Arguments) UnmarshalJSON(data []byte) error {
+ type Alias Arguments
+ aux := &struct{ *Alias }{Alias: (*Alias)(w)}
+ if err := json.Unmarshal(data, aux); err != nil {
+ return fmt.Errorf("%w: %w", ErrUnmarshalling, err)
+ }
+ w.SetDefaultValues()
+ return w.Validate()
+}
+
+func (w *Arguments) SetDefaultValues() {
+ if w.MaxPages == 0 {
+ w.MaxPages = DefaultMaxPages
+ }
+}
+
+// Validate validates the arguments
+func (w *Arguments) Validate() error {
+ err := w.ValidateCapability(types.WebJob)
+ if err != nil {
+ return err
+ }
+
+ if w.URL == "" {
+ return ErrURLRequired
+ }
+
+ // Validate URL format
+ parsedURL, err := url.Parse(w.URL)
+ if err != nil {
+ return fmt.Errorf("%w: %v", ErrURLInvalid, err)
+ }
+
+ // Ensure URL has a scheme
+ if parsedURL.Scheme == "" {
+ return ErrURLSchemeMissing
+ }
+
+ if w.MaxDepth < 0 {
+ return fmt.Errorf("%w: got %v", ErrMaxDepth, w.MaxDepth)
+ }
+
+ if w.MaxPages < 1 {
+ return fmt.Errorf("%w: got %v", ErrMaxPages, w.MaxPages)
+ }
+
+ return nil
+}
+
+func (w *Arguments) GetCapability() types.Capability {
+ return w.Type
+}
+
+func (w *Arguments) ValidateCapability(jobType types.JobType) error {
+ return jobType.ValidateCapability(&w.Type)
+}
+
+func (w Arguments) ToScraperRequest() types.WebScraperRequest {
+ return types.WebScraperRequest{
+ StartUrls: []types.WebStartURL{
+ {URL: w.URL, Method: DefaultMethod},
+ },
+ MaxCrawlDepth: w.MaxDepth,
+ MaxCrawlPages: w.MaxPages,
+ RespectRobotsTxtFile: DefaultRespectRobotsTxtFile,
+ SaveMarkdown: DefaultSaveMarkdown,
+ }
+}
+
+func NewArguments() Arguments {
+ args := Arguments{}
+ args.SetDefaultValues()
+ args.Validate() // Called for its side effect of setting the default capability via ValidateCapability; the "url is required" error is expected here and intentionally ignored
+ return args
+}
diff --git a/api/args/args_suite_test.go b/api/args/web/page/page_suite_test.go
similarity index 90%
rename from api/args/args_suite_test.go
rename to api/args/web/page/page_suite_test.go
index 861e0bf..8da7b90 100644
--- a/api/args/args_suite_test.go
+++ b/api/args/web/page/page_suite_test.go
@@ -1,4 +1,4 @@
-package args_test
+package page_test
 import (
 "testing"
diff --git a/api/args/web_test.go b/api/args/web/page/page_test.go
similarity index 54%
rename from api/args/web_test.go
rename to api/args/web/page/page_test.go
index fecf831..523c647 100644
--- a/api/args/web_test.go
+++ b/api/args/web/page/page_test.go
@@ -1,4 +1,4 @@
-package args_test
+package page_test
 import (
 "encoding/json"
@@ -7,19 +7,17 @@ import (
 . "github.com/onsi/ginkgo/v2"
 . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/web/page" "github.com/masa-finance/tee-worker/api/types" ) var _ = Describe("WebArguments", func() { Describe("Marshalling and unmarshalling", func() { It("should set default values", func() { - webArgs := args.WebArguments{ - QueryType: types.WebScraper, - URL: "https://example.com", - MaxDepth: 0, - MaxPages: 0, - } + webArgs := page.NewArguments() + webArgs.URL = "https://example.com" + webArgs.MaxDepth = 0 + webArgs.MaxPages = 0 jsonData, err := json.Marshal(webArgs) Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &webArgs) @@ -28,12 +26,10 @@ var _ = Describe("WebArguments", func() { }) It("should override default values", func() { - webArgs := args.WebArguments{ - QueryType: types.WebScraper, - URL: "https://example.com", - MaxDepth: 2, - MaxPages: 5, - } + webArgs := page.NewArguments() + webArgs.URL = "https://example.com" + webArgs.MaxDepth = 2 + webArgs.MaxPages = 5 jsonData, err := json.Marshal(webArgs) Expect(err).ToNot(HaveOccurred()) err = json.Unmarshal([]byte(jsonData), &webArgs) @@ -42,110 +38,94 @@ var _ = Describe("WebArguments", func() { }) It("should fail unmarshal when url is missing", func() { - var webArgs args.WebArguments + var webArgs page.Arguments jsonData := []byte(`{"type":"scraper","max_depth":1,"max_pages":1}`) err := json.Unmarshal(jsonData, &webArgs) - Expect(errors.Is(err, args.ErrWebURLRequired)).To(BeTrue()) + Expect(errors.Is(err, page.ErrURLRequired)).To(BeTrue()) }) }) Describe("Validation", func() { It("should succeed with valid arguments", func() { - webArgs := &args.WebArguments{ - QueryType: types.WebScraper, - URL: "https://example.com", - MaxDepth: 2, - MaxPages: 3, - } + webArgs := page.NewArguments() + webArgs.URL = "https://example.com" + webArgs.MaxDepth = 2 + webArgs.MaxPages = 3 err := webArgs.Validate() Expect(err).ToNot(HaveOccurred()) }) It("should fail when url is missing", func() { - webArgs := &args.WebArguments{ - QueryType: types.WebScraper, - MaxDepth: 0, - MaxPages: 1, - } + webArgs := page.NewArguments() + webArgs.MaxDepth = 0 + webArgs.MaxPages = 1 err := webArgs.Validate() - Expect(errors.Is(err, args.ErrWebURLRequired)).To(BeTrue()) + Expect(errors.Is(err, page.ErrURLRequired)).To(BeTrue()) }) It("should fail with an invalid URL format", func() { - webArgs := &args.WebArguments{ - QueryType: types.WebScraper, - URL: "http:// invalid.com", - MaxDepth: 0, - MaxPages: 1, - } + webArgs := page.NewArguments() + webArgs.URL = "http:// invalid.com" + webArgs.MaxDepth = 0 + webArgs.MaxPages = 1 err := webArgs.Validate() - Expect(errors.Is(err, args.ErrWebURLInvalid)).To(BeTrue()) + Expect(errors.Is(err, page.ErrURLInvalid)).To(BeTrue()) Expect(err.Error()).To(ContainSubstring("invalid URL format")) }) It("should fail when scheme is missing", func() { - webArgs := &args.WebArguments{ - QueryType: types.WebScraper, - URL: "example.com", - MaxDepth: 0, - MaxPages: 1, - } + webArgs := page.NewArguments() + webArgs.URL = "example.com" + webArgs.MaxDepth = 0 + webArgs.MaxPages = 1 err := webArgs.Validate() - Expect(errors.Is(err, args.ErrWebURLSchemeMissing)).To(BeTrue()) + Expect(errors.Is(err, page.ErrURLSchemeMissing)).To(BeTrue()) }) It("should fail when max depth is negative", func() { - webArgs := &args.WebArguments{ - QueryType: types.WebScraper, - URL: "https://example.com", - MaxDepth: -1, - MaxPages: 1, - } + webArgs := page.NewArguments() + webArgs.URL = "https://example.com" + 
webArgs.MaxDepth = -1 + webArgs.MaxPages = 1 err := webArgs.Validate() - Expect(errors.Is(err, args.ErrWebMaxDepth)).To(BeTrue()) + Expect(errors.Is(err, page.ErrMaxDepth)).To(BeTrue()) Expect(err.Error()).To(ContainSubstring("got -1")) }) It("should fail when max pages is less than 1", func() { - webArgs := &args.WebArguments{ - QueryType: types.WebScraper, - URL: "https://example.com", - MaxDepth: 0, - MaxPages: 0, - } + webArgs := page.NewArguments() + webArgs.URL = "https://example.com" + webArgs.MaxDepth = 0 + webArgs.MaxPages = 0 err := webArgs.Validate() - Expect(errors.Is(err, args.ErrWebMaxPages)).To(BeTrue()) + Expect(errors.Is(err, page.ErrMaxPages)).To(BeTrue()) Expect(err.Error()).To(ContainSubstring("got 0")) }) }) Describe("Job capability", func() { It("should return the scraper capability", func() { - webArgs := &args.WebArguments{} + webArgs := page.NewArguments() Expect(webArgs.GetCapability()).To(Equal(types.CapScraper)) }) It("should validate capability for WebJob", func() { - webArgs := &args.WebArguments{ - QueryType: types.WebScraper, - URL: "https://example.com", - MaxDepth: 1, - MaxPages: 1, - } - err := webArgs.ValidateForJobType(types.WebJob) + webArgs := page.NewArguments() + webArgs.URL = "https://example.com" + webArgs.MaxDepth = 1 + webArgs.MaxPages = 1 + err := webArgs.ValidateCapability(types.WebJob) Expect(err).ToNot(HaveOccurred()) }) }) Describe("ToWebScraperRequest", func() { It("should map fields correctly", func() { - webArgs := args.WebArguments{ - QueryType: types.WebScraper, - URL: "https://example.com", - MaxDepth: 2, - MaxPages: 3, - } - req := webArgs.ToWebScraperRequest() + webArgs := page.NewArguments() + webArgs.URL = "https://example.com" + webArgs.MaxDepth = 2 + webArgs.MaxPages = 3 + req := webArgs.ToScraperRequest() Expect(req.StartUrls).To(HaveLen(1)) Expect(req.StartUrls[0].URL).To(Equal("https://example.com")) Expect(req.StartUrls[0].Method).To(Equal("GET")) diff --git a/api/args/web/web.go b/api/args/web/web.go new file mode 100644 index 0000000..5f89720 --- /dev/null +++ b/api/args/web/web.go @@ -0,0 +1,7 @@ +package web + +import ( + "github.com/masa-finance/tee-worker/api/args/web/page" +) + +type Page = page.Arguments diff --git a/api/types/jobs.go b/api/types/jobs.go index a48dfa2..c28dff7 100644 --- a/api/types/jobs.go +++ b/api/types/jobs.go @@ -10,10 +10,13 @@ import ( ) type JobType string +type Capability string + +type WorkerCapabilities map[JobType][]Capability -type JobArguments map[string]interface{} +type JobArguments map[string]any -func (ja JobArguments) Unmarshal(i interface{}) error { +func (ja JobArguments) Unmarshal(i any) error { dat, err := json.Marshal(ja) if err != nil { return err @@ -35,24 +38,32 @@ func (j Job) String() string { return fmt.Sprintf("UUID: %s Type: %s Arguments: %s", j.UUID, j.Type, j.Arguments) } -type Capability string -type WorkerCapabilities map[JobType][]Capability - // String returns the string representation of the JobType func (j JobType) String() string { return string(j) } // ValidateCapability validates that a capability is supported for this job type -func (j JobType) ValidateCapability(capability Capability) error { +// If the capability is CapEmpty, it will be set to the default capability for the job type +func (j JobType) ValidateCapability(capability *Capability) error { + // Set default capability if empty + if *capability == CapEmpty { + defaultCap, exists := JobDefaultCapabilityMap[j] + if !exists { + return fmt.Errorf("no default capability configured for job type: %s", 
j) + } + *capability = defaultCap + } + + // Validate the capability validCaps, exists := JobCapabilityMap[j] if !exists { return fmt.Errorf("unknown job type: %s", j) } - if !slices.Contains(validCaps, capability) { + if !slices.Contains(validCaps, *capability) { return fmt.Errorf("capability '%s' is not valid for job type '%s'. valid capabilities: %v", - capability, j, validCaps) + *capability, j, validCaps) } return nil @@ -69,39 +80,42 @@ func combineCapabilities(capSlices ...[]Capability) []Capability { // Job type constants - centralized from tee-indexer and tee-worker const ( - WebJob JobType = "web" - TelemetryJob JobType = "telemetry" - TiktokJob JobType = "tiktok" - TwitterJob JobType = "twitter" // General Twitter scraping (uses best available auth for capability) - TwitterCredentialJob JobType = "twitter-credential" // Twitter scraping with credentials - TwitterApiJob JobType = "twitter-api" // Twitter scraping with API keys - TwitterApifyJob JobType = "twitter-apify" // Twitter scraping with Apify - LinkedInJob JobType = "linkedin" // LinkedIn scraping, keeping for unmarshalling logic - RedditJob JobType = "reddit" // Reddit scraping with Apify + WebJob JobType = "web" + TelemetryJob JobType = "telemetry" + TiktokJob JobType = "tiktok" + TwitterJob JobType = "twitter" + LinkedInJob JobType = "linkedin" + RedditJob JobType = "reddit" ) // Capability constants - typed to prevent typos and enable discoverability const ( - CapScraper Capability = "scraper" - CapTelemetry Capability = "telemetry" - CapTranscription Capability = "transcription" - CapSearchByQuery Capability = "searchbyquery" - CapSearchByTrending Capability = "searchbytrending" + + // Twitter (credential-based) capabilities + CapSearchByQuery Capability = "searchbyquery" + CapSearchByProfile Capability = "searchbyprofile" + CapGetById Capability = "getbyid" + CapGetReplies Capability = "getreplies" + CapGetRetweeters Capability = "getretweeters" + CapGetMedia Capability = "getmedia" + CapGetProfileById Capability = "getprofilebyid" + CapGetTrends Capability = "gettrends" + CapGetSpace Capability = "getspace" + CapGetProfile Capability = "getprofile" + CapGetTweets Capability = "gettweets" + + // Twitter (apify-based) capabilities + CapGetFollowing Capability = "getfollowing" + CapGetFollowers Capability = "getfollowers" + + // Twitter (api-based) capabilities CapSearchByFullArchive Capability = "searchbyfullarchive" - CapSearchByProfile Capability = "searchbyprofile" - CapGetById Capability = "getbyid" - CapGetReplies Capability = "getreplies" - CapGetRetweeters Capability = "getretweeters" - CapGetTweets Capability = "gettweets" - CapGetMedia Capability = "getmedia" - CapGetHomeTweets Capability = "gethometweets" - CapGetForYouTweets Capability = "getforyoutweets" - CapGetProfileById Capability = "getprofilebyid" - CapGetTrends Capability = "gettrends" - CapGetFollowing Capability = "getfollowing" - CapGetFollowers Capability = "getfollowers" - CapGetSpace Capability = "getspace" - CapGetProfile Capability = "getprofile" + + CapScraper Capability = "scraper" + CapSearchByTrending Capability = "searchbytrending" + CapTelemetry Capability = "telemetry" + CapTranscription Capability = "transcription" + // Reddit capabilities CapScrapeUrls Capability = "scrapeurls" CapSearchPosts Capability = "searchposts" @@ -122,21 +136,13 @@ var ( TiktokJob: AlwaysAvailableTiktokCaps, } - // TwitterCredentialCaps are all Twitter capabilities available with credential-based auth - TwitterCredentialCaps = []Capability{ - 
CapSearchByQuery, CapSearchByProfile, - CapGetById, CapGetReplies, CapGetRetweeters, CapGetTweets, CapGetMedia, - CapGetHomeTweets, CapGetForYouTweets, CapGetProfileById, - CapGetTrends, CapGetFollowing, CapGetFollowers, CapGetSpace, - CapEmpty, + // Twitter capabilities + TwitterCaps = []Capability{ + CapSearchByQuery, CapSearchByProfile, CapSearchByFullArchive, + CapGetById, CapGetReplies, CapGetRetweeters, CapGetTweets, CapGetMedia, CapGetProfileById, + CapGetTrends, CapGetFollowing, CapGetFollowers, CapGetSpace, CapEmpty, } - // TwitterAPICaps are basic Twitter capabilities available with API keys - TwitterAPICaps = []Capability{CapSearchByQuery, CapGetById, CapGetProfileById, CapEmpty} - - // TwitterApifyCaps are Twitter capabilities available with Apify - TwitterApifyCaps = []Capability{CapGetFollowers, CapGetFollowing, CapEmpty} - // TiktokSearchCaps are Tiktok capabilities available with Apify TiktokSearchCaps = []Capability{CapSearchByQuery, CapSearchByTrending} @@ -152,19 +158,8 @@ var ( // JobCapabilityMap defines which capabilities are valid for each job type var JobCapabilityMap = map[JobType][]Capability{ - // Twitter job types and their valid capabilities - TwitterJob: combineCapabilities( - TwitterCredentialCaps, - TwitterAPICaps, - TwitterApifyCaps, - []Capability{CapSearchByFullArchive}, // Elevated API capability - ), - TwitterCredentialJob: TwitterCredentialCaps, - TwitterApiJob: combineCapabilities( - TwitterAPICaps, - []Capability{CapSearchByFullArchive}, // Elevated API capability - ), - TwitterApifyJob: TwitterApifyCaps, + // Twitter job capabilities + TwitterJob: TwitterCaps, // Web job capabilities WebJob: WebCaps, @@ -187,15 +182,12 @@ var JobCapabilityMap = map[JobType][]Capability{ // if no capability is specified, use the default capability for the job type var JobDefaultCapabilityMap = map[JobType]Capability{ - TwitterJob: CapSearchByQuery, - TwitterCredentialJob: CapSearchByQuery, - TwitterApiJob: CapSearchByQuery, - TwitterApifyJob: CapGetFollowers, - WebJob: CapScraper, - TiktokJob: CapTranscription, - RedditJob: CapScrapeUrls, - TelemetryJob: CapTelemetry, - LinkedInJob: CapSearchByProfile, + TwitterJob: CapSearchByQuery, + WebJob: CapScraper, + TiktokJob: CapTranscription, + RedditJob: CapScrapeUrls, + TelemetryJob: CapTelemetry, + LinkedInJob: CapSearchByProfile, } // JobResponse represents a response to a job submission diff --git a/api/types/linkedin/linkedin.go b/api/types/linkedin/linkedin.go index 4f1dbf5..c0d3b22 100644 --- a/api/types/linkedin/linkedin.go +++ b/api/types/linkedin/linkedin.go @@ -13,6 +13,7 @@ type LinkedInConfig struct { Seniorities *seniorities.SenioritiesConfig Functions *functions.FunctionsConfig Industries *industries.IndustriesConfig + Profile *profile.Profile } var LinkedIn = LinkedInConfig{ @@ -20,6 +21,7 @@ var LinkedIn = LinkedInConfig{ Seniorities: &seniorities.Seniorities, Functions: &functions.Functions, Industries: &industries.Industries, + Profile: &profile.Profile{}, } -type Profile = *profile.Profile +type Profile = profile.Profile diff --git a/api/types/reddit.go b/api/types/reddit.go index f342b42..dec89e4 100644 --- a/api/types/reddit.go +++ b/api/types/reddit.go @@ -8,16 +8,7 @@ import ( "github.com/masa-finance/tee-worker/pkg/util" ) -type RedditQueryType string - -const ( - RedditScrapeUrls RedditQueryType = "scrapeurls" - RedditSearchPosts RedditQueryType = "searchposts" - RedditSearchUsers RedditQueryType = "searchusers" - RedditSearchCommunities RedditQueryType = "searchcommunities" -) - -var 
AllRedditQueryTypes = util.NewSet(RedditScrapeUrls, RedditSearchPosts, RedditSearchUsers, RedditSearchCommunities) +var AllRedditQueryTypes = util.NewSet(CapScrapeUrls, CapSearchPosts, CapSearchUsers, CapSearchCommunities) type RedditSortType string diff --git a/internal/apify/actors.go b/internal/apify/actors.go index 75631a1..a349e70 100644 --- a/internal/apify/actors.go +++ b/internal/apify/actors.go @@ -58,8 +58,8 @@ var Actors = []ActorConfig{ { ActorId: ActorIds.TwitterFollowers, DefaultInput: defaultActorInput{"maxFollowers": 200, "maxFollowings": 200}, - Capabilities: types.TwitterApifyCaps, - JobType: types.TwitterApifyJob, + Capabilities: []types.Capability{types.CapGetFollowing, types.CapGetFollowers}, + JobType: types.TwitterJob, }, { ActorId: ActorIds.WebScraper, diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index b1fa4e9..707f3f6 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -42,22 +42,39 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface hasApifyKey := hasValidApifyKey(apifyApiKey) hasLLMKey := geminiApiKey.IsValid() || claudeApiKey.IsValid() - // Add Twitter-specific capabilities based on available authentication - if hasAccounts { - capabilities[types.TwitterCredentialJob] = types.TwitterCredentialCaps - } + // Add Twitter capabilities based on available authentication + if hasAccounts || hasApiKeys { + var twitterCaps []types.Capability - if hasApiKeys { - // Start with basic API capabilities - apiCaps := make([]types.Capability, len(types.TwitterAPICaps)) - copy(apiCaps, types.TwitterAPICaps) + // Add credential-based capabilities if we have accounts + if hasAccounts { + twitterCaps = append(twitterCaps, + types.CapSearchByQuery, + types.CapSearchByProfile, + types.CapGetById, + types.CapGetReplies, + types.CapGetRetweeters, + types.CapGetMedia, + types.CapGetProfileById, + types.CapGetTrends, + types.CapGetSpace, + types.CapGetProfile, + types.CapGetTweets, + ) + } - // Check for elevated API keys and add searchbyfullarchive capability - if hasElevatedApiKey(apiKeys) { - apiCaps = append(apiCaps, types.CapSearchByFullArchive) + // Add API-based capabilities if we have API keys + if hasApiKeys { + // Check for elevated API capabilities + if hasElevatedApiKey(apiKeys) { + twitterCaps = append(twitterCaps, types.CapSearchByFullArchive) + } } - capabilities[types.TwitterApiJob] = apiCaps + // Only add capabilities if we have any supported capabilities + if len(twitterCaps) > 0 { + capabilities[types.TwitterJob] = twitterCaps + } } if hasApifyKey { @@ -94,32 +111,6 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface } } - // Add general TwitterJob capability if any Twitter auth is available - // TODO: this will get cleaned up with unique twitter capabilities - if hasAccounts || hasApiKeys || hasApifyKey { - var twitterJobCaps []types.Capability - // Use the most comprehensive capabilities available - if hasAccounts { - twitterJobCaps = types.TwitterCredentialCaps - } else { - // Use API capabilities if we only have keys - twitterJobCaps = make([]types.Capability, len(types.TwitterAPICaps)) - copy(twitterJobCaps, types.TwitterAPICaps) - - // Check for elevated API keys and add searchbyfullarchive capability - if hasElevatedApiKey(apiKeys) { - twitterJobCaps = append(twitterJobCaps, types.CapSearchByFullArchive) - } - } - - // Add Apify capabilities if available - if hasApifyKey { - twitterJobCaps = append(twitterJobCaps, 
types.TwitterApifyCaps...) - } - - capabilities[types.TwitterJob] = twitterJobCaps - } - return capabilities } diff --git a/internal/capabilities/detector_test.go b/internal/capabilities/detector_test.go index 877892e..8febcfc 100644 --- a/internal/capabilities/detector_test.go +++ b/internal/capabilities/detector_test.go @@ -72,36 +72,10 @@ var _ = Describe("DetectCapabilities", func() { "twitter_accounts": []string{"account1", "account2"}, }, nil, - types.WorkerCapabilities{ - types.TelemetryJob: {types.CapTelemetry}, - types.TiktokJob: {types.CapTranscription}, - types.TwitterCredentialJob: types.TwitterCredentialCaps, - types.TwitterJob: types.TwitterCredentialCaps, - }, - ), - Entry("With Twitter API keys - adds API capabilities", - config.JobConfiguration{ - "twitter_api_keys": []string{"key1", "key2"}, - }, - nil, - types.WorkerCapabilities{ - types.TelemetryJob: {types.CapTelemetry}, - types.TiktokJob: {types.CapTranscription}, - types.TwitterApiJob: types.TwitterAPICaps, - types.TwitterJob: types.TwitterAPICaps, - }, - ), - Entry("With mock elevated Twitter API keys - only basic capabilities detected", - config.JobConfiguration{ - "twitter_api_keys": []string{"Bearer abcd1234-ELEVATED"}, - }, - nil, types.WorkerCapabilities{ types.TelemetryJob: {types.CapTelemetry}, types.TiktokJob: {types.CapTranscription}, - // Note: Mock elevated keys will be detected as basic since we can't make real API calls in tests - types.TwitterApiJob: types.TwitterAPICaps, - types.TwitterJob: types.TwitterAPICaps, + types.TwitterJob: types.TwitterCaps, }, ), ) @@ -164,7 +138,7 @@ var _ = Describe("DetectCapabilities", func() { Expect(tiktokCaps).To(ContainElement(types.CapSearchByTrending), "expected tiktok to include CapSearchByTrending capability") // Twitter-Apify job should be present with follower/following capabilities - twitterApifyCaps, ok := caps[types.TwitterApifyJob] + twitterApifyCaps, ok := caps[types.TwitterJob] Expect(ok).To(BeTrue(), "expected twitter-apify capabilities to be present") Expect(twitterApifyCaps).To(ContainElement(types.CapGetFollowers), "expected twitter-apify to include CapGetFollowers capability") Expect(twitterApifyCaps).To(ContainElement(types.CapGetFollowing), "expected twitter-apify to include CapGetFollowing capability") diff --git a/internal/config/config.go b/internal/config/config.go index 1be62ab..c5e8938 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -13,7 +13,7 @@ import ( "github.com/joho/godotenv" "github.com/sirupsen/logrus" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/llm/process" ) var ( @@ -347,9 +347,9 @@ type LlmConfig struct { // GetModelAndKey returns the first available model and API key based on which keys are valid func (lc LlmConfig) GetModelAndKey() (model string, key string, err error) { if lc.ClaudeApiKey.IsValid() { - return args.LLMDefaultClaudeModel, string(lc.ClaudeApiKey), nil + return process.DefaultClaudeModel, string(lc.ClaudeApiKey), nil } else if lc.GeminiApiKey.IsValid() { - return args.LLMDefaultGeminiModel, string(lc.GeminiApiKey), nil + return process.DefaultGeminiModel, string(lc.GeminiApiKey), nil } return "", "", errors.New("no valid llm api key found") } diff --git a/internal/jobs/linkedin.go b/internal/jobs/linkedin.go index 4175fff..afe7a91 100644 --- a/internal/jobs/linkedin.go +++ b/internal/jobs/linkedin.go @@ -14,13 +14,13 @@ import ( "github.com/masa-finance/tee-worker/internal/jobs/stats" "github.com/masa-finance/tee-worker/pkg/client" - 
profileArgs "github.com/masa-finance/tee-worker/api/args/linkedin/profile" - profileTypes "github.com/masa-finance/tee-worker/api/types/linkedin/profile" + pArgs "github.com/masa-finance/tee-worker/api/args/linkedin/profile" + pTypes "github.com/masa-finance/tee-worker/api/types/linkedin/profile" ) // LinkedInApifyClient defines the interface for the LinkedIn Apify client to allow mocking in tests type LinkedInApifyClient interface { - SearchProfiles(workerID string, args *profileArgs.Arguments, cursor client.Cursor) ([]*profileTypes.Profile, string, client.Cursor, error) + SearchProfiles(workerID string, args *pArgs.Arguments, cursor client.Cursor) ([]*pTypes.Profile, string, client.Cursor, error) ValidateApiKey() error } @@ -61,7 +61,7 @@ func (ls *LinkedInScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: msg.Error()}, msg } - linkedinArgs, ok := jobArgs.(*profileArgs.Arguments) + linkedinArgs, ok := jobArgs.(*pArgs.Arguments) if !ok { return types.JobResult{Error: "invalid argument type for LinkedIn job"}, errors.New("invalid argument type") } @@ -92,16 +92,3 @@ func (ls *LinkedInScraper) ExecuteJob(j types.Job) (types.JobResult, error) { NextCursor: cursor.String(), }, nil } - -// GetStructuredCapabilities returns the structured capabilities supported by the LinkedIn scraper -// based on the available credentials and API keys -func (ls *LinkedInScraper) GetStructuredCapabilities() types.WorkerCapabilities { - capabilities := make(types.WorkerCapabilities) - - apifyApiKey := ls.configuration.GetString("apify_api_key", "") - if apifyApiKey != "" { - capabilities[types.LinkedInJob] = types.LinkedInCaps - } - - return capabilities -} diff --git a/internal/jobs/linkedin_test.go b/internal/jobs/linkedin_test.go index 634e8da..91cfa6a 100644 --- a/internal/jobs/linkedin_test.go +++ b/internal/jobs/linkedin_test.go @@ -130,7 +130,7 @@ var _ = Describe("LinkedInScraper", func() { Expect(workerID).To(Equal("test-worker")) Expect(args.Query).To(Equal("software engineer")) Expect(args.MaxItems).To(Equal(uint(10))) - Expect(args.QueryType).To(Equal(types.CapSearchByProfile)) + Expect(args.Type).To(Equal(types.CapSearchByProfile)) return expectedProfiles, "dataset-123", client.Cursor("next-cursor"), nil } @@ -244,37 +244,6 @@ var _ = Describe("LinkedInScraper", func() { }) }) - Context("GetStructuredCapabilities", func() { - It("should return LinkedIn capabilities when Apify API key is present", func() { - cfg := config.JobConfiguration{ - "apify_api_key": "test-key", - } - scraper = jobs.NewLinkedInScraper(cfg, statsCollector) - - capabilities := scraper.GetStructuredCapabilities() - Expect(capabilities).To(HaveKey(types.LinkedInJob)) - Expect(capabilities[types.LinkedInJob]).To(ContainElement(types.CapSearchByProfile)) - }) - - It("should return empty capabilities when Apify API key is missing", func() { - cfg := config.JobConfiguration{} - scraper = jobs.NewLinkedInScraper(cfg, statsCollector) - - capabilities := scraper.GetStructuredCapabilities() - Expect(capabilities).NotTo(HaveKey(types.LinkedInJob)) - }) - - It("should return empty capabilities when Apify API key is empty", func() { - cfg := config.JobConfiguration{ - "apify_api_key": "", - } - scraper = jobs.NewLinkedInScraper(cfg, statsCollector) - - capabilities := scraper.GetStructuredCapabilities() - Expect(capabilities).NotTo(HaveKey(types.LinkedInJob)) - }) - }) - // Integration tests that use the real client Context("Integration tests", func() { var apifyKey string @@ -300,12 +269,12 @@ var _ = 
Describe("LinkedInScraper", func() { integrationScraper := jobs.NewLinkedInScraper(cfg, integrationStatsCollector) jobArgs := profileArgs.Arguments{ - QueryType: types.CapSearchByProfile, - Query: "software engineer", - MaxItems: 10, + Type: types.CapSearchByProfile, + Query: "software engineer", + MaxItems: 10, } - // Marshal jobArgs to map[string]any so it can be used as JobArguments + // Marshal jobArgs to map[string]any so it can be used as JobArgument var jobArgsMap map[string]any jobArgsBytes, err := json.Marshal(jobArgs) Expect(err).NotTo(HaveOccurred()) @@ -337,22 +306,7 @@ var _ = Describe("LinkedInScraper", func() { fmt.Println(string(prettyJSON)) }) - It("should expose capabilities only when APIFY_API_KEY is present", func() { - cfg := config.JobConfiguration{ - "apify_api_key": apifyKey, - } - integrationStatsCollector := stats.StartCollector(128, cfg) - integrationScraper := jobs.NewLinkedInScraper(cfg, integrationStatsCollector) - - caps := integrationScraper.GetStructuredCapabilities() - if apifyKey != "" { - Expect(caps[types.LinkedInJob]).NotTo(BeEmpty()) - Expect(caps[types.LinkedInJob]).To(ContainElement(types.CapSearchByProfile)) - } else { - // Expect no capabilities when key is missing - _, ok := caps[types.LinkedInJob] - Expect(ok).To(BeFalse()) - } - }) + // Note: Capability detection is now centralized in capabilities/detector.go + // Individual scraper capability tests have been removed }) }) diff --git a/internal/jobs/linkedinapify/client_test.go b/internal/jobs/linkedinapify/client_test.go index 5910204..3b56f1b 100644 --- a/internal/jobs/linkedinapify/client_test.go +++ b/internal/jobs/linkedinapify/client_test.go @@ -207,7 +207,7 @@ var _ = Describe("LinkedInApifyClient", func() { Expect(err).NotTo(HaveOccurred()) args := profileArgs.Arguments{ - QueryType: types.CapSearchByProfile, + Type: types.CapSearchByProfile, Query: "software engineer", MaxItems: 1, ScraperMode: profile.ScraperModeShort, diff --git a/internal/jobs/llmapify/client.go b/internal/jobs/llmapify/client.go index c367e2b..d7d7309 100644 --- a/internal/jobs/llmapify/client.go +++ b/internal/jobs/llmapify/client.go @@ -5,7 +5,7 @@ import ( "errors" "fmt" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/llm" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/config" @@ -54,7 +54,7 @@ func (c *ApifyClient) ValidateApiKey() error { return c.client.ValidateApiKey() } -func (c *ApifyClient) Process(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) { +func (c *ApifyClient) Process(workerID string, args llm.Process, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) { if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.LLMQueries, 1) } @@ -64,7 +64,7 @@ func (c *ApifyClient) Process(workerID string, args args.LLMProcessorArguments, return nil, client.EmptyCursor, err } - input, err := args.ToLLMProcessorRequest(model, key) + input, err := args.ToProcessorRequest(model, key) if err != nil { return nil, client.EmptyCursor, err } diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index bdd601d..292788e 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -10,7 +10,7 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/llm/process" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/config" @@ -67,7 +67,7 @@ var _ = Describe("LLMApifyClient", func() { Describe("Process", func() { It("should construct the correct actor input", func() { - llmArgs := args.LLMProcessorArguments{ + llmArgs := process.Arguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -88,10 +88,10 @@ var _ = Describe("LLMApifyClient", func() { Expect(request.InputDatasetId).To(Equal("test-dataset-id")) Expect(request.Prompt).To(Equal("test-prompt")) Expect(request.LLMProviderApiKey).To(Equal("test-claude-llm-key")) // should be set from constructor - Expect(request.Model).To(Equal(args.LLMDefaultClaudeModel)) // default model - Expect(request.MultipleColumns).To(Equal(args.LLMDefaultMultipleColumns)) // default value - Expect(request.MaxTokens).To(Equal(args.LLMDefaultMaxTokens)) // default value - Expect(request.Temperature).To(Equal(strconv.FormatFloat(args.LLMDefaultTemperature, 'f', -1, 64))) // default value + Expect(request.Model).To(Equal(process.DefaultClaudeModel)) // default model + Expect(request.MultipleColumns).To(Equal(process.DefaultMultipleColumns)) // default value + Expect(request.MaxTokens).To(Equal(process.DefaultMaxTokens)) // default value + Expect(request.Temperature).To(Equal(strconv.FormatFloat(process.DefaultTemperature, 'f', -1, 64))) // default value return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil } @@ -106,7 +106,7 @@ var _ = Describe("LLMApifyClient", func() { return nil, "", expectedErr } - llmArgs := args.LLMProcessorArguments{ + llmArgs := process.Arguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -125,7 +125,7 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - llmArgs := args.LLMProcessorArguments{ + llmArgs := process.Arguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -147,7 +147,7 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - llmArgs := args.LLMProcessorArguments{ + llmArgs := process.Arguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -174,7 +174,7 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - llmArgs := args.LLMProcessorArguments{ + llmArgs := process.Arguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", } @@ -186,7 +186,7 @@ var _ = Describe("LLMApifyClient", func() { }) It("should use custom values when provided", func() { - llmArgs := args.LLMProcessorArguments{ + llmArgs := process.Arguments{ DatasetId: "test-dataset-id", Prompt: "test-prompt", MaxTokens: 500, @@ -257,7 +257,7 @@ var _ = Describe("LLMApifyClient", func() { realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: config.LlmApiKey(geminiKey)}, nil) Expect(err).NotTo(HaveOccurred()) - llmArgs := args.LLMProcessorArguments{ + llmArgs := process.Arguments{ DatasetId: "V6tyuuZIgfiETl1cl", Prompt: "summarize the content of this webpage ${markdown}", } diff --git a/internal/jobs/reddit.go b/internal/jobs/reddit.go index 5180d73..dc215da 100644 --- a/internal/jobs/reddit.go +++ b/internal/jobs/reddit.go @@ -10,6 +10,7 @@ import ( "github.com/sirupsen/logrus" "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/reddit/search" 
"github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/redditapify" @@ -58,7 +59,7 @@ func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) { } // Type assert to Reddit arguments - redditArgs, ok := jobArgs.(*args.RedditArguments) + redditArgs, ok := jobArgs.(*search.Arguments) if !ok { return types.JobResult{Error: "invalid argument type for Reddit job"}, errors.New("invalid argument type") } @@ -72,8 +73,8 @@ func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) { commonArgs := redditapify.CommonArgs{} commonArgs.CopyFromArgs(redditArgs) - switch redditArgs.QueryType { - case types.RedditScrapeUrls: + switch redditArgs.Type { + case types.CapScrapeUrls: urls := make([]types.RedditStartURL, 0, len(redditArgs.URLs)) for _, u := range redditArgs.URLs { urls = append(urls, types.RedditStartURL{ @@ -85,20 +86,20 @@ func (r *RedditScraper) ExecuteJob(j types.Job) (types.JobResult, error) { resp, cursor, err := redditClient.ScrapeUrls(j.WorkerID, urls, redditArgs.After, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) return processRedditResponse(j, resp, cursor, err) - case types.RedditSearchUsers: + case types.CapSearchUsers: resp, cursor, err := redditClient.SearchUsers(j.WorkerID, redditArgs.Queries, redditArgs.SkipPosts, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) return processRedditResponse(j, resp, cursor, err) - case types.RedditSearchPosts: + case types.CapSearchPosts: resp, cursor, err := redditClient.SearchPosts(j.WorkerID, redditArgs.Queries, redditArgs.After, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) return processRedditResponse(j, resp, cursor, err) - case types.RedditSearchCommunities: + case types.CapSearchCommunities: resp, cursor, err := redditClient.SearchCommunities(j.WorkerID, redditArgs.Queries, commonArgs, client.Cursor(redditArgs.NextCursor), redditArgs.MaxResults) return processRedditResponse(j, resp, cursor, err) default: - return types.JobResult{Error: "invalid type for Reddit job"}, fmt.Errorf("invalid type for Reddit job: %s", redditArgs.QueryType) + return types.JobResult{Error: "invalid type for Reddit job"}, fmt.Errorf("invalid type for Reddit job: %s", redditArgs.Type) } } @@ -117,17 +118,3 @@ func processRedditResponse(j types.Job, resp []*types.RedditResponse, cursor cli NextCursor: cursor.String(), }, nil } - -// GetStructuredCapabilities returns the structured capabilities supported by this Twitter scraper -// based on the available credentials and API keys -func (rs *RedditScraper) GetStructuredCapabilities() types.WorkerCapabilities { - capabilities := make(types.WorkerCapabilities) - - // Add Apify-specific capabilities based on available API key - // TODO: We should verify whether each of the actors is actually available through this API key - if rs.configuration.ApifyApiKey != "" { - capabilities[types.RedditJob] = types.RedditCaps - } - - return capabilities -} diff --git a/internal/jobs/reddit_test.go b/internal/jobs/reddit_test.go index 7de1d02..344dbf8 100644 --- a/internal/jobs/reddit_test.go +++ b/internal/jobs/reddit_test.go @@ -97,7 +97,7 @@ var _ = Describe("RedditScraper", func() { "https://www.reddit.com/r/HHGTTG/comments/1jynlrz/the_entire_series_after_restaurant_at_the_end_of/", } job.Arguments = map[string]any{ - "type": types.RedditScrapeUrls, + "type": types.CapScrapeUrls, "urls": testUrls, } @@ -121,7 +121,7 @@ var _ = 
Describe("RedditScraper", func() { It("should call SearchUsers for the correct QueryType", func() { job.Arguments = map[string]any{ - "type": types.RedditSearchUsers, + "type": types.CapSearchUsers, "queries": []string{"user-query"}, } @@ -144,7 +144,7 @@ var _ = Describe("RedditScraper", func() { It("should call SearchPosts for the correct QueryType", func() { job.Arguments = map[string]any{ - "type": types.RedditSearchPosts, + "type": types.CapSearchPosts, "queries": []string{"post-query"}, } @@ -167,7 +167,7 @@ var _ = Describe("RedditScraper", func() { It("should call SearchCommunities for the correct QueryType", func() { job.Arguments = map[string]any{ - "type": types.RedditSearchCommunities, + "type": types.CapSearchCommunities, "queries": []string{"community-query"}, } @@ -201,7 +201,7 @@ var _ = Describe("RedditScraper", func() { It("should handle errors from the reddit client", func() { job.Arguments = map[string]any{ - "type": types.RedditSearchPosts, + "type": types.CapSearchPosts, "queries": []string{"post-query"}, } @@ -221,7 +221,7 @@ var _ = Describe("RedditScraper", func() { return nil, errors.New("client creation failed") } job.Arguments = map[string]any{ - "type": types.RedditSearchPosts, + "type": types.CapSearchPosts, "queries": []string{"post-query"}, } diff --git a/internal/jobs/redditapify/client.go b/internal/jobs/redditapify/client.go index 16e5190..79f2343 100644 --- a/internal/jobs/redditapify/client.go +++ b/internal/jobs/redditapify/client.go @@ -7,7 +7,7 @@ import ( "github.com/sirupsen/logrus" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/reddit/search" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/stats" @@ -25,7 +25,7 @@ type CommonArgs struct { MaxUsers uint } -func (ca *CommonArgs) CopyFromArgs(a *args.RedditArguments) { +func (ca *CommonArgs) CopyFromArgs(a *search.Arguments) { ca.Sort = a.Sort ca.IncludeNSFW = a.IncludeNSFW ca.MaxItems = a.MaxItems @@ -50,7 +50,7 @@ func (args *CommonArgs) ToActorRequest() RedditActorRequest { // RedditActorRequest represents the query parameters for the Apify Reddit Scraper actor. // Based on the input schema of https://apify.com/trudax/reddit-scraper type RedditActorRequest struct { - Type types.RedditQueryType `json:"type,omitempty"` + Type types.Capability `json:"type,omitempty"` Searches []string `json:"searches,omitempty"` StartUrls []types.RedditStartURL `json:"startUrls,omitempty"` Sort types.RedditSortType `json:"sort,omitempty"` diff --git a/internal/jobs/redditapify/client_test.go b/internal/jobs/redditapify/client_test.go index d75a804..3a3d59a 100644 --- a/internal/jobs/redditapify/client_test.go +++ b/internal/jobs/redditapify/client_test.go @@ -8,7 +8,7 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/reddit/search" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/redditapify" @@ -97,7 +97,7 @@ var _ = Describe("RedditApifyClient", func() { Expect(req.Searches).To(Equal(queries)) Expect(req.StartUrls).To(BeNil()) Expect(*req.PostDateLimit).To(BeTemporally("~", after, time.Second)) - Expect(req.Type).To(Equal(types.RedditQueryType("posts"))) + Expect(req.Type).To(Equal(types.CapSearchPosts)) Expect(req.SearchPosts).To(BeTrue()) Expect(req.SkipComments).To(BeFalse()) Expect(req.MaxComments).To(Equal(uint(5))) @@ -119,7 +119,7 @@ var _ = Describe("RedditApifyClient", func() { req := input.(redditapify.RedditActorRequest) Expect(req.Searches).To(Equal(queries)) Expect(req.StartUrls).To(BeNil()) - Expect(req.Type).To(Equal(types.RedditQueryType("communities"))) + Expect(req.Type).To(Equal(types.CapSearchCommunities)) Expect(req.SearchCommunities).To(BeTrue()) return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil } @@ -139,7 +139,7 @@ var _ = Describe("RedditApifyClient", func() { req := input.(redditapify.RedditActorRequest) Expect(req.Searches).To(Equal(queries)) Expect(req.StartUrls).To(BeNil()) - Expect(req.Type).To(Equal(types.RedditQueryType("users"))) + Expect(req.Type).To(Equal(types.CapSearchUsers)) Expect(req.SearchUsers).To(BeTrue()) Expect(req.SkipUserPosts).To(BeTrue()) return &client.DatasetResponse{Data: client.ApifyDatasetData{Items: []json.RawMessage{}}}, "next", nil @@ -200,7 +200,7 @@ var _ = Describe("RedditApifyClient", func() { Describe("CommonArgs", func() { It("should copy from RedditArguments correctly", func() { - redditArgs := &args.RedditArguments{ + redditArgs := &search.Arguments{ Sort: types.RedditSortTop, IncludeNSFW: true, MaxItems: 1, diff --git a/internal/jobs/telemetry.go b/internal/jobs/telemetry.go index 0ad9425..db92afa 100644 --- a/internal/jobs/telemetry.go +++ b/internal/jobs/telemetry.go @@ -15,13 +15,6 @@ func NewTelemetryJob(jc config.JobConfiguration, c *stats.StatsCollector) Teleme return TelemetryJob{collector: c} } -// GetStructuredCapabilities returns the structured capabilities supported by the telemetry job -func (t TelemetryJob) GetStructuredCapabilities() types.WorkerCapabilities { - return types.WorkerCapabilities{ - types.TelemetryJob: types.AlwaysAvailableTelemetryCaps, - } -} - func (t TelemetryJob) ExecuteJob(j types.Job) (types.JobResult, error) { logrus.Debug("Executing telemetry job") diff --git a/internal/jobs/telemetry_test.go b/internal/jobs/telemetry_test.go index f869597..2a8960a 100644 --- a/internal/jobs/telemetry_test.go +++ b/internal/jobs/telemetry_test.go @@ -99,14 +99,7 @@ var _ = Describe("Telemetry Job", func() { logrus.WithField("error", result.Error).Info("Telemetry job handled missing stats collector correctly") }) - It("should return structured capabilities", func() { - capabilities := telemetryJob.GetStructuredCapabilities() - - Expect(capabilities).NotTo(BeEmpty()) - Expect(capabilities).To(HaveLen(1)) - Expect(capabilities[types.TelemetryJob]).To(ContainElement(types.CapTelemetry)) - - logrus.WithField("capabilities", capabilities).Info("Telemetry job capabilities verified") - }) + // Note: Capability detection is now centralized in capabilities/detector.go + // Individual scraper capability tests have been removed }) }) diff --git 
a/internal/jobs/tiktok.go b/internal/jobs/tiktok.go index f3fe2f2..8d5bc05 100644 --- a/internal/jobs/tiktok.go +++ b/internal/jobs/tiktok.go @@ -11,6 +11,9 @@ import ( "time" "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/tiktok/query" + "github.com/masa-finance/tee-worker/api/args/tiktok/transcription" + "github.com/masa-finance/tee-worker/api/args/tiktok/trending" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" @@ -40,18 +43,6 @@ type TikTokTranscriber struct { httpClient *http.Client } -// GetStructuredCapabilities returns the structured capabilities supported by the TikTok transcriber -func (t *TikTokTranscriber) GetStructuredCapabilities() types.WorkerCapabilities { - caps := make([]types.Capability, 0, len(types.AlwaysAvailableTiktokCaps)+len(types.TiktokSearchCaps)) - caps = append(caps, types.AlwaysAvailableTiktokCaps...) - if t.configuration.ApifyApiKey != "" { - caps = append(caps, types.TiktokSearchCaps...) - } - return types.WorkerCapabilities{ - types.TiktokJob: caps, - } -} - // NewTikTokTranscriber creates and initializes a new TikTokTranscriber. // It sets default values for the API configuration. func NewTikTokTranscriber(jc config.JobConfiguration, statsCollector *stats.StatsCollector) *TikTokTranscriber { @@ -110,11 +101,11 @@ func (ttt *TikTokTranscriber) ExecuteJob(j types.Job) (types.JobResult, error) { } // Branch by argument type (transcription vs search) - if transcriptionArgs, ok := jobArgs.(*args.TikTokTranscriptionArguments); ok { + if transcriptionArgs, ok := jobArgs.(*transcription.Arguments); ok { return ttt.executeTranscription(j, transcriptionArgs) - } else if searchByQueryArgs, ok := jobArgs.(*args.TikTokSearchByQueryArguments); ok { + } else if searchByQueryArgs, ok := jobArgs.(*query.Arguments); ok { return ttt.executeSearchByQuery(j, searchByQueryArgs) - } else if searchByTrendingArgs, ok := jobArgs.(*args.TikTokSearchByTrendingArguments); ok { + } else if searchByTrendingArgs, ok := jobArgs.(*trending.Arguments); ok { return ttt.executeSearchByTrending(j, searchByTrendingArgs) } else { return types.JobResult{Error: "invalid argument type for TikTok job"}, fmt.Errorf("invalid argument type") @@ -122,7 +113,7 @@ func (ttt *TikTokTranscriber) ExecuteJob(j types.Job) (types.JobResult, error) { } // executeTranscription calls the external transcription service and returns a normalized result -func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *args.TikTokTranscriptionArguments) (types.JobResult, error) { +func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *transcription.Arguments) (types.JobResult, error) { logrus.WithField("job_uuid", j.UUID).Info("Starting ExecuteJob for TikTok transcription") if ttt.configuration.TranscriptionEndpoint == "" { @@ -137,7 +128,7 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *args.TikTokTr } // Type assert to TikTok arguments - tiktokArgs, ok := jobArgs.(*args.TikTokTranscriptionArguments) + tiktokArgs, ok := jobArgs.(*transcription.Arguments) if !ok { return types.JobResult{Error: "invalid argument type for TikTok job"}, fmt.Errorf("invalid argument type") } @@ -293,7 +284,7 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *args.TikTokTr } // executeSearchByQuery runs the epctex/tiktok-search-scraper actor and returns results -func (ttt *TikTokTranscriber) executeSearchByQuery(j types.Job, a 
*args.TikTokSearchByQueryArguments) (types.JobResult, error) {
+func (ttt *TikTokTranscriber) executeSearchByQuery(j types.Job, a *query.Arguments) (types.JobResult, error) {
 c, err := tiktokapify.NewTikTokApifyClient(ttt.configuration.ApifyApiKey)
 if err != nil {
 ttt.stats.Add(j.WorkerID, stats.TikTokAuthErrors, 1)
@@ -324,7 +315,7 @@ func (ttt *TikTokTranscriber) executeSearchByQuery(j types.Job, a *args.TikTokSe
 }
 
 // executeSearchByTrending runs the lexis-solutions/tiktok-trending-videos-scraper actor and returns results
-func (ttt *TikTokTranscriber) executeSearchByTrending(j types.Job, a *args.TikTokSearchByTrendingArguments) (types.JobResult, error) {
+func (ttt *TikTokTranscriber) executeSearchByTrending(j types.Job, a *trending.Arguments) (types.JobResult, error) {
 c, err := tiktokapify.NewTikTokApifyClient(ttt.configuration.ApifyApiKey)
 if err != nil {
 ttt.stats.Add(j.WorkerID, stats.TikTokAuthErrors, 1)
diff --git a/internal/jobs/tiktok_test.go b/internal/jobs/tiktok_test.go
index b488e06..5c9b5f4 100644
--- a/internal/jobs/tiktok_test.go
+++ b/internal/jobs/tiktok_test.go
@@ -44,14 +44,14 @@ var _ = Describe("TikTok", func() {
 Context("when a valid TikTok URL is provided", func() {
 It("should successfully transcribe the video and record success stats", func(ctx SpecContext) {
 videoURL := "https://www.tiktok.com/@theblockrunner.com/video/7227579907361066282"
- jobArguments := map[string]interface{}{
+ jobArguments := map[string]interface{}{
 "type": types.CapTranscription,
 "video_url": videoURL,
 }
 
 job := types.Job{
 Type: types.TiktokJob,
- Arguments: jobArguments,
+ Arguments: jobArguments,
 WorkerID: "tiktok-test-worker-happy",
 UUID: "test-uuid-happy",
 }
@@ -114,14 +114,14 @@
 
 Context("when arguments are invalid", func() {
 It("should return an error if VideoURL is empty and not record error stats", func() {
- jobArguments := map[string]interface{}{
+ jobArguments := map[string]interface{}{
 "type": types.CapTranscription,
 "video_url": "", // Empty URL
 }
 
 job := types.Job{
 Type: types.TiktokJob,
- Arguments: jobArguments,
+ Arguments: jobArguments,
 WorkerID: "tiktok-test-worker-invalid",
 UUID: "test-uuid-invalid",
 }
diff --git a/internal/jobs/tiktokapify/client.go b/internal/jobs/tiktokapify/client.go
index 75139a5..48c9b07 100644
--- a/internal/jobs/tiktokapify/client.go
+++ b/internal/jobs/tiktokapify/client.go
@@ -4,7 +4,8 @@ import (
 "encoding/json"
 "fmt"
 
- "github.com/masa-finance/tee-worker/api/args"
+ "github.com/masa-finance/tee-worker/api/args/tiktok/query"
+ "github.com/masa-finance/tee-worker/api/args/tiktok/trending"
 "github.com/masa-finance/tee-worker/api/types"
 "github.com/masa-finance/tee-worker/internal/apify"
 "github.com/masa-finance/tee-worker/pkg/client"
@@ -43,7 +44,7 @@ func (c *TikTokApifyClient) ValidateApiKey() error {
 }
 
 // SearchByQuery runs the search actor and returns typed results
-func (c *TikTokApifyClient) SearchByQuery(input args.TikTokSearchByQueryArguments, cursor client.Cursor, limit uint) ([]*types.TikTokSearchByQueryResult, client.Cursor, error) {
+func (c *TikTokApifyClient) SearchByQuery(input query.Arguments, cursor client.Cursor, limit uint) ([]*types.TikTokSearchByQueryResult, client.Cursor, error) {
 // Map snake_case fields to Apify actor's expected camelCase input
 startUrls := input.StartUrls
 if startUrls == nil {
@@ -92,7 +93,7 @@ func (c *TikTokApifyClient) SearchByQuery(input args.TikTokSearchByQueryArgument
 }
 
 // SearchByTrending runs the trending actor and returns typed results
-func (c *TikTokApifyClient) 
SearchByTrending(input args.TikTokSearchByTrendingArguments, cursor client.Cursor, limit uint) ([]*types.TikTokSearchByTrending, client.Cursor, error) { +func (c *TikTokApifyClient) SearchByTrending(input trending.Arguments, cursor client.Cursor, limit uint) ([]*types.TikTokSearchByTrending, client.Cursor, error) { request := TikTokSearchByTrendingRequest{ CountryCode: input.CountryCode, SortBy: input.SortBy, diff --git a/internal/jobs/twitter.go b/internal/jobs/twitter.go index e3a1991..fe10bb2 100644 --- a/internal/jobs/twitter.go +++ b/internal/jobs/twitter.go @@ -12,6 +12,7 @@ import ( "github.com/masa-finance/tee-worker/pkg/client" "github.com/masa-finance/tee-worker/api/args" + twitterargs "github.com/masa-finance/tee-worker/api/args/twitter" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs/stats" @@ -144,7 +145,6 @@ func (ts *TwitterScraper) getApiScraper(j types.Job) (*twitterx.TwitterXScraper, // getApifyScraper returns an Apify client func (ts *TwitterScraper) getApifyScraper(j types.Job) (*twitterapify.TwitterApifyClient, error) { - // TODO: We should verify whether each of the actors is actually available through this API key if ts.configuration.ApifyApiKey == "" { ts.statsCollector.Add(j.WorkerID, stats.TwitterAuthErrors, 1) return nil, fmt.Errorf("no Apify API key available") @@ -183,79 +183,24 @@ func filterMap[T any, R any](slice []T, f func(T) (R, bool)) []R { return result } -func (ts *TwitterScraper) ScrapeFollowersForProfile(j types.Job, baseDir string, username string, count int) ([]*twitterscraper.Profile, error) { +func (ts *TwitterScraper) SearchByProfile(j types.Job, baseDir string, username string) (twitterscraper.Profile, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { - return nil, err - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - followingResponse, errString, _ := scraper.FetchFollowers(username, count, "") - if errString != "" { - fetchErr := fmt.Errorf("error fetching followers: %s", errString) - if ts.handleError(j, fetchErr, account) { - return nil, fetchErr - } - logrus.Errorf("[-] Error fetching followers: %s", errString) - return nil, fetchErr - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterProfiles, uint(len(followingResponse))) - return followingResponse, nil -} - -func (ts *TwitterScraper) ScrapeTweetsProfile(j types.Job, baseDir string, username string) (twitterscraper.Profile, error) { - logrus.Infof("[ScrapeTweetsProfile] Starting profile scraping for username: %s", username) - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err != nil { - logrus.Errorf("[ScrapeTweetsProfile] Failed to get credential scraper: %v", err) + logrus.Errorf("failed to get credential scraper: %v", err) return twitterscraper.Profile{}, err } - - logrus.Infof("[ScrapeTweetsProfile] About to increment TwitterScrapes stat for WorkerID: %s", j.WorkerID) ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - logrus.Infof("[ScrapeTweetsProfile] TwitterScrapes incremented, now calling scraper.GetProfile") - profile, err := scraper.GetProfile(username) if err != nil { - logrus.Errorf("[ScrapeTweetsProfile] scraper.GetProfile failed for username %s: %v", username, err) + logrus.Errorf("scraper.GetProfile failed for username %s: %v", username, err) _ = ts.handleError(j, err, account) return twitterscraper.Profile{}, err } - - logrus.Infof("[ScrapeTweetsProfile] Profile retrieved successfully 
for username: %s, profile: %+v", username, profile) - logrus.Infof("[ScrapeTweetsProfile] About to increment TwitterProfiles stat for WorkerID: %s", j.WorkerID) ts.statsCollector.Add(j.WorkerID, stats.TwitterProfiles, 1) - logrus.Infof("[ScrapeTweetsProfile] TwitterProfiles incremented successfully") - return profile, nil } -func (ts *TwitterScraper) ScrapeTweetsByFullArchiveSearchQuery(j types.Job, baseDir string, query string, count int) ([]*types.TweetResult, error) { - return ts.queryTweets(j, twitterx.TweetsAll, baseDir, query, count) -} - -func (ts *TwitterScraper) ScrapeTweetsByRecentSearchQuery(j types.Job, baseDir string, query string, count int) ([]*types.TweetResult, error) { - return ts.queryTweets(j, twitterx.TweetsSearchRecent, baseDir, query, count) -} - -func (ts *TwitterScraper) queryTweets(j types.Job, baseQueryEndpoint string, baseDir string, query string, count int) ([]*types.TweetResult, error) { - // Try credentials first, fallback to API for CapSearchByQuery - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err == nil { - return ts.scrapeTweetsWithCredentials(j, query, count, scraper, account) - } - - // Fallback to API - twitterXScraper, apiKey, apiErr := ts.getApiScraper(j) - if apiErr != nil { - ts.statsCollector.Add(j.WorkerID, stats.TwitterAuthErrors, 1) - return nil, fmt.Errorf("no Twitter accounts or API keys available") - } - return ts.scrapeTweets(j, baseQueryEndpoint, query, count, twitterXScraper, apiKey) -} - -func (ts *TwitterScraper) queryTweetsWithCredentials(j types.Job, baseDir string, query string, count int) ([]*types.TweetResult, error) { +func (ts *TwitterScraper) SearchByQuery(j types.Job, baseDir string, query string, count int) ([]*types.TweetResult, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { return nil, err @@ -263,12 +208,12 @@ func (ts *TwitterScraper) queryTweetsWithCredentials(j types.Job, baseDir string return ts.scrapeTweetsWithCredentials(j, query, count, scraper, account) } -func (ts *TwitterScraper) queryTweetsWithApiKey(j types.Job, baseQueryEndpoint string, query string, count int) ([]*types.TweetResult, error) { +func (ts *TwitterScraper) SearchByFullArchive(j types.Job, baseQueryEndpoint string, query string, count int) ([]*types.TweetResult, error) { twitterXScraper, apiKey, err := ts.getApiScraper(j) if err != nil { return nil, err } - return ts.scrapeTweets(j, baseQueryEndpoint, query, count, twitterXScraper, apiKey) + return ts.scrapeTweetsWithAPI(j, baseQueryEndpoint, query, count, twitterXScraper, apiKey) } func (ts *TwitterScraper) scrapeTweetsWithCredentials(j types.Job, query string, count int, scraper *twitter.Scraper, account *twitter.TwitterAccount) ([]*types.TweetResult, error) { @@ -293,8 +238,7 @@ func (ts *TwitterScraper) scrapeTweetsWithCredentials(j types.Job, query string, return tweets, nil } -// scrapeTweets uses an existing scraper instance -func (ts *TwitterScraper) scrapeTweets(j types.Job, baseQueryEndpoint string, query string, count int, twitterXScraper *twitterx.TwitterXScraper, apiKey *twitter.TwitterApiKey) ([]*types.TweetResult, error) { +func (ts *TwitterScraper) scrapeTweetsWithAPI(j types.Job, baseQueryEndpoint string, query string, count int, twitterXScraper *twitterx.TwitterXScraper, apiKey *twitter.TwitterApiKey) ([]*types.TweetResult, error) { ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) if baseQueryEndpoint == twitterx.TweetsAll && apiKey.Type == twitter.TwitterApiKeyTypeBase { @@ -391,28 +335,6 @@ EndLoop: return 
tweets, nil } -func (ts *TwitterScraper) ScrapeTweetByID(j types.Job, baseDir string, tweetID string) (*types.TweetResult, error) { - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err != nil { - return nil, err - } - - tweet, err := scraper.GetTweet(tweetID) - if err != nil { - _ = ts.handleError(j, err, account) - return nil, err - } - if tweet == nil { - return nil, fmt.Errorf("tweet not found or error occurred, but error was nil") - } - - tweetResult := ts.convertTwitterScraperTweetToTweetResult(*tweet) - ts.statsCollector.Add(j.WorkerID, stats.TwitterTweets, 1) - return tweetResult, nil -} - func (ts *TwitterScraper) GetTweet(j types.Job, baseDir, tweetID string) (*types.TweetResult, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { @@ -583,137 +505,6 @@ func (ts *TwitterScraper) GetUserMedia(j types.Job, baseDir, username string, co return media, nextCursor, nil } -func (ts *TwitterScraper) GetHomeTweets(j types.Job, baseDir string, count int, cursor string) ([]*types.TweetResult, string, error) { - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err != nil { - return nil, "", err - } - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - - var tweets []*types.TweetResult - var nextCursor string - - if cursor != "" { - fetchedTweets, fetchCursor, fetchErr := scraper.FetchHomeTweets(count, cursor) - if fetchErr != nil { - _ = ts.handleError(j, fetchErr, account) - return nil, "", fetchErr - } - for _, tweet := range fetchedTweets { - newTweetResult := ts.convertTwitterScraperTweetToTweetResult(*tweet) - tweets = append(tweets, newTweetResult) - } - nextCursor = fetchCursor - } else { - ctx, cancel := context.WithTimeout(context.Background(), j.Timeout) - defer cancel() - for tweetScraped := range scraper.GetHomeTweets(ctx, count) { - if tweetScraped.Error != nil { - _ = ts.handleError(j, tweetScraped.Error, account) - return nil, "", tweetScraped.Error - } - newTweetResult := ts.convertTwitterScraperTweetToTweetResult(tweetScraped.Tweet) - tweets = append(tweets, newTweetResult) - if len(tweets) >= count && count > 0 { - break - } - } - if len(tweets) > 0 { - nextCursor = strconv.FormatInt(tweets[len(tweets)-1].ID, 10) - } - } - ts.statsCollector.Add(j.WorkerID, stats.TwitterTweets, uint(len(tweets))) - return tweets, nextCursor, nil -} - -func (ts *TwitterScraper) GetForYouTweets(j types.Job, baseDir string, count int, cursor string) ([]*types.TweetResult, string, error) { - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err != nil { - return nil, "", err - } - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - - var tweets []*types.TweetResult - var nextCursor string - - if cursor != "" { - fetchedTweets, fetchCursor, fetchErr := scraper.FetchForYouTweets(count, cursor) - if fetchErr != nil { - _ = ts.handleError(j, fetchErr, account) - return nil, "", fetchErr - } - for _, tweet := range fetchedTweets { - newTweetResult := ts.convertTwitterScraperTweetToTweetResult(*tweet) - tweets = append(tweets, newTweetResult) - } - nextCursor = fetchCursor - } else { - ctx, cancel := context.WithTimeout(context.Background(), j.Timeout) - defer cancel() - for tweetScraped := range scraper.GetForYouTweets(ctx, count) { - if tweetScraped.Error != nil { - _ = ts.handleError(j, tweetScraped.Error, account) - return nil, "", tweetScraped.Error - } - newTweetResult := ts.convertTwitterScraperTweetToTweetResult(tweetScraped.Tweet) 
- tweets = append(tweets, newTweetResult) - if len(tweets) >= count && count > 0 { - break - } - } - if len(tweets) > 0 { - nextCursor = strconv.FormatInt(tweets[len(tweets)-1].ID, 10) - } - } - ts.statsCollector.Add(j.WorkerID, stats.TwitterTweets, uint(len(tweets))) - return tweets, nextCursor, nil -} - -func (ts *TwitterScraper) GetBookmarks(j types.Job, baseDir string, count int, cursor string) ([]*types.TweetResult, string, error) { - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err != nil { - return nil, "", err - } - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - var bookmarks []*types.TweetResult - - ctx, cancel := context.WithTimeout(context.Background(), j.Timeout) - defer cancel() - cursorInt := 0 - if cursor != "" { - var parseErr error - cursorInt, parseErr = strconv.Atoi(cursor) - if parseErr != nil { - logrus.Warnf("Invalid cursor value for GetBookmarks '%s', using default 0: %v", cursor, parseErr) - cursorInt = 0 // Ensure it's reset if parse fails - } - } - for tweetScraped := range scraper.GetBookmarks(ctx, cursorInt) { - if tweetScraped.Error != nil { - _ = ts.handleError(j, tweetScraped.Error, account) - return nil, "", tweetScraped.Error - } - newTweetResult := ts.convertTwitterScraperTweetToTweetResult(tweetScraped.Tweet) - bookmarks = append(bookmarks, newTweetResult) - if len(bookmarks) >= count && count > 0 { - break - } - } - - var nextCursor string - if len(bookmarks) > 0 { - // The twitterscraper GetBookmarks cursor is an offset. - // The next cursor should be the current offset + number of items fetched in this batch. - nextCursor = strconv.Itoa(cursorInt + len(bookmarks)) - } else if cursor != "" { - // If no bookmarks were fetched but a cursor was provided, retain it or signal no change - nextCursor = cursor - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterTweets, uint(len(bookmarks))) - return bookmarks, nextCursor, nil -} - func (ts *TwitterScraper) GetProfileByID(j types.Job, baseDir, userID string) (*twitterscraper.Profile, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { @@ -730,98 +521,6 @@ func (ts *TwitterScraper) GetProfileByID(j types.Job, baseDir, userID string) (* return &profile, nil } -// GetProfileByIDWithApiKey fetches user profile using Twitter API key -func (ts *TwitterScraper) GetProfileByIDWithApiKey(j types.Job, userID string, apiKey *twitter.TwitterApiKey) (*twitterx.TwitterXProfileResponse, error) { - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - - apiClient := client.NewTwitterXClient(apiKey.Key) - twitterXScraper := twitterx.NewTwitterXScraper(apiClient) - - profile, err := twitterXScraper.GetProfileByID(userID) - if err != nil { - if ts.handleError(j, err, nil) { - return nil, err - } - return nil, err - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterProfiles, 1) - return profile, nil -} - -// GetTweetByIDWithApiKey fetches a tweet using Twitter API key -func (ts *TwitterScraper) GetTweetByIDWithApiKey(j types.Job, tweetID string, apiKey *twitter.TwitterApiKey) (*types.TweetResult, error) { - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - - apiClient := client.NewTwitterXClient(apiKey.Key) - twitterXScraper := twitterx.NewTwitterXScraper(apiClient) - - tweetData, err := twitterXScraper.GetTweetByID(tweetID) - if err != nil { - if ts.handleError(j, err, nil) { - return nil, err - } - return nil, err - } - - // Convert TwitterXTweetData to TweetResult - tweetIDInt, convErr := strconv.ParseInt(tweetData.ID, 10, 64) - if 
convErr != nil { - logrus.Errorf("Failed to convert tweet ID '%s' to int64: %v", tweetData.ID, convErr) - return nil, fmt.Errorf("failed to parse tweet ID '%s': %w", tweetData.ID, convErr) - } - - // Parse the created_at time string - createdAt, timeErr := time.Parse(time.RFC3339, tweetData.CreatedAt) - if timeErr != nil { - logrus.Warnf("Failed to parse created_at time '%s': %v", tweetData.CreatedAt, timeErr) - createdAt = time.Now() // fallback to current time - } - - tweetResult := &types.TweetResult{ - ID: tweetIDInt, - TweetID: tweetData.ID, - AuthorID: tweetData.AuthorID, - Text: tweetData.Text, - ConversationID: tweetData.ConversationID, - UserID: tweetData.AuthorID, - CreatedAt: createdAt, - Username: tweetData.Username, - Lang: tweetData.Lang, - PublicMetrics: types.PublicMetrics{ - RetweetCount: tweetData.PublicMetrics.RetweetCount, - ReplyCount: tweetData.PublicMetrics.ReplyCount, - LikeCount: tweetData.PublicMetrics.LikeCount, - QuoteCount: tweetData.PublicMetrics.QuoteCount, - BookmarkCount: tweetData.PublicMetrics.BookmarkCount, - }, - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterTweets, 1) - return tweetResult, nil -} - -func (ts *TwitterScraper) SearchProfile(j types.Job, query string, count int) ([]*twitterscraper.ProfileResult, error) { - scraper, _, err := ts.getCredentialScraper(j, ts.configuration.DataDir) - if err != nil { - return nil, err - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - var profiles []*twitterscraper.ProfileResult - ctx, cancel := context.WithTimeout(context.Background(), j.Timeout) - defer cancel() - - for profile := range scraper.SearchProfiles(ctx, query, count) { - profiles = append(profiles, profile) - if len(profiles) >= count && count > 0 { - break - } - } - ts.statsCollector.Add(j.WorkerID, stats.TwitterProfiles, uint(len(profiles))) - return profiles, nil -} - func (ts *TwitterScraper) GetTrends(j types.Job, baseDir string) ([]string, error) { scraper, account, err := ts.getCredentialScraper(j, baseDir) if err != nil { @@ -838,39 +537,6 @@ func (ts *TwitterScraper) GetTrends(j types.Job, baseDir string) ([]string, erro return trends, nil } -func (ts *TwitterScraper) GetFollowers(j types.Job, baseDir, user string, count int) ([]*twitterscraper.Profile, error) { - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err != nil { - return nil, err - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - followers, _, fetchErr := scraper.FetchFollowers(user, count, "") - if fetchErr != nil { - _ = ts.handleError(j, fetchErr, account) - return nil, fetchErr - } - ts.statsCollector.Add(j.WorkerID, stats.TwitterProfiles, uint(len(followers))) - return followers, nil -} - -func (ts *TwitterScraper) GetFollowing(j types.Job, baseDir, username string, count int) ([]*twitterscraper.Profile, error) { - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err != nil { - return nil, err - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - following, _, fetchErr := scraper.FetchFollowing(username, count, "") - if fetchErr != nil { - _ = ts.handleError(j, fetchErr, account) // Assuming FetchFollowing returns error, not errString - return nil, fetchErr - } - ts.statsCollector.Add(j.WorkerID, stats.TwitterProfiles, uint(len(following))) - return following, nil -} - -// getFollowersApify retrieves followers using Apify func (ts *TwitterScraper) getFollowersApify(j types.Job, username string, maxResults uint, cursor client.Cursor) ([]*types.ProfileResultApify, client.Cursor, 
error) { apifyScraper, err := ts.getApifyScraper(j) if err != nil { @@ -888,7 +554,6 @@ func (ts *TwitterScraper) getFollowersApify(j types.Job, username string, maxRes return followers, nextCursor, nil } -// getFollowingApify retrieves following using Apify func (ts *TwitterScraper) getFollowingApify(j types.Job, username string, maxResults uint, cursor client.Cursor) ([]*types.ProfileResultApify, client.Cursor, error) { apifyScraper, err := ts.getApifyScraper(j) if err != nil { @@ -922,45 +587,6 @@ func (ts *TwitterScraper) GetSpace(j types.Job, baseDir, spaceID string) (*twitt return space, nil } -func (ts *TwitterScraper) FetchHomeTweets(j types.Job, baseDir string, count int, cursor string) ([]*twitterscraper.Tweet, string, error) { - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err != nil { - return nil, "", err - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - tweets, nextCursor, fetchErr := scraper.FetchHomeTweets(count, cursor) - if fetchErr != nil { - _ = ts.handleError(j, fetchErr, account) - return nil, "", fetchErr - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterTweets, uint(len(tweets))) - return tweets, nextCursor, nil -} - -func (ts *TwitterScraper) FetchForYouTweets(j types.Job, baseDir string, count int, cursor string) ([]*twitterscraper.Tweet, string, error) { - scraper, account, err := ts.getCredentialScraper(j, baseDir) - if err != nil { - return nil, "", err - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterScrapes, 1) - tweets, nextCursor, fetchErr := scraper.FetchForYouTweets(count, cursor) - if fetchErr != nil { - _ = ts.handleError(j, fetchErr, account) - return nil, "", fetchErr - } - - ts.statsCollector.Add(j.WorkerID, stats.TwitterTweets, uint(len(tweets))) - return tweets, nextCursor, nil -} - -// TwitterScraperConfig is now defined in api/types to avoid duplication and circular imports - -// twitterScraperRuntimeConfig holds the runtime configuration without JSON tags to prevent credential serialization -// Unified config: use types.TwitterScraperConfig directly - type TwitterScraper struct { configuration config.TwitterScraperConfig accountManager *twitter.TwitterAccountManager @@ -984,346 +610,104 @@ func NewTwitterScraper(jc config.JobConfiguration, c *stats.StatsCollector) *Twi accountManager: accountManager, statsCollector: c, capabilities: map[types.Capability]bool{ - types.CapSearchByQuery: true, + // Credential-based capabilities + types.CapSearchByQuery: true, + types.CapSearchByProfile: true, + types.CapGetById: true, + types.CapGetReplies: true, + types.CapGetTweets: true, + types.CapGetMedia: true, + types.CapGetProfileById: true, + types.CapGetTrends: true, + types.CapGetSpace: true, + types.CapGetProfile: true, + + // API-based capabilities types.CapSearchByFullArchive: true, - types.CapSearchByProfile: true, - types.CapGetById: true, - types.CapGetReplies: true, - types.CapGetRetweeters: true, - types.CapGetTweets: true, - types.CapGetMedia: true, - types.CapGetHomeTweets: true, - types.CapGetForYouTweets: true, - types.CapGetProfileById: true, - types.CapGetTrends: true, - types.CapGetFollowing: true, - types.CapGetFollowers: true, - types.CapGetSpace: true, - }, - } -} - -// GetStructuredCapabilities returns the structured capabilities supported by this Twitter scraper -// based on the available credentials and API keys -func (ts *TwitterScraper) GetStructuredCapabilities() types.WorkerCapabilities { - capabilities := make(types.WorkerCapabilities) - - // Check if we have Twitter accounts 
for credential-based scraping - if len(ts.configuration.Accounts) > 0 { - var credCaps []types.Capability - for capability, enabled := range ts.capabilities { - if enabled { - credCaps = append(credCaps, capability) - } - } - if len(credCaps) > 0 { - capabilities[types.TwitterCredentialJob] = credCaps - } - } - - // Check if we have API keys for API-based scraping - if len(ts.configuration.ApiKeys) > 0 { - apiCaps := make([]types.Capability, len(types.TwitterAPICaps)) - copy(apiCaps, types.TwitterAPICaps) - - // Check for elevated API capabilities - if ts.accountManager != nil { - for _, apiKey := range ts.accountManager.GetApiKeys() { - if apiKey.Type == twitter.TwitterApiKeyTypeElevated { - apiCaps = append(apiCaps, types.CapSearchByFullArchive) - break - } - } - } - - capabilities[types.TwitterApiJob] = apiCaps - } - - // Add Apify-specific capabilities based on available API key - // TODO: We should verify whether each of the actors is actually available through this API key - if ts.configuration.ApifyApiKey != "" { - capabilities[types.TwitterApifyJob] = types.TwitterApifyCaps - } - // Add general twitter scraper capability (uses best available method) - if len(ts.configuration.Accounts) > 0 || len(ts.configuration.ApiKeys) > 0 { - var generalCaps []types.Capability - if len(ts.configuration.Accounts) > 0 { - // Use all capabilities if we have accounts - for capability, enabled := range ts.capabilities { - if enabled { - generalCaps = append(generalCaps, capability) - } - } - } else { - // Use API capabilities if we only have keys - generalCaps = make([]types.Capability, len(types.TwitterAPICaps)) - copy(generalCaps, types.TwitterAPICaps) - // Check for elevated capabilities - if ts.accountManager != nil { - for _, apiKey := range ts.accountManager.GetApiKeys() { - if apiKey.Type == twitter.TwitterApiKeyTypeElevated { - generalCaps = append(generalCaps, types.CapSearchByFullArchive) - break - } - } - } - } - - capabilities[types.TwitterJob] = generalCaps - } - - return capabilities -} - -type TwitterScrapeStrategy interface { - Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) -} - -func getScrapeStrategy(jobType types.JobType) TwitterScrapeStrategy { - switch jobType { - case types.TwitterCredentialJob: - return &CredentialScrapeStrategy{} - case types.TwitterApiJob: - return &ApiKeyScrapeStrategy{} - case types.TwitterApifyJob: - return &ApifyScrapeStrategy{} - default: - return &DefaultScrapeStrategy{} - } -} - -type CredentialScrapeStrategy struct{} - -func (s *CredentialScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) { - capability := jobArgs.GetCapability() - switch capability { - case types.CapSearchByQuery: - tweets, err := ts.queryTweetsWithCredentials(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) - return processResponse(tweets, "", err) - case types.CapSearchByFullArchive: - logrus.Warn("Full archive search with credential-only implementation may have limited results") - tweets, err := ts.queryTweetsWithCredentials(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) - return processResponse(tweets, "", err) - default: - return defaultStrategyFallback(j, ts, jobArgs) + // Apify-based capabilities + types.CapGetFollowing: true, + types.CapGetFollowers: true, + }, } } -type ApiKeyScrapeStrategy struct{} - -func (s *ApiKeyScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) 
(types.JobResult, error) { +// executeCapability routes the job to the appropriate method based on capability +func (ts *TwitterScraper) executeCapability(j types.Job, jobArgs *twitterargs.Search) (types.JobResult, error) { capability := jobArgs.GetCapability() - switch capability { - case types.CapSearchByQuery: - tweets, err := ts.queryTweetsWithApiKey(j, twitterx.TweetsSearchRecent, jobArgs.Query, jobArgs.MaxResults) - return processResponse(tweets, "", err) - case types.CapSearchByFullArchive: - tweets, err := ts.queryTweetsWithApiKey(j, twitterx.TweetsAll, jobArgs.Query, jobArgs.MaxResults) - return processResponse(tweets, "", err) - case types.CapGetProfileById: - _, apiKey, err := ts.getApiScraper(j) - if err != nil { - return types.JobResult{Error: err.Error()}, err - } - profile, err := ts.GetProfileByIDWithApiKey(j, jobArgs.Query, apiKey) - return processResponse(profile, "", err) - case types.CapGetById: - _, apiKey, err := ts.getApiScraper(j) - if err != nil { - return types.JobResult{Error: err.Error()}, err - } - tweet, err := ts.GetTweetByIDWithApiKey(j, jobArgs.Query, apiKey) - return processResponse(tweet, "", err) - default: - return defaultStrategyFallback(j, ts, jobArgs) - } -} - -type ApifyScrapeStrategy struct{} -func (s *ApifyScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) { - capability := types.Capability(jobArgs.QueryType) switch capability { + // Apify-based capabilities case types.CapGetFollowers: followers, nextCursor, err := ts.getFollowersApify(j, jobArgs.Query, uint(jobArgs.MaxResults), client.Cursor(jobArgs.NextCursor)) return processResponse(followers, nextCursor.String(), err) case types.CapGetFollowing: following, nextCursor, err := ts.getFollowingApify(j, jobArgs.Query, uint(jobArgs.MaxResults), client.Cursor(jobArgs.NextCursor)) return processResponse(following, nextCursor.String(), err) - default: - return types.JobResult{Error: fmt.Sprintf("unsupported capability %s for Apify job", capability)}, fmt.Errorf("unsupported capability %s for Apify job", capability) - } -} -type DefaultScrapeStrategy struct{} - -// FIXED: Now using validated QueryType from centralized unmarshaller (addresses the TODO comment) -func (s *DefaultScrapeStrategy) Execute(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) { - capability := types.Capability(jobArgs.QueryType) - switch capability { - case types.CapGetFollowers, types.CapGetFollowing: - // Priority: Apify > Credentials for general TwitterJob - // TODO: We should verify whether each of the actors is actually available through this API key - if ts.configuration.ApifyApiKey != "" { - // Use Apify strategy - apifyStrategy := &ApifyScrapeStrategy{} - return apifyStrategy.Execute(j, ts, jobArgs) - } - // Fall back to credential-based strategy - credentialStrategy := &CredentialScrapeStrategy{} - return credentialStrategy.Execute(j, ts, jobArgs) - case types.CapSearchByQuery: - // Priority: Credentials > API for searchbyquery - if len(ts.configuration.Accounts) > 0 { - credentialStrategy := &CredentialScrapeStrategy{} - return credentialStrategy.Execute(j, ts, jobArgs) - } - // Fall back to API strategy - tweets, err := ts.queryTweets(j, twitterx.TweetsSearchRecent, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) - return processResponse(tweets, "", err) + // API-based capabilities case types.CapSearchByFullArchive: - tweets, err := ts.queryTweets(j, twitterx.TweetsAll, 
ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) + tweets, err := ts.SearchByFullArchive(j, twitterx.TweetsAll, jobArgs.Query, jobArgs.MaxResults) return processResponse(tweets, "", err) - default: - return defaultStrategyFallback(j, ts, jobArgs) - } -} -func retryWithCursor[T any]( - j types.Job, - baseDir string, - count int, - cursor string, - fn func(j types.Job, baseDir string, currentCount int, currentCursor string) ([]*T, string, error), -) (types.JobResult, error) { - records := make([]*T, 0, count) - deadline := time.Now().Add(j.Timeout) - currentCursor := cursor // Use 'currentCursor' to manage pagination state within the loop - - for (len(records) < count || count == 0) && time.Now().Before(deadline) { // Allow count == 0 to fetch all available up to timeout - numToFetch := count - len(records) - if count == 0 { // If count is 0, fetch a reasonable batch size, e.g. 100, or let fn decide - numToFetch = 100 // Or another default batch size if fn doesn't handle count=0 well for batching - } - if numToFetch <= 0 && count > 0 { - break - } - - results, nextInternalCursor, err := fn(j, baseDir, numToFetch, currentCursor) - if err != nil { - if len(records) > 0 { - logrus.Warnf("Error during paginated fetch, returning partial results. Error: %v", err) - return processResponse(records, currentCursor, nil) - } - return processResponse(nil, "", err) - } - - if len(results) > 0 { - records = append(records, results...) - } - - if nextInternalCursor == "" || nextInternalCursor == currentCursor { // No more pages or cursor stuck - currentCursor = nextInternalCursor // Update to the last known cursor - break - } - currentCursor = nextInternalCursor - if count > 0 && len(records) >= count { // Check if desired count is reached - break - } - } - return processResponse(records, currentCursor, nil) -} - -func retryWithCursorAndQuery[T any]( - j types.Job, - baseDir string, - query string, - count int, - cursor string, - fn func(j types.Job, baseDir string, currentQuery string, currentCount int, currentCursor string) ([]*T, string, error), -) (types.JobResult, error) { - return retryWithCursor( - j, - baseDir, - count, - cursor, - func(jInner types.Job, baseDirInner string, currentCountInner int, currentCursorInner string) ([]*T, string, error) { - return fn(jInner, baseDirInner, query, currentCountInner, currentCursorInner) - }, - ) -} - -func processResponse(response any, nextCursor string, err error) (types.JobResult, error) { - if err != nil { - logrus.Debugf("Processing response with error: %v, NextCursor: %s", err, nextCursor) - return types.JobResult{Error: err.Error(), NextCursor: nextCursor}, err - } - dat, marshalErr := json.Marshal(response) - if marshalErr != nil { - logrus.Errorf("Error marshalling response: %v", marshalErr) - return types.JobResult{Error: marshalErr.Error()}, marshalErr - } - return types.JobResult{Data: dat, NextCursor: nextCursor}, nil -} - -func defaultStrategyFallback(j types.Job, ts *TwitterScraper, jobArgs *args.TwitterSearchArguments) (types.JobResult, error) { - capability := jobArgs.GetCapability() - switch capability { + // Credential-based capabilities + case types.CapSearchByQuery: + tweets, err := ts.SearchByQuery(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) + return processResponse(tweets, "", err) case types.CapSearchByProfile: - profile, err := ts.ScrapeTweetsProfile(j, ts.configuration.DataDir, jobArgs.Query) + profile, err := ts.SearchByProfile(j, ts.configuration.DataDir, jobArgs.Query) return processResponse(profile, 
"", err) case types.CapGetById: tweet, err := ts.GetTweet(j, ts.configuration.DataDir, jobArgs.Query) return processResponse(tweet, "", err) case types.CapGetReplies: - // GetTweetReplies takes a cursor for a specific part of a thread, not general pagination of all replies. - // The retryWithCursor logic might not directly apply unless GetTweetReplies is adapted for broader pagination. replies, err := ts.GetTweetReplies(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.NextCursor) - return processResponse(replies, jobArgs.NextCursor, err) // Pass original NextCursor as it's specific + return processResponse(replies, jobArgs.NextCursor, err) case types.CapGetRetweeters: - // Similar to GetTweetReplies, cursor is for a specific page. retweeters, err := ts.GetTweetRetweeters(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults, jobArgs.NextCursor) - // GetTweetRetweeters in twitterscraper returns (profiles, nextCursorStr, error) - // The current ts.GetTweetRetweeters doesn't return the next cursor. This should be updated if pagination is needed here. - // For now, assuming it fetches one batch or handles its own pagination internally up to MaxResults. - return processResponse(retweeters, "", err) // Assuming no next cursor from this specific call structure - case types.CapGetTweets: - return retryWithCursorAndQuery(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults, jobArgs.NextCursor, ts.GetUserTweets) + return processResponse(retweeters, jobArgs.NextCursor, err) case types.CapGetMedia: - return retryWithCursorAndQuery(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults, jobArgs.NextCursor, ts.GetUserMedia) - case types.CapGetHomeTweets: - return retryWithCursor(j, ts.configuration.DataDir, jobArgs.MaxResults, jobArgs.NextCursor, ts.GetHomeTweets) - case types.CapGetForYouTweets: - return retryWithCursor(j, ts.configuration.DataDir, jobArgs.MaxResults, jobArgs.NextCursor, ts.GetForYouTweets) + media, nextCursor, err := ts.GetUserMedia(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults, jobArgs.NextCursor) + return processResponse(media, nextCursor, err) case types.CapGetProfileById: profile, err := ts.GetProfileByID(j, ts.configuration.DataDir, jobArgs.Query) return processResponse(profile, "", err) case types.CapGetTrends: trends, err := ts.GetTrends(j, ts.configuration.DataDir) return processResponse(trends, "", err) - case types.CapGetFollowing: - following, err := ts.GetFollowing(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) - return processResponse(following, "", err) - case types.CapGetFollowers: - followers, err := ts.GetFollowers(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults) - return processResponse(followers, "", err) case types.CapGetSpace: space, err := ts.GetSpace(j, ts.configuration.DataDir, jobArgs.Query) return processResponse(space, "", err) + case types.CapGetProfile: + profile, err := ts.SearchByProfile(j, ts.configuration.DataDir, jobArgs.Query) + return processResponse(profile, "", err) + case types.CapGetTweets: + tweets, nextCursor, err := ts.GetUserTweets(j, ts.configuration.DataDir, jobArgs.Query, jobArgs.MaxResults, jobArgs.NextCursor) + return processResponse(tweets, nextCursor, err) + + default: + return types.JobResult{Error: fmt.Sprintf("unsupported capability: %s", capability)}, fmt.Errorf("unsupported capability: %s", capability) } - return types.JobResult{Error: "invalid search type in defaultStrategyFallback: " + string(jobArgs.QueryType)}, fmt.Errorf("invalid search type: %s", 
jobArgs.QueryType) } -// ExecuteJob runs a job using the appropriate scrape strategy based on the job type. +func processResponse(response any, nextCursor string, err error) (types.JobResult, error) { + if err != nil { + logrus.Debugf("Processing response with error: %v, NextCursor: %s", err, nextCursor) + return types.JobResult{Error: err.Error(), NextCursor: nextCursor}, err + } + dat, marshalErr := json.Marshal(response) + if marshalErr != nil { + logrus.Errorf("Error marshalling response: %v", marshalErr) + return types.JobResult{Error: marshalErr.Error()}, marshalErr + } + return types.JobResult{Data: dat, NextCursor: nextCursor}, nil +} + +// ExecuteJob runs a Twitter job using capability-based routing. // It first unmarshals the job arguments using the centralized type-safe unmarshaller. -// Then it runs the appropriate scrape strategy's Execute method, passing in the job, TwitterScraper, and job arguments. -// If the result is empty, it returns an error. -// If the result is not empty, it unmarshals the result into a slice of TweetResult and returns the result. -// If the unmarshaling fails, it returns an error. -// If the unmarshaled result is empty, it returns an error. +// Then it routes to the appropriate method based on the capability. func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) { - // Use the centralized unmarshaller from tee-types - this addresses the TODO comment! + // Use the centralized unmarshaller from tee-types jobArgs, err := args.UnmarshalJobArguments(types.JobType(j.Type), map[string]any(j.Arguments)) if err != nil { logrus.Errorf("Error while unmarshalling job arguments for job ID %s, type %s: %v", j.UUID, j.Type, err) @@ -1331,7 +715,7 @@ func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) { } // Type assert to Twitter arguments - args, ok := jobArgs.(*args.TwitterSearchArguments) + args, ok := jobArgs.(*twitterargs.Search) if !ok { logrus.Errorf("Expected Twitter arguments for job ID %s, type %s", j.UUID, j.Type) return types.JobResult{Error: "invalid argument type for Twitter job"}, fmt.Errorf("invalid argument type") @@ -1340,9 +724,8 @@ func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) { // Log the capability for debugging logrus.Debugf("Executing Twitter job ID %s with capability: %s", j.UUID, args.GetCapability()) - strategy := getScrapeStrategy(j.Type) - - jobResult, err := strategy.Execute(j, ts, args) + // Route based on capability + jobResult, err := ts.executeCapability(j, args) if err != nil { logrus.Errorf("Error executing job ID %s, type %s: %v", j.UUID, j.Type, err) return types.JobResult{Error: "error executing job"}, err @@ -1354,6 +737,7 @@ func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: "job result data is empty"}, fmt.Errorf("job result data is empty") } + // Validate the result based on operation type switch { case args.IsSingleTweetOperation(): var result *types.TweetResult diff --git a/internal/jobs/twitter_test.go b/internal/jobs/twitter_test.go index 008c674..b5fb730 100644 --- a/internal/jobs/twitter_test.go +++ b/internal/jobs/twitter_test.go @@ -7,7 +7,6 @@ import ( "strings" "time" - . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "github.com/sirupsen/logrus" @@ -105,7 +104,7 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapSearchByQuery, "query": "NASA", @@ -130,7 +129,7 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: types.TwitterApiJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapSearchByQuery, "query": "NASA", @@ -156,7 +155,7 @@ var _ = Describe("Twitter Scraper", func() { }, statsCollector) // Try to run credential-only job with only API key res, err := scraper.ExecuteJob(types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapSearchByQuery, "query": "NASA", @@ -199,7 +198,7 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: types.TwitterApiJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapSearchByQuery, "query": "NASA", @@ -220,7 +219,7 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: types.TwitterApiJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapSearchByFullArchive, "query": "NASA", @@ -275,7 +274,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapSearchByProfile, "query": "NASA_Marshall", @@ -325,7 +324,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetReplies, "query": "1234567890", @@ -354,7 +353,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetRetweeters, "query": "1234567890", @@ -384,7 +383,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetTweets, "query": "NASA", @@ -414,7 +413,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } res, err := twitterScraper.ExecuteJob(types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetMedia, "query": "NASA", @@ -432,72 +431,12 @@ var _ = Describe("Twitter Scraper", func() { Expect(len(media[0].Photos) + len(media[0].Videos)).ToNot(BeZero()) }) - It("should fetch home tweets", func() { - if len(twitterAccounts) == 0 { - Skip("TWITTER_ACCOUNTS is not set") - } - j := types.Job{ - Type: types.TwitterCredentialJob, - Arguments: map[string]interface{}{ - "type": types.CapGetHomeTweets, - "max_results": 5, - }, - Timeout: 10 * time.Second, - } - res, err := twitterScraper.ExecuteJob(j) - Expect(err).NotTo(HaveOccurred()) - Expect(res.Error).To(BeEmpty()) - - var tweets []*types.TweetResult - err = res.Unmarshal(&tweets) - 
Expect(err).NotTo(HaveOccurred()) - Expect(len(tweets)).ToNot(BeZero()) - Expect(tweets[0].Text).ToNot(BeEmpty()) - - // Wait briefly for asynchronous stats processing to complete - time.Sleep(100 * time.Millisecond) - - Expect(statsCollector.Stats.Stats[j.WorkerID][stats.TwitterScrapes]).To(BeNumerically("==", 1)) - Expect(statsCollector.Stats.Stats[j.WorkerID][stats.TwitterTweets]).To(BeNumerically("==", uint(len(tweets)))) - }) - - It("should fetch for you tweets", func() { - if len(twitterAccounts) == 0 { - Skip("TWITTER_ACCOUNTS is not set") - } - j := types.Job{ - Type: types.TwitterCredentialJob, - Arguments: map[string]interface{}{ - "type": types.CapGetForYouTweets, - "max_results": 5, - }, - Timeout: 10 * time.Second, - } - res, err := twitterScraper.ExecuteJob(j) - - Expect(err).NotTo(HaveOccurred()) - Expect(res.Error).To(BeEmpty()) - - var tweets []*types.TweetResult - err = res.Unmarshal(&tweets) - Expect(err).NotTo(HaveOccurred()) - Expect(len(tweets)).ToNot(BeZero()) - Expect(tweets).ToNot(BeEmpty()) - Expect(tweets[0].Text).ToNot(BeEmpty()) - - // Wait briefly for asynchronous stats processing to complete - time.Sleep(100 * time.Millisecond) - - Expect(statsCollector.Stats.Stats[j.WorkerID][stats.TwitterScrapes]).To(BeNumerically("==", 1)) - Expect(statsCollector.Stats.Stats[j.WorkerID][stats.TwitterTweets]).To(BeNumerically("==", uint(len(tweets)))) - }) - It("should fetch profile by ID", func() { if len(twitterAccounts) == 0 { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetProfileById, "query": "44196397", // Elon Musk's Twitter ID @@ -525,7 +464,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetFollowing, "query": "NASA", @@ -555,7 +494,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetFollowers, "query": "NASA", @@ -585,7 +524,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("TWITTER_ACCOUNTS is not set") } j := types.Job{ - Type: types.TwitterCredentialJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetTrends, }, @@ -613,7 +552,7 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: types.TwitterApiJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetById, "query": "1881258110712492142", @@ -655,7 +594,7 @@ var _ = Describe("Twitter Scraper", func() { "data_dir": tempDir, }, statsCollector) res, err := scraper.ExecuteJob(types.Job{ - Type: types.TwitterApiJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapGetProfileById, "query": "44196397", // Elon Musk's Twitter ID @@ -734,7 +673,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("Needs full archive key in TWITTER_API_KEYS to run") j := types.Job{ - Type: types.TwitterApiJob, + Type: types.TwitterJob, Arguments: map[string]interface{}{ "type": types.CapSearchByFullArchive, "query": "AI", @@ -763,7 +702,7 @@ var _ = Describe("Twitter Scraper", func() { Skip("Needs full archive key (elevated) in TWITTER_API_KEYS to run") j := types.Job{ - Type: 
types.TwitterCredentialJob,
+			Type: types.TwitterJob,
 			Arguments: map[string]interface{}{
 				"type":        types.CapSearchByFullArchive,
 				"query":       "#AI",
@@ -798,7 +737,7 @@ var _ = Describe("Twitter Scraper", func() {
 		}, statsCollector)
 
 		j := types.Job{
-			Type: types.TwitterApifyJob,
+			Type: types.TwitterJob,
 			Arguments: map[string]interface{}{
 				"type":        types.CapGetFollowers,
 				"query":       "elonmusk",
@@ -829,7 +768,7 @@ var _ = Describe("Twitter Scraper", func() {
 		}, statsCollector)
 
 		j := types.Job{
-			Type: types.TwitterApifyJob,
+			Type: types.TwitterJob,
 			Arguments: map[string]interface{}{
 				"type":        types.CapGetFollowing,
 				"query":       "elonmusk",
@@ -913,7 +852,7 @@ var _ = Describe("Twitter Scraper", func() {
 	It("should handle invalid capability for job type", func() {
 		res, err := twitterScraper.ExecuteJob(types.Job{
-			Type: types.TwitterApiJob, // API job type
+			Type: types.TwitterJob, // unified job type
 			Arguments: map[string]interface{}{
 				"type":  "invalidcapability", // Invalid capability
 				"query": "test",
@@ -927,9 +866,9 @@
 
 	It("should handle capability not available for specific job type", func() {
 		res, err := twitterScraper.ExecuteJob(types.Job{
-			Type: types.TwitterApiJob, // API job type - doesn't support getfollowers
+			Type: types.TwitterJob, // unified job type (was TwitterApiJob, which did not support getfollowers)
 			Arguments: map[string]interface{}{
-				"type":  types.CapGetFollowers, // Valid capability but not for TwitterApiJob
+				"type":  types.CapGetFollowers, // Valid capability, but unavailable in this test's configuration
 				"query": "test",
 			},
 			Timeout: 10 * time.Second,
diff --git a/internal/jobs/web.go b/internal/jobs/web.go
index 9cbc47e..72f91a0 100644
--- a/internal/jobs/web.go
+++ b/internal/jobs/web.go
@@ -8,6 +8,9 @@ import (
 	"github.com/sirupsen/logrus"
 
 	"github.com/masa-finance/tee-worker/api/args"
+	"github.com/masa-finance/tee-worker/api/args/llm"
+	"github.com/masa-finance/tee-worker/api/args/llm/process"
+	"github.com/masa-finance/tee-worker/api/args/web"
 	"github.com/masa-finance/tee-worker/api/types"
 	"github.com/masa-finance/tee-worker/internal/config"
 	"github.com/masa-finance/tee-worker/internal/jobs/llmapify"
@@ -20,7 +23,7 @@ import (
 
 // WebApifyClient defines the interface for the Web Apify client to allow mocking in tests
 type WebApifyClient interface {
-	Scrape(workerID string, args args.WebArguments, cursor client.Cursor) ([]*types.WebScraperResult, string, client.Cursor, error)
+	Scrape(workerID string, args web.Page, cursor client.Cursor) ([]*types.WebScraperResult, string, client.Cursor, error)
 }
 
// NewWebApifyClient is a function variable that can be replaced in tests.
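// The function variable documented above is the seam that lets tests swap in
// a mock client. A minimal, hypothetical sketch of that substitution follows;
// the constructor's exact signature is assumed from nearby hunks, not copied
// from this patch:
func useMockWebClient(mock WebApifyClient) (restore func()) {
	orig := NewWebApifyClient
	// Hand back the injected mock instead of constructing a real Apify client.
	NewWebApifyClient = func(_ string, _ *stats.StatsCollector) (WebApifyClient, error) {
		return mock, nil
	}
	// The caller defers the returned func to undo the swap after the test.
	return func() { NewWebApifyClient = orig }
}
// A test would run `defer useMockWebClient(&MockWebApifyClient{})()` before
// invoking WebScraper.ExecuteJob, matching the web_test.go changes below.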
@@ -32,7 +35,7 @@ var NewWebApifyClient = func(apiKey string, statsCollector *stats.StatsCollector // LLMApify is the interface for the LLM processor client // Only the Process method is required for this flow type LLMApify interface { - Process(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) + Process(workerID string, args llm.Process, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) } // NewLLMApifyClient is a function variable to allow injection in tests @@ -71,7 +74,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: msg.Error()}, msg } - webArgs, ok := jobArgs.(*args.WebArguments) + webArgs, ok := jobArgs.(*web.Page) if !ok { return types.JobResult{Error: "invalid argument type for Web job"}, errors.New("invalid argument type") } @@ -97,11 +100,11 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { return types.JobResult{Error: "error creating LLM Apify client"}, fmt.Errorf("failed to create LLM Apify client: %w", err) } - llmArgs := args.LLMProcessorArguments{ + llmArgs := llm.Process{ DatasetId: datasetId, Prompt: "summarize the content of this webpage, focusing on keywords and topics: ${markdown}", - MaxTokens: args.LLMDefaultMaxTokens, - Temperature: args.LLMDefaultTemperature, + MaxTokens: process.DefaultMaxTokens, + Temperature: process.DefaultTemperature, Items: uint(len(webResp)), } llmResp, _, llmErr := llmClient.Process(j.WorkerID, llmArgs, client.EmptyCursor) @@ -131,15 +134,3 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { NextCursor: cursor.String(), }, nil } - -// GetStructuredCapabilities returns the structured capabilities supported by the Web scraper -// based on the available credentials and API keys -func (ws *WebScraper) GetStructuredCapabilities() types.WorkerCapabilities { - capabilities := make(types.WorkerCapabilities) - - if ws.configuration.ApifyApiKey != "" && ws.configuration.GeminiApiKey.IsValid() { - capabilities[types.WebJob] = types.WebCaps - } - - return capabilities -} diff --git a/internal/jobs/web_test.go b/internal/jobs/web_test.go index fc8ab79..db870ac 100644 --- a/internal/jobs/web_test.go +++ b/internal/jobs/web_test.go @@ -8,7 +8,8 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/llm" + "github.com/masa-finance/tee-worker/api/args/web" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs" @@ -20,10 +21,10 @@ import ( // MockWebApifyClient is a mock implementation of the WebApifyClient. 
type MockWebApifyClient struct { - ScrapeFunc func(args args.WebArguments) ([]*types.WebScraperResult, string, client.Cursor, error) + ScrapeFunc func(args web.Page) ([]*types.WebScraperResult, string, client.Cursor, error) } -func (m *MockWebApifyClient) Scrape(_ string, args args.WebArguments, _ client.Cursor) ([]*types.WebScraperResult, string, client.Cursor, error) { +func (m *MockWebApifyClient) Scrape(_ string, args web.Page, _ client.Cursor) ([]*types.WebScraperResult, string, client.Cursor, error) { if m != nil && m.ScrapeFunc != nil { res, datasetId, next, err := m.ScrapeFunc(args) return res, datasetId, next, err @@ -34,10 +35,10 @@ func (m *MockWebApifyClient) Scrape(_ string, args args.WebArguments, _ client.C // MockLLMApifyClient is a mock implementation of the LLMApify interface // used to prevent external calls during unit tests. type MockLLMApifyClient struct { - ProcessFunc func(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) + ProcessFunc func(workerID string, args llm.Process, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) } -func (m *MockLLMApifyClient) Process(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) { +func (m *MockLLMApifyClient) Process(workerID string, args llm.Process, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) { if m != nil && m.ProcessFunc != nil { return m.ProcessFunc(workerID, args, cursor) } @@ -66,7 +67,7 @@ var _ = Describe("WebScraper", func() { scraper = jobs.NewWebScraper(cfg, statsCollector) mockClient = &MockWebApifyClient{} mockLLM = &MockLLMApifyClient{ - ProcessFunc: func(workerID string, args args.LLMProcessorArguments, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) { + ProcessFunc: func(workerID string, args llm.Process, cursor client.Cursor) ([]*types.LLMProcessorResult, client.Cursor, error) { // Return a single empty summary to avoid changing expectations return []*types.LLMProcessorResult{{LLMResponse: ""}}, client.EmptyCursor, nil }, @@ -107,7 +108,7 @@ var _ = Describe("WebScraper", func() { "max_pages": 2, } - mockClient.ScrapeFunc = func(args args.WebArguments) ([]*types.WebScraperResult, string, client.Cursor, error) { + mockClient.ScrapeFunc = func(args web.Page) ([]*types.WebScraperResult, string, client.Cursor, error) { Expect(args.URL).To(Equal("https://example.com")) return []*types.WebScraperResult{{URL: "https://example.com", Markdown: "# Hello"}}, "dataset-123", client.Cursor("next-cursor"), nil } @@ -133,7 +134,7 @@ var _ = Describe("WebScraper", func() { } expectedErr := errors.New("client error") - mockClient.ScrapeFunc = func(args args.WebArguments) ([]*types.WebScraperResult, string, client.Cursor, error) { + mockClient.ScrapeFunc = func(args web.Page) ([]*types.WebScraperResult, string, client.Cursor, error) { return nil, "", client.EmptyCursor, expectedErr } @@ -228,23 +229,5 @@ var _ = Describe("WebScraper", func() { Expect(resp[i].Text).To(ContainSubstring("Bittensor")) } }) - - It("should expose capabilities only when both APIFY and GEMINI keys are present", func() { - cfg := config.JobConfiguration{ - "apify_api_key": apifyKey, - "gemini_api_key": geminiKey, - } - integrationStatsCollector := stats.StartCollector(128, cfg) - integrationScraper := jobs.NewWebScraper(cfg, integrationStatsCollector) - - caps := integrationScraper.GetStructuredCapabilities() - if apifyKey != "" && 
geminiKey != "" { - Expect(caps[types.WebJob]).NotTo(BeEmpty()) - } else { - // Expect no capabilities when either key is missing - _, ok := caps[types.WebJob] - Expect(ok).To(BeFalse()) - } - }) }) }) diff --git a/internal/jobs/webapify/client.go b/internal/jobs/webapify/client.go index 4d61666..729b731 100644 --- a/internal/jobs/webapify/client.go +++ b/internal/jobs/webapify/client.go @@ -4,7 +4,7 @@ import ( "encoding/json" "fmt" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/web" "github.com/masa-finance/tee-worker/api/types" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/stats" @@ -41,12 +41,12 @@ func (c *ApifyClient) ValidateApiKey() error { return c.client.ValidateApiKey() } -func (c *ApifyClient) Scrape(workerID string, args args.WebArguments, cursor client.Cursor) ([]*types.WebScraperResult, string, client.Cursor, error) { +func (c *ApifyClient) Scrape(workerID string, args web.Page, cursor client.Cursor) ([]*types.WebScraperResult, string, client.Cursor, error) { if c.statsCollector != nil { c.statsCollector.Add(workerID, stats.WebQueries, 1) } - input := args.ToWebScraperRequest() + input := args.ToScraperRequest() limit := uint(args.MaxPages) dataset, nextCursor, err := c.client.RunActorAndGetResponse(apify.ActorIds.WebScraper, input, cursor, limit) diff --git a/internal/jobs/webapify/client_test.go b/internal/jobs/webapify/client_test.go index 32d5ff0..7380c57 100644 --- a/internal/jobs/webapify/client_test.go +++ b/internal/jobs/webapify/client_test.go @@ -8,7 +8,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/web" "github.com/masa-finance/tee-worker/internal/apify" "github.com/masa-finance/tee-worker/internal/jobs/webapify" "github.com/masa-finance/tee-worker/pkg/client" @@ -65,7 +65,7 @@ var _ = Describe("WebApifyClient", func() { Describe("Scrape", func() { It("should construct the correct actor input", func() { - args := args.WebArguments{ + args := web.Page{ URL: "https://example.com", MaxDepth: 1, MaxPages: 2, @@ -87,7 +87,7 @@ var _ = Describe("WebApifyClient", func() { return nil, "", expectedErr } - args := args.WebArguments{ + args := web.Page{ URL: "https://example.com", MaxDepth: 0, MaxPages: 1, @@ -107,7 +107,7 @@ var _ = Describe("WebApifyClient", func() { return dataset, "next", nil } - args := args.WebArguments{ + args := web.Page{ URL: "https://example.com", MaxDepth: 0, MaxPages: 1, @@ -132,7 +132,7 @@ var _ = Describe("WebApifyClient", func() { return dataset, "next", nil } - args := args.WebArguments{ + args := web.Page{ URL: "https://example.com", MaxDepth: 0, MaxPages: 1, @@ -193,7 +193,7 @@ var _ = Describe("WebApifyClient", func() { realClient, err := webapify.NewClient(apifyKey, nil) Expect(err).NotTo(HaveOccurred()) - args := args.WebArguments{ + args := web.Page{ URL: "https://example.com", MaxDepth: 0, MaxPages: 1, diff --git a/internal/jobserver/jobserver.go b/internal/jobserver/jobserver.go index 17de567..3bb3a09 100644 --- a/internal/jobserver/jobserver.go +++ b/internal/jobserver/jobserver.go @@ -8,10 +8,10 @@ import ( "sync" "github.com/sirupsen/logrus" - "golang.org/x/exp/maps" "github.com/google/uuid" "github.com/masa-finance/tee-worker/api/types" + "github.com/masa-finance/tee-worker/internal/capabilities" "github.com/masa-finance/tee-worker/internal/config" "github.com/masa-finance/tee-worker/internal/jobs" 
"github.com/masa-finance/tee-worker/internal/jobs/stats" @@ -86,15 +86,6 @@ func NewJobServer(workers int, jc config.JobConfiguration) *JobServer { types.TwitterJob: { w: jobs.NewTwitterScraper(jc, s), }, - types.TwitterCredentialJob: { - w: jobs.NewTwitterScraper(jc, s), // Uses the same implementation as standard Twitter scraper - }, - types.TwitterApiJob: { - w: jobs.NewTwitterScraper(jc, s), // Uses the same implementation as standard Twitter scraper - }, - types.TwitterApifyJob: { - w: jobs.NewTwitterScraper(jc, s), // Register Apify job type with Twitter scraper - }, types.TiktokJob: { w: jobs.NewTikTokScraper(jc, s), }, @@ -153,31 +144,11 @@ func NewJobServer(workers int, jc config.JobConfiguration) *JobServer { return js } -// GetWorkerCapabilities returns the structured capabilities for all registered workers +// GetWorkerCapabilities returns the structured capabilities using centralized detection func (js *JobServer) GetWorkerCapabilities() types.WorkerCapabilities { - // Use a map to deduplicate capabilities by job type - jobTypeCapMap := make(map[types.JobType]map[types.Capability]struct{}) - - for _, workerEntry := range js.jobWorkers { - workerCapabilities := workerEntry.w.GetStructuredCapabilities() - for jobType, capabilities := range workerCapabilities { - if _, exists := jobTypeCapMap[jobType]; !exists { - jobTypeCapMap[jobType] = make(map[types.Capability]struct{}) - } - for _, capability := range capabilities { - jobTypeCapMap[jobType][capability] = struct{}{} - } - } - } - - // Convert to final map format - allCapabilities := make(types.WorkerCapabilities) - for jobType, capabilitySet := range jobTypeCapMap { - capabilities := maps.Keys(capabilitySet) - allCapabilities[jobType] = capabilities - } - - return allCapabilities + // Use centralized capability detection instead of aggregating from individual workers + // This ensures consistent, real capability detection across all job types + return capabilities.DetectCapabilities(js.jobConfiguration, js) } func (js *JobServer) Run(ctx context.Context) { diff --git a/internal/jobserver/worker.go b/internal/jobserver/worker.go index cca9fa4..3d19edb 100644 --- a/internal/jobserver/worker.go +++ b/internal/jobserver/worker.go @@ -25,7 +25,6 @@ func (js *JobServer) worker(c context.Context) { } type worker interface { - GetStructuredCapabilities() types.WorkerCapabilities ExecuteJob(j types.Job) (types.JobResult, error) } From 5e10b9816de10248299855aafcd0fd12920554b6 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 15 Oct 2025 20:44:34 +0200 Subject: [PATCH 119/136] fix: tests --- internal/capabilities/detector.go | 71 +++++++++++++--------- internal/capabilities/detector_test.go | 4 +- internal/jobs/linkedinapify/client_test.go | 39 ++++++------ internal/jobs/llmapify/client_test.go | 53 +++++++--------- 4 files changed, 84 insertions(+), 83 deletions(-) diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index 707f3f6..9c7da40 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -43,38 +43,51 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface hasLLMKey := geminiApiKey.IsValid() || claudeApiKey.IsValid() // Add Twitter capabilities based on available authentication - if hasAccounts || hasApiKeys { - var twitterCaps []types.Capability - - // Add credential-based capabilities if we have accounts - if hasAccounts { - twitterCaps = append(twitterCaps, - types.CapSearchByQuery, - types.CapSearchByProfile, - types.CapGetById, 
- types.CapGetReplies, - types.CapGetRetweeters, - types.CapGetMedia, - types.CapGetProfileById, - types.CapGetTrends, - types.CapGetSpace, - types.CapGetProfile, - types.CapGetTweets, - ) - } + var twitterCaps []types.Capability + + // Add credential-based capabilities if we have accounts + if hasAccounts { + twitterCaps = append(twitterCaps, + types.CapSearchByQuery, + types.CapSearchByProfile, + types.CapGetById, + types.CapGetReplies, + types.CapGetRetweeters, + types.CapGetMedia, + types.CapGetProfileById, + types.CapGetTrends, + types.CapGetSpace, + types.CapGetProfile, + types.CapGetTweets, + ) + } - // Add API-based capabilities if we have API keys - if hasApiKeys { - // Check for elevated API capabilities - if hasElevatedApiKey(apiKeys) { - twitterCaps = append(twitterCaps, types.CapSearchByFullArchive) - } + // Add API-based capabilities if we have API keys + if hasApiKeys { + // Add basic API capabilities for any valid API key + twitterCaps = append(twitterCaps, + types.CapSearchByQuery, + types.CapSearchByProfile, + types.CapGetById, + types.CapGetReplies, + types.CapGetRetweeters, + types.CapGetMedia, + types.CapGetProfileById, + types.CapGetTrends, + types.CapGetSpace, + types.CapGetProfile, + types.CapGetTweets, + ) + + // Check for elevated API capabilities + if hasElevatedApiKey(apiKeys) { + twitterCaps = append(twitterCaps, types.CapSearchByFullArchive) } + } - // Only add capabilities if we have any supported capabilities - if len(twitterCaps) > 0 { - capabilities[types.TwitterJob] = twitterCaps - } + // Only add capabilities if we have any supported capabilities + if len(twitterCaps) > 0 { + capabilities[types.TwitterJob] = twitterCaps } if hasApifyKey { diff --git a/internal/capabilities/detector_test.go b/internal/capabilities/detector_test.go index 8febcfc..c1d8785 100644 --- a/internal/capabilities/detector_test.go +++ b/internal/capabilities/detector_test.go @@ -107,13 +107,13 @@ var _ = Describe("DetectCapabilities", func() { config.JobConfiguration{ "twitter_accounts": []string{"user1:pass1"}, }, - []string{"telemetry", "tiktok", "twitter", "twitter-credential"}, + []string{"telemetry", "tiktok", "twitter"}, ), Entry("With Twitter API keys", config.JobConfiguration{ "twitter_api_keys": []string{"key1"}, }, - []string{"telemetry", "tiktok", "twitter", "twitter-api"}, + []string{"telemetry", "tiktok", "twitter"}, ), ) }) diff --git a/internal/jobs/linkedinapify/client_test.go b/internal/jobs/linkedinapify/client_test.go index 3b56f1b..9bf2681 100644 --- a/internal/jobs/linkedinapify/client_test.go +++ b/internal/jobs/linkedinapify/client_test.go @@ -70,10 +70,9 @@ var _ = Describe("LinkedInApifyClient", func() { Describe("SearchProfiles", func() { It("should construct the correct actor input", func() { - args := profileArgs.Arguments{ - Query: "software engineer", - MaxItems: 10, - } + args := profileArgs.NewArguments() + args.Query = "software engineer" + args.MaxItems = 10 mockClient.RunActorAndGetResponseFunc = func(actorID apify.ActorId, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { Expect(actorID).To(Equal(apify.ActorIds.LinkedInSearchProfile)) @@ -97,10 +96,9 @@ var _ = Describe("LinkedInApifyClient", func() { return nil, "", expectedErr } - args := profileArgs.Arguments{ - Query: "test query", - MaxItems: 5, - } + args := profileArgs.NewArguments() + args.Query = "test query" + args.MaxItems = 5 _, _, _, err := linkedinClient.SearchProfiles("test-worker", &args, client.EmptyCursor) 
Expect(err).To(MatchError(expectedErr)) }) @@ -116,10 +114,9 @@ var _ = Describe("LinkedInApifyClient", func() { return dataset, "next", nil } - args := profileArgs.Arguments{ - Query: "test query", - MaxItems: 1, - } + args := profileArgs.NewArguments() + args.Query = "test query" + args.MaxItems = 1 results, _, _, err := linkedinClient.SearchProfiles("test-worker", &args, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(results).To(BeEmpty()) // The invalid item should be skipped @@ -147,10 +144,9 @@ var _ = Describe("LinkedInApifyClient", func() { return dataset, "next", nil } - args := profileArgs.Arguments{ - Query: "test query", - MaxItems: 2, - } + args := profileArgs.NewArguments() + args.Query = "test query" + args.MaxItems = 2 results, _, _, err := linkedinClient.SearchProfiles("test-worker", &args, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(results).To(HaveLen(2)) @@ -206,12 +202,11 @@ var _ = Describe("LinkedInApifyClient", func() { realClient, err := linkedinapify.NewClient(apifyKey, statsCollector) Expect(err).NotTo(HaveOccurred()) - args := profileArgs.Arguments{ - Type: types.CapSearchByProfile, - Query: "software engineer", - MaxItems: 1, - ScraperMode: profile.ScraperModeShort, - } + args := profileArgs.NewArguments() + args.Type = types.CapSearchByProfile + args.Query = "software engineer" + args.MaxItems = 1 + args.ScraperMode = profile.ScraperModeShort results, datasetId, cursor, err := realClient.SearchProfiles("test-worker", &args, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) diff --git a/internal/jobs/llmapify/client_test.go b/internal/jobs/llmapify/client_test.go index 292788e..69497c7 100644 --- a/internal/jobs/llmapify/client_test.go +++ b/internal/jobs/llmapify/client_test.go @@ -67,10 +67,9 @@ var _ = Describe("LLMApifyClient", func() { Describe("Process", func() { It("should construct the correct actor input", func() { - llmArgs := process.Arguments{ - DatasetId: "test-dataset-id", - Prompt: "test-prompt", - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "test-dataset-id" + llmArgs.Prompt = "test-prompt" // Marshal and unmarshal to apply defaults jsonData, err := json.Marshal(llmArgs) @@ -106,10 +105,9 @@ var _ = Describe("LLMApifyClient", func() { return nil, "", expectedErr } - llmArgs := process.Arguments{ - DatasetId: "test-dataset-id", - Prompt: "test-prompt", - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "test-dataset-id" + llmArgs.Prompt = "test-prompt" _, _, err := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).To(MatchError(expectedErr)) }) @@ -125,10 +123,9 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - llmArgs := process.Arguments{ - DatasetId: "test-dataset-id", - Prompt: "test-prompt", - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "test-dataset-id" + llmArgs.Prompt = "test-prompt" results, _, err := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(results).To(BeEmpty()) // The invalid item should be skipped @@ -147,10 +144,9 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - llmArgs := process.Arguments{ - DatasetId: "test-dataset-id", - Prompt: "test-prompt", - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "test-dataset-id" + llmArgs.Prompt = "test-prompt" results, cursor, err := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) 
Expect(cursor).To(Equal(client.Cursor("next"))) @@ -174,10 +170,9 @@ var _ = Describe("LLMApifyClient", func() { return dataset, "next", nil } - llmArgs := process.Arguments{ - DatasetId: "test-dataset-id", - Prompt: "test-prompt", - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "test-dataset-id" + llmArgs.Prompt = "test-prompt" results, _, err := llmClient.Process("test-worker", llmArgs, client.EmptyCursor) Expect(err).NotTo(HaveOccurred()) Expect(results).To(HaveLen(2)) @@ -186,12 +181,11 @@ var _ = Describe("LLMApifyClient", func() { }) It("should use custom values when provided", func() { - llmArgs := process.Arguments{ - DatasetId: "test-dataset-id", - Prompt: "test-prompt", - MaxTokens: 500, - Temperature: 0.5, - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "test-dataset-id" + llmArgs.Prompt = "test-prompt" + llmArgs.MaxTokens = 500 + llmArgs.Temperature = 0.5 mockClient.RunActorAndGetResponseFunc = func(actorID apify.ActorId, input any, cursor client.Cursor, limit uint) (*client.DatasetResponse, client.Cursor, error) { request, ok := input.(types.LLMProcessorRequest) @@ -257,10 +251,9 @@ var _ = Describe("LLMApifyClient", func() { realClient, err := llmapify.NewClient(apifyKey, config.LlmConfig{GeminiApiKey: config.LlmApiKey(geminiKey)}, nil) Expect(err).NotTo(HaveOccurred()) - llmArgs := process.Arguments{ - DatasetId: "V6tyuuZIgfiETl1cl", - Prompt: "summarize the content of this webpage ${markdown}", - } + llmArgs := process.NewArguments() + llmArgs.DatasetId = "V6tyuuZIgfiETl1cl" + llmArgs.Prompt = "summarize the content of this webpage ${markdown}" // Marshal and unmarshal to apply defaults jsonData, err := json.Marshal(llmArgs) Expect(err).ToNot(HaveOccurred()) From 248d94f5592515f06f9b50059344741e15f5780f Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 15 Oct 2025 20:50:54 +0200 Subject: [PATCH 120/136] fix: linkedin test --- internal/jobs/linkedin_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/jobs/linkedin_test.go b/internal/jobs/linkedin_test.go index 91cfa6a..46d002f 100644 --- a/internal/jobs/linkedin_test.go +++ b/internal/jobs/linkedin_test.go @@ -78,9 +78,9 @@ var _ = Describe("LinkedInScraper", func() { Context("ExecuteJob", func() { It("should return an error for invalid arguments", func() { job.Arguments = map[string]any{"invalid": "args"} - result, err := scraper.ExecuteJob(job) + _, err := scraper.ExecuteJob(job) Expect(err).To(HaveOccurred()) - Expect(result.Error).To(ContainSubstring("failed to unmarshal job arguments")) + Expect(errors.Is(err, profileArgs.ErrUnmarshalling)).To(BeTrue()) }) It("should return an error when Apify API key is missing", func() { From c318467c35be47654b05e834a12a11f35eafb2ba Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 15 Oct 2025 20:54:01 +0200 Subject: [PATCH 121/136] fix: test --- internal/jobs/linkedin_test.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/internal/jobs/linkedin_test.go b/internal/jobs/linkedin_test.go index 46d002f..1068830 100644 --- a/internal/jobs/linkedin_test.go +++ b/internal/jobs/linkedin_test.go @@ -76,13 +76,6 @@ var _ = Describe("LinkedInScraper", func() { }) Context("ExecuteJob", func() { - It("should return an error for invalid arguments", func() { - job.Arguments = map[string]any{"invalid": "args"} - _, err := scraper.ExecuteJob(job) - Expect(err).To(HaveOccurred()) - Expect(errors.Is(err, profileArgs.ErrUnmarshalling)).To(BeTrue()) - }) - It("should return an error when Apify API key is missing", func() { 
cfg := config.JobConfiguration{} scraper = jobs.NewLinkedInScraper(cfg, statsCollector) From f78986a777e3dc90f1a82350ae6906290ea89936 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Wed, 15 Oct 2025 21:08:38 +0200 Subject: [PATCH 122/136] fix: lint errors --- internal/jobs/linkedin.go | 4 ++-- internal/jobs/reddit.go | 2 +- internal/jobs/tiktok.go | 2 +- internal/jobs/twitter.go | 2 +- internal/jobs/web.go | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/jobs/linkedin.go b/internal/jobs/linkedin.go index afe7a91..8a111eb 100644 --- a/internal/jobs/linkedin.go +++ b/internal/jobs/linkedin.go @@ -51,7 +51,7 @@ func (ls *LinkedInScraper) ExecuteJob(j types.Job) (types.JobResult, error) { // Require Apify key for LinkedIn scraping apifyApiKey := ls.configuration.GetString("apify_api_key", "") if apifyApiKey == "" { - msg := errors.New("Apify API key is required for LinkedIn job") + msg := errors.New("apify API key is required for LinkedIn job") return types.JobResult{Error: msg.Error()}, msg } @@ -83,7 +83,7 @@ func (ls *LinkedInScraper) ExecuteJob(j types.Job) (types.JobResult, error) { data, err := json.Marshal(profiles) if err != nil { - return types.JobResult{Error: fmt.Sprintf("error marshalling LinkedIn response")}, fmt.Errorf("error marshalling LinkedIn response: %w", err) + return types.JobResult{Error: "error marshalling LinkedIn response"}, fmt.Errorf("error marshalling LinkedIn response: %w", err) } return types.JobResult{ diff --git a/internal/jobs/reddit.go b/internal/jobs/reddit.go index dc215da..4404288 100644 --- a/internal/jobs/reddit.go +++ b/internal/jobs/reddit.go @@ -110,7 +110,7 @@ func processRedditResponse(j types.Job, resp []*types.RedditResponse, cursor cli data, err := json.Marshal(resp) if err != nil { - return types.JobResult{Error: fmt.Sprintf("error marshalling Reddit response")}, fmt.Errorf("error marshalling Reddit response: %w", err) + return types.JobResult{Error: "error marshalling Reddit response"}, fmt.Errorf("error marshalling Reddit response: %w", err) } return types.JobResult{ Data: data, diff --git a/internal/jobs/tiktok.go b/internal/jobs/tiktok.go index 8d5bc05..a3c8b22 100644 --- a/internal/jobs/tiktok.go +++ b/internal/jobs/tiktok.go @@ -205,7 +205,7 @@ func (ttt *TikTokTranscriber) executeTranscription(j types.Job, a *transcription // Sub-Step 3.2: Extract Transcription and Metadata if len(parsedAPIResponse.Transcripts) == 0 { - errMsg := "No transcripts found in API response" + errMsg := "no transcripts found in API response" logrus.WithField("job_uuid", j.UUID).Warn(errMsg) ttt.stats.Add(j.WorkerID, stats.TikTokTranscriptionErrors, 1) // Or a different stat for "no_transcript_found" return types.JobResult{Error: errMsg}, errors.New(errMsg) diff --git a/internal/jobs/twitter.go b/internal/jobs/twitter.go index fe10bb2..a1fd422 100644 --- a/internal/jobs/twitter.go +++ b/internal/jobs/twitter.go @@ -732,7 +732,7 @@ func (ts *TwitterScraper) ExecuteJob(j types.Job) (types.JobResult, error) { } // Check if raw data is empty - if jobResult.Data == nil || len(jobResult.Data) == 0 { + if len(jobResult.Data) == 0 { logrus.Errorf("Job result data is empty for job ID %s, type %s", j.UUID, j.Type) return types.JobResult{Error: "job result data is empty"}, fmt.Errorf("job result data is empty") } diff --git a/internal/jobs/web.go b/internal/jobs/web.go index 72f91a0..df7cf37 100644 --- a/internal/jobs/web.go +++ b/internal/jobs/web.go @@ -64,7 +64,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { 
// Require Gemini key for LLM processing in Web flow if !w.configuration.GeminiApiKey.IsValid() { - msg := errors.New("Gemini API key is required for Web job") + msg := errors.New("gemini API key is required for Web job") return types.JobResult{Error: msg.Error()}, msg } @@ -121,7 +121,7 @@ func (w *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) { data, err := json.Marshal(webResp) if err != nil { - return types.JobResult{Error: fmt.Sprintf("error marshalling Web response")}, fmt.Errorf("error marshalling Web response: %w", err) + return types.JobResult{Error: "error marshalling Web response"}, fmt.Errorf("error marshalling Web response: %w", err) } if w.statsCollector != nil { From 31bdf65e039b026fcdaff5f100bac8c5bfafd3f0 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Thu, 16 Oct 2025 03:46:38 +0200 Subject: [PATCH 123/136] fix: test name --- internal/jobs/linkedin_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/jobs/linkedin_test.go b/internal/jobs/linkedin_test.go index 1068830..f7136b7 100644 --- a/internal/jobs/linkedin_test.go +++ b/internal/jobs/linkedin_test.go @@ -88,7 +88,7 @@ var _ = Describe("LinkedInScraper", func() { result, err := scraper.ExecuteJob(job) Expect(err).To(HaveOccurred()) - Expect(result.Error).To(ContainSubstring("Apify API key is required for LinkedIn job")) + Expect(result.Error).To(ContainSubstring("apify API key is required for LinkedIn job")) }) It("should call SearchProfiles and return data and next cursor", func() { From a4314f120d4629d7c9584bdf13266e2b09e0a51d Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Sun, 19 Oct 2025 21:52:34 +0200 Subject: [PATCH 124/136] chore: cleanup comments --- api/args/linkedin/profile/profile.go | 2 +- api/args/llm/process/process.go | 2 +- api/args/telemetry/telemetry.go | 2 +- api/args/twitter/search/search.go | 2 +- api/args/unmarshaller.go | 2 -- api/args/web/page/page.go | 2 +- 6 files changed, 5 insertions(+), 7 deletions(-) diff --git a/api/args/linkedin/profile/profile.go b/api/args/linkedin/profile/profile.go index 9d22df6..1128164 100644 --- a/api/args/linkedin/profile/profile.go +++ b/api/args/linkedin/profile/profile.go @@ -136,6 +136,6 @@ func (a *Arguments) ValidateCapability(jobType types.JobType) error { func NewArguments() Arguments { args := Arguments{} args.SetDefaultValues() - args.Validate() // This will set the default capability via ValidateCapability + args.Validate() return args } diff --git a/api/args/llm/process/process.go b/api/args/llm/process/process.go index 7d08d0d..372bced 100644 --- a/api/args/llm/process/process.go +++ b/api/args/llm/process/process.go @@ -84,7 +84,7 @@ func (l *Arguments) ValidateCapability(jobType types.JobType) error { func NewArguments() Arguments { args := Arguments{} args.SetDefaultValues() - args.Validate() // This will set the default capability via ValidateCapability + args.Validate() return args } diff --git a/api/args/telemetry/telemetry.go b/api/args/telemetry/telemetry.go index 4cc2eb6..81f8b44 100644 --- a/api/args/telemetry/telemetry.go +++ b/api/args/telemetry/telemetry.go @@ -58,6 +58,6 @@ func (t *Arguments) ValidateCapability(jobType types.JobType) error { func NewArguments() Arguments { args := Arguments{} args.SetDefaultValues() - args.Validate() // This will set the default capability via ValidateCapability + args.Validate() return args } diff --git a/api/args/twitter/search/search.go b/api/args/twitter/search/search.go index ef9643d..8310666 100644 --- a/api/args/twitter/search/search.go +++ 
b/api/args/twitter/search/search.go @@ -120,6 +120,6 @@ func (t *Arguments) IsTrendsOperation() bool { func NewArguments() Arguments { args := Arguments{} args.SetDefaultValues() - args.Validate() // This will set the default capability via ValidateCapability + args.Validate() return args } diff --git a/api/args/unmarshaller.go b/api/args/unmarshaller.go index a8bf65f..e77e90c 100644 --- a/api/args/unmarshaller.go +++ b/api/args/unmarshaller.go @@ -122,9 +122,7 @@ func unmarshalTelemetryArguments(args Args) (*telemetry.Telemetry, error) { } // unmarshalToStruct converts a map[string]any to a struct using JSON marshal/unmarshal -// This provides the same functionality as the existing JobArgument.Unmarshal methods func unmarshalToStruct(args Args, target any) error { - // Use JSON marshal/unmarshal for conversion - this triggers our custom UnmarshalJSON methods data, err := json.Marshal(args) if err != nil { return fmt.Errorf("%w: %w", ErrFailedToMarshal, err) diff --git a/api/args/web/page/page.go b/api/args/web/page/page.go index 8356a28..a5b8c4f 100644 --- a/api/args/web/page/page.go +++ b/api/args/web/page/page.go @@ -108,6 +108,6 @@ func (w Arguments) ToScraperRequest() types.WebScraperRequest { func NewArguments() Arguments { args := Arguments{} args.SetDefaultValues() - args.Validate() // This will set the default capability via ValidateCapability + args.Validate() return args } From fe1f66ae7510b6472d577bf7847fd8c4e157e9c6 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Sun, 19 Oct 2025 22:10:24 +0200 Subject: [PATCH 125/136] fix: detection --- internal/capabilities/detector.go | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/internal/capabilities/detector.go b/internal/capabilities/detector.go index 9c7da40..03346ac 100644 --- a/internal/capabilities/detector.go +++ b/internal/capabilities/detector.go @@ -64,21 +64,6 @@ func DetectCapabilities(jc config.JobConfiguration, jobServer JobServerInterface // Add API-based capabilities if we have API keys if hasApiKeys { - // Add basic API capabilities for any valid API key - twitterCaps = append(twitterCaps, - types.CapSearchByQuery, - types.CapSearchByProfile, - types.CapGetById, - types.CapGetReplies, - types.CapGetRetweeters, - types.CapGetMedia, - types.CapGetProfileById, - types.CapGetTrends, - types.CapGetSpace, - types.CapGetProfile, - types.CapGetTweets, - ) - // Check for elevated API capabilities if hasElevatedApiKey(apiKeys) { twitterCaps = append(twitterCaps, types.CapSearchByFullArchive) From b9a761535f0bb617556e95d472503b03818c722f Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Sun, 19 Oct 2025 22:51:55 +0200 Subject: [PATCH 126/136] fix: test --- internal/capabilities/detector_test.go | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/internal/capabilities/detector_test.go b/internal/capabilities/detector_test.go index c1d8785..82c017c 100644 --- a/internal/capabilities/detector_test.go +++ b/internal/capabilities/detector_test.go @@ -111,9 +111,9 @@ var _ = Describe("DetectCapabilities", func() { ), Entry("With Twitter API keys", config.JobConfiguration{ - "twitter_api_keys": []string{"key1"}, + "twitter_api_keys": []string{"key1"}, // Key not valid }, - []string{"telemetry", "tiktok", "twitter"}, + []string{"telemetry", "tiktok"}, ), ) }) @@ -175,9 +175,3 @@ var _ = Describe("DetectCapabilities", func() { }) }) }) - -// Helper function to check if a job type exists in capabilities -func hasJobType(capabilities types.WorkerCapabilities, jobName string) bool { - _, exists := 
capabilities[types.JobType(jobName)] - return exists -} From 8e9d35d28dcf363e976542ac8b76ca2a6bfa6a25 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Sun, 19 Oct 2025 23:34:44 +0200 Subject: [PATCH 127/136] fix: default twitter --- api/args/twitter/search/search.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/api/args/twitter/search/search.go b/api/args/twitter/search/search.go index 8310666..2f44b39 100644 --- a/api/args/twitter/search/search.go +++ b/api/args/twitter/search/search.go @@ -18,7 +18,8 @@ var ( ) const ( - MaxResults = 1000 + MaxResults = 1000 + DefaultMaxResults = 10 ) // Verify interface implementation @@ -48,7 +49,7 @@ func (t *Arguments) UnmarshalJSON(data []byte) error { // SetDefaultValues sets default values for the arguments func (t *Arguments) SetDefaultValues() { if t.MaxResults == 0 { - t.MaxResults = MaxResults + t.MaxResults = DefaultMaxResults } } @@ -104,9 +105,7 @@ func (t *Arguments) IsSingleProfileOperation() bool { func (t *Arguments) IsMultipleProfileOperation() bool { c := t.GetCapability() - return c == types.CapGetFollowing || - c == types.CapGetFollowers || - c == types.CapGetRetweeters + return c == types.CapGetRetweeters } func (t *Arguments) IsSingleSpaceOperation() bool { From 9ac4f66fe18856174fd56ab5b7ce89bffb1dc614 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Sun, 19 Oct 2025 23:47:26 +0200 Subject: [PATCH 128/136] chore: better default --- api/types/jobs.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/api/types/jobs.go b/api/types/jobs.go index c28dff7..0eeacef 100644 --- a/api/types/jobs.go +++ b/api/types/jobs.go @@ -127,8 +127,8 @@ const ( // Capability group constants for easy reuse var ( - AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry, CapEmpty} - AlwaysAvailableTiktokCaps = []Capability{CapTranscription, CapEmpty} + AlwaysAvailableTelemetryCaps = []Capability{CapTelemetry} + AlwaysAvailableTiktokCaps = []Capability{CapTranscription} // AlwaysAvailableCapabilities defines the job capabilities that are always available regardless of configuration AlwaysAvailableCapabilities = WorkerCapabilities{ @@ -140,7 +140,7 @@ var ( TwitterCaps = []Capability{ CapSearchByQuery, CapSearchByProfile, CapSearchByFullArchive, CapGetById, CapGetReplies, CapGetRetweeters, CapGetTweets, CapGetMedia, CapGetProfileById, - CapGetTrends, CapGetFollowing, CapGetFollowers, CapGetSpace, CapEmpty, + CapGetTrends, CapGetFollowing, CapGetFollowers, CapGetSpace, CapGetProfile, } // TiktokSearchCaps are Tiktok capabilities available with Apify @@ -150,7 +150,7 @@ var ( RedditCaps = []Capability{CapScrapeUrls, CapSearchPosts, CapSearchUsers, CapSearchCommunities} // WebCaps are all the Web capabilities (only available with Apify) - WebCaps = []Capability{CapScraper, CapEmpty} + WebCaps = []Capability{CapScraper} // LinkedInCaps are all the LinkedIn capabilities (only available with Apify) LinkedInCaps = []Capability{CapSearchByProfile} From 6794e6ab1c68284ce42d5affa61e7d777ec24ff8 Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 20 Oct 2025 00:01:41 +0200 Subject: [PATCH 129/136] fix: tests --- api/args/twitter/search/search_test.go | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/api/args/twitter/search/search_test.go b/api/args/twitter/search/search_test.go index 2be7f9b..5bd35be 100644 --- a/api/args/twitter/search/search_test.go +++ b/api/args/twitter/search/search_test.go @@ -44,7 +44,7 @@ var _ = Describe("TwitterSearchArguments", func() { 
Expect(err).ToNot(HaveOccurred())
 Expect(args.Query).To(Equal("minimal test"))
 Expect(args.Count).To(Equal(0))
- Expect(args.MaxResults).To(Equal(1000)) // SetDefaultValues() sets this to MaxResults
+ Expect(args.MaxResults).To(Equal(10)) // SetDefaultValues() sets this to DefaultMaxResults
 })
 
 It("should fail unmarshal with invalid JSON", func() {
@@ -203,18 +203,6 @@ var _ = Describe("TwitterSearchArguments", func() {
 })
 
 Context("Multiple Profile Operations", func() {
- It("should identify getfollowing as multiple profile operation", func() {
- args := search.NewArguments()
- args.Type = types.CapGetFollowing
- Expect(args.IsMultipleProfileOperation()).To(BeTrue())
- })
-
- It("should identify getfollowers as multiple profile operation", func() {
- args := search.NewArguments()
- args.Type = types.CapGetFollowers
- Expect(args.IsMultipleProfileOperation()).To(BeTrue())
- })
-
 It("should identify getretweeters as multiple profile operation", func() {
 args := search.NewArguments()
 args.Type = types.CapGetRetweeters

From a5a980d7be2fe552e1621efba984593c19c2e770 Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Mon, 20 Oct 2025 07:55:16 +0200
Subject: [PATCH 130/136] chore: brings over job params too

---
 api/jobs/base.go | 20 ++++++++++
 api/jobs/config.go | 7 ++++
 api/jobs/generic.go | 54 +++++++++++++++++++++++++++
 api/jobs/hybrid.go | 85 ++++++++++++++++++++++++++++++++++++++++++
 api/jobs/linkedin.go | 42 +++++++++++++++++++++
 api/jobs/reddit.go | 50 +++++++++++++++++++++++++
 api/jobs/similarity.go | 80 +++++++++++++++++++++++++++++++++++++++
 api/jobs/tiktok.go | 72 +++++++++++++++++++++++++++++++++++
 api/jobs/twitter.go | 42 +++++++++++++++++++++
 api/jobs/web.go | 42 +++++++++++++++++++++
 api/types/jobs.go | 41 ++++++++++++++++++++
 11 files changed, 535 insertions(+)
 create mode 100644 api/jobs/base.go
 create mode 100644 api/jobs/config.go
 create mode 100644 api/jobs/generic.go
 create mode 100644 api/jobs/hybrid.go
 create mode 100644 api/jobs/linkedin.go
 create mode 100644 api/jobs/reddit.go
 create mode 100644 api/jobs/similarity.go
 create mode 100644 api/jobs/tiktok.go
 create mode 100644 api/jobs/twitter.go
 create mode 100644 api/jobs/web.go

diff --git a/api/jobs/base.go b/api/jobs/base.go
new file mode 100644
index 0000000..0ca273d
--- /dev/null
+++ b/api/jobs/base.go
@@ -0,0 +1,20 @@
+package jobs
+
+import (
+ "time"
+
+ "github.com/masa-finance/tee-worker/api/types"
+)
+
+type JobParameters interface {
+ // Validate returns an error if the arguments are invalid
+ Validate(cfg *SearchConfig) error
+ // Type returns the job type
+ Type() types.JobType
+ // Arguments converts the job parameter arguments to a Map
+ Arguments(cfg *SearchConfig) map[string]any
+ // Timeout() returns the timeout to wait when getting results from the tee-worker. Returning 0 means use the default.
+ Timeout() time.Duration
+ // PollInterval() returns how often to poll the tee-worker for a job's results. Returning 0 means use the default.
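+ // Per-job implementations below override these where the defaults do not
+ // fit; e.g. RedditParams stretches both Timeout and PollInterval for the
+ // slow Apify communities search.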
+ PollInterval() time.Duration +} diff --git a/api/jobs/config.go b/api/jobs/config.go new file mode 100644 index 0000000..dfd7f13 --- /dev/null +++ b/api/jobs/config.go @@ -0,0 +1,7 @@ +package jobs + +type SearchConfig struct { + DefaultMaxResults uint `env:"DEFAULT_MAX_RESULTS,default=20"` + MinMaxResults uint `env:"MIN_MAX_RESULTS,default=1"` + MaxMaxResults uint `env:"MAX_MAX_RESULTS,default=100"` +} diff --git a/api/jobs/generic.go b/api/jobs/generic.go new file mode 100644 index 0000000..e1f4668 --- /dev/null +++ b/api/jobs/generic.go @@ -0,0 +1,54 @@ +package jobs + +import ( + "encoding/json" + "maps" + "time" + + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/types" +) + +// Compile-time check to ensure GenericParams implements JobParameters +var _ JobParameters = (*GenericParams)(nil) + +// This is a generic Params struct that assumes the args are just a Map, and the validation will be done by the appropriate JSON unmarshaller in tee-types. Note that we're unmarshalling at least twice which will have a (probably heavy) runtime cost. +type GenericParams struct { + JobType types.JobType `json:"type"` + Args map[string]any `json:"arguments"` +} + +func (p GenericParams) Validate(_ *SearchConfig) error { + _, err := args.UnmarshalJobArguments(p.JobType, p.Args) + return err +} + +func (p GenericParams) Arguments(_ *SearchConfig) map[string]any { + // Use UnmarshalJobArguments to get properly typed arguments with correct type + ja, err := args.UnmarshalJobArguments(p.JobType, p.Args) + if err != nil { + // Fallback to original args if unmarshaling fails + return maps.Clone(map[string]any(p.Args)) + } + + // Marshal the properly typed arguments back to map[string]any + // This will include the correct type and all other fields + jsonData, _ := json.Marshal(ja) + var result map[string]any + if err := json.Unmarshal(jsonData, &result); err != nil { + return nil + } + return result +} + +func (p GenericParams) Type() types.JobType { + return p.JobType +} + +func (p GenericParams) Timeout() time.Duration { + return 0 +} + +func (p GenericParams) PollInterval() time.Duration { + return 0 +} diff --git a/api/jobs/hybrid.go b/api/jobs/hybrid.go new file mode 100644 index 0000000..706c260 --- /dev/null +++ b/api/jobs/hybrid.go @@ -0,0 +1,85 @@ +package jobs + +import ( + "fmt" + "slices" + "strings" + "time" + + "github.com/masa-finance/tee-worker/api/types" +) + +type HybridQuery struct { + Query string `json:"query"` + Weight float64 `json:"weight"` +} + +var _ JobParameters = (*HybridSearchParams)(nil) + +// HybridSearchParams defines parameters for hybrid search +// TODO: At some point we could replace `TextQuery` and `SimilarityQuery` with a single slice that receives N queries. The issue right now is that, because of the Milvus API, we can't have an arbitrary number of queries (see https://github.com/milvus-io/milvus/issues/41261). Once that issue is resolved we can fix this. +type HybridSearchParams struct { + TextQuery HybridQuery `json:"text_query"` // Optional, either TextQuery or SimilarityQuery must be specified + SimilarityQuery HybridQuery `json:"similarity_query"` // Mandatory, 1 or more queries to execute + Keywords []string `json:"keywords"` // Optional, keywords to filter for in keyword search + Operator string `json:"keyword_operator"` // Optional, operator ("and" / "or") to use in keyword search. 
Default is "and"
+ MaxResults int `json:"max_results"` // Optional, max number of results
+ Sources []types.Source `json:"sources"`
+}
+
+// Validate validates the hybrid search parameters
+func (t HybridSearchParams) Validate(cfg *SearchConfig) error {
+ if t.TextQuery.Weight <= 0 || t.TextQuery.Weight > 1 || t.SimilarityQuery.Weight <= 0 || t.SimilarityQuery.Weight > 1 {
+ return fmt.Errorf("weights must be greater than 0 and less than or equal to 1, got %f and %f", t.TextQuery.Weight, t.SimilarityQuery.Weight)
+ }
+
+ op := strings.ToLower(t.Operator)
+ if op != "and" && op != "or" && op != "" {
+ return fmt.Errorf(`keyword_operator must be "and", "or" or "", not "%s"`, t.Operator)
+ }
+
+ for _, s := range t.Sources {
+ if !slices.Contains(types.Sources, s) {
+ return fmt.Errorf("source must be one of %v, got %s", types.Sources, s)
+ }
+ }
+
+ return nil
+}
+
+func (t HybridSearchParams) Timeout() time.Duration {
+ return 0
+}
+
+func (t HybridSearchParams) PollInterval() time.Duration {
+ return 0
+}
+
+func (t HybridSearchParams) Arguments(cfg *SearchConfig) map[string]any {
+ t.ApplyDefaults(cfg)
+
+ return map[string]any{
+ "text_query": t.TextQuery,
+ "similarity_query": t.SimilarityQuery,
+ "keywords": t.Keywords,
+ "operator": t.Operator,
+ "max_results": t.MaxResults,
+ "sources": t.Sources,
+ }
+}
+
+func (t HybridSearchParams) Type() types.JobType {
+ return "hybrid-search"
+}
+
+// ApplyDefaults applies default values to the hybrid search parameters
+func (t *HybridSearchParams) ApplyDefaults(cfg *SearchConfig) {
+ switch {
+ case t.MaxResults == 0:
+ t.MaxResults = int(cfg.DefaultMaxResults)
+ case t.MaxResults < int(cfg.MinMaxResults):
+ t.MaxResults = int(cfg.MinMaxResults)
+ case t.MaxResults > int(cfg.MaxMaxResults):
+ t.MaxResults = int(cfg.MaxMaxResults)
+ }
+}
diff --git a/api/jobs/linkedin.go b/api/jobs/linkedin.go
new file mode 100644
index 0000000..6d07f10
--- /dev/null
+++ b/api/jobs/linkedin.go
@@ -0,0 +1,42 @@
+package jobs
+
+import (
+ "encoding/json"
+ "time"
+
+ "github.com/masa-finance/tee-worker/api/args/linkedin"
+ "github.com/masa-finance/tee-worker/api/types"
+)
+
+// Compile-time check to ensure LinkedInParams implements JobParameters
+var _ JobParameters = (*LinkedInParams)(nil)
+
+type LinkedInParams struct {
+ JobType types.JobType `json:"type"`
+ Args linkedin.Profile `json:"arguments"`
+}
+
+func (l LinkedInParams) Validate(cfg *SearchConfig) error {
+ return l.Args.Validate()
+}
+
+func (l LinkedInParams) Type() types.JobType {
+ return l.JobType
+}
+
+func (l LinkedInParams) Timeout() time.Duration {
+ return 0
+}
+
+func (l LinkedInParams) PollInterval() time.Duration {
+ return 0
+}
+
+func (l LinkedInParams) Arguments(cfg *SearchConfig) map[string]any {
+ jsonData, _ := json.Marshal(l.Args)
+ var result map[string]any
+ if err := json.Unmarshal(jsonData, &result); err != nil {
+ return nil
+ }
+ return result
+}
diff --git a/api/jobs/reddit.go b/api/jobs/reddit.go
new file mode 100644
index 0000000..c3b6ae5
--- /dev/null
+++ b/api/jobs/reddit.go
@@ -0,0 +1,50 @@
+package jobs
+
+import (
+ "encoding/json"
+ "time"
+
+ "github.com/masa-finance/tee-worker/api/args/reddit"
+ "github.com/masa-finance/tee-worker/api/types"
+)
+
+// Compile-time check to ensure RedditParams implements JobParameters
+var _ JobParameters = (*RedditParams)(nil)
+
+type RedditParams struct {
+ JobType types.JobType `json:"type"` // Type of search: 'reddit'
+ Args reddit.Search `json:"arguments"` // Scrape arguments
+}
+
+func (r RedditParams) Validate(cfg
*SearchConfig) error { + return r.Args.Validate() +} + +func (r RedditParams) Type() types.JobType { + return r.JobType +} + +func (r RedditParams) Timeout() time.Duration { + if r.Args.Type == types.CapSearchCommunities { + // Apify communities search takes 3-4 minutes + return 5 * time.Minute + } + return 0 +} + +func (r RedditParams) PollInterval() time.Duration { + if r.Args.Type == types.CapSearchCommunities { + // Apify communities search takes 3-4 minutes, so don't poll as often + return 5 * time.Second + } + return 0 +} + +func (r RedditParams) Arguments(cfg *SearchConfig) map[string]any { + jsonData, _ := json.Marshal(r.Args) + var result map[string]any + if err := json.Unmarshal(jsonData, &result); err != nil { + return nil + } + return result +} diff --git a/api/jobs/similarity.go b/api/jobs/similarity.go new file mode 100644 index 0000000..2d9abb8 --- /dev/null +++ b/api/jobs/similarity.go @@ -0,0 +1,80 @@ +package jobs + +import ( + "errors" + "fmt" + "slices" + "strings" + "time" + + "github.com/masa-finance/tee-worker/api/types" +) + +var _ JobParameters = (*SimilaritySearchParams)(nil) + +type SimilaritySearchParams struct { + Query string `json:"query"` // Mandatory, query for similarity search in keyword search + Keywords []string `json:"keywords"` // Optional, keywords to filter for in keyword search + KeywordOperator string `json:"keyword_operator"` // Optional, operator ("and" / "or") to use in keyword search. Default is "and" + Sources []types.Source `json:"sources"` // Optional, sources to query + MaxResults int `json:"max_results"` // Optional, max number of results for keyword search +} + +func (t SimilaritySearchParams) Validate(cfg *SearchConfig) error { + if t.Query == "" { + return errors.New("query is required") + } + + t.KeywordOperator = strings.ToLower(t.KeywordOperator) + if t.KeywordOperator != "and" && t.KeywordOperator != "or" && t.KeywordOperator != "" { + return fmt.Errorf(`keyword_operator must be "and", "or" or "", not "%s"`, t.KeywordOperator) + } + + for _, s := range t.Sources { + if !slices.Contains(types.Sources, s) { + return fmt.Errorf("source must be one of %v, got %s", types.Sources, s) + } + } + + return nil +} + +func (t SimilaritySearchParams) Timeout() time.Duration { + return 0 +} + +func (t SimilaritySearchParams) PollInterval() time.Duration { + return 0 +} + +func (t SimilaritySearchParams) Type() types.JobType { + return "similarity-search" +} + +func (t SimilaritySearchParams) Arguments(cfg *SearchConfig) map[string]any { + t.ApplyDefaults(cfg) + + return map[string]any{ + "query": t.Query, + "keywords": t.Keywords, + "keyword_operator": strings.ToLower(t.KeywordOperator), + "max_results": t.MaxResults, + "sources": t.Sources, + } +} + +func (t *SimilaritySearchParams) ApplyDefaults(cfg *SearchConfig) { + switch { + case t.MaxResults == 0: + t.MaxResults = int(cfg.DefaultMaxResults) + case t.MaxResults < int(cfg.MinMaxResults): + t.MaxResults = int(cfg.MinMaxResults) + case t.MaxResults > int(cfg.MaxMaxResults): + t.MaxResults = int(cfg.MaxMaxResults) + } + + if t.KeywordOperator == "" { + t.KeywordOperator = "and" + } + t.KeywordOperator = strings.ToLower(t.KeywordOperator) +} diff --git a/api/jobs/tiktok.go b/api/jobs/tiktok.go new file mode 100644 index 0000000..cf4aee7 --- /dev/null +++ b/api/jobs/tiktok.go @@ -0,0 +1,72 @@ +package jobs + +import ( + "encoding/json" + "maps" + "time" + + "github.com/masa-finance/tee-worker/api/args" + "github.com/masa-finance/tee-worker/api/args/tiktok" + 
"github.com/masa-finance/tee-worker/api/types" +) + +// Compile-time check to ensure TikTokParams implements JobParameters +var _ JobParameters = (*TikTokParams)(nil) + +type TikTokTranscriptionParams struct { + JobType types.JobType `json:"type"` + Args tiktok.Transcription `json:"arguments"` +} + +type TikTokSearchParams struct { + JobType types.JobType `json:"type"` + Args tiktok.Query `json:"arguments"` +} + +type TikTokTrendingParams struct { + JobType types.JobType `json:"type"` + Args tiktok.Trending `json:"arguments"` +} + +// TikTokArguments is a flexible map that supports multiple unique capabilities +type TikTokArguments map[string]any + +type TikTokParams struct { + JobType types.JobType `json:"type"` + Args TikTokArguments `json:"arguments"` +} + +func (t TikTokParams) Type() types.JobType { + return t.JobType +} + +func (t TikTokParams) Validate(cfg *SearchConfig) error { + _, err := args.UnmarshalJobArguments(t.JobType, t.Args) + return err +} + +func (t TikTokParams) Timeout() time.Duration { + return 0 +} + +func (t TikTokParams) PollInterval() time.Duration { + return 0 +} + +func (t TikTokParams) Arguments(cfg *SearchConfig) map[string]any { + // Use UnmarshalJobArguments to get properly typed arguments with correct type + ja, err := args.UnmarshalJobArguments(types.TiktokJob, t.Args) + if err != nil { + // Fallback to original args if unmarshaling fails + return maps.Clone(map[string]any(t.Args)) + } + + // Marshal the properly typed arguments back to map[string]any + // This will include the correct type and all other fields + jsonData, _ := json.Marshal(ja) + var result map[string]any + if err := json.Unmarshal(jsonData, &result); err != nil { + return nil + } + return result +} diff --git a/api/jobs/twitter.go b/api/jobs/twitter.go new file mode 100644 index 0000000..64a7848 --- /dev/null +++ b/api/jobs/twitter.go @@ -0,0 +1,42 @@ +package jobs + +import ( + "encoding/json" + "time" + + "github.com/masa-finance/tee-worker/api/args/twitter" + "github.com/masa-finance/tee-worker/api/types" +) + +// Compile-time check to ensure TwitterParams implements JobParameters +var _ JobParameters = (*TwitterParams)(nil) + +type TwitterParams struct { + JobType types.JobType `json:"type"` // Any of the Twitter* job types + Args twitter.Search `json:"arguments"` // Search arguments +} + +func (t TwitterParams) Validate(cfg *SearchConfig) error { + return t.Args.Validate() +} + +func (t TwitterParams) Type() types.JobType { + return t.JobType +} + +func (t TwitterParams) Timeout() time.Duration { + return 0 +} + +func (t TwitterParams) PollInterval() time.Duration { + return 0 +} + +func (t TwitterParams) Arguments(cfg *SearchConfig) map[string]any { + jsonData, _ := json.Marshal(t.Args) + var result map[string]any + if err := json.Unmarshal(jsonData, &result); err != nil { + return nil + } + return result +} diff --git a/api/jobs/web.go b/api/jobs/web.go new file mode 100644 index 0000000..5bec345 --- /dev/null +++ b/api/jobs/web.go @@ -0,0 +1,42 @@ +package jobs + +import ( + "encoding/json" + "time" + + "github.com/masa-finance/tee-worker/api/args/web" + "github.com/masa-finance/tee-worker/api/types" +) + +// Compile-time check to ensure WebParams implements JobParameters +var _ JobParameters = (*WebParams)(nil) + +type WebParams struct { + JobType types.JobType `json:"type"` + Args web.Page `json:"arguments"` +} + +func (w WebParams) Validate(cfg *SearchConfig) error { + return w.Args.Validate() +} + +func (w WebParams) Timeout() time.Duration { + return 0 +} + +func (w 
WebParams) Type() types.JobType { + return w.JobType +} + +func (w WebParams) PollInterval() time.Duration { + return 0 +} + +func (w WebParams) Arguments(cfg *SearchConfig) map[string]any { + jsonData, _ := json.Marshal(w.Args) + var result map[string]any + if err := json.Unmarshal(jsonData, &result); err != nil { + return nil + } + return result +} diff --git a/api/types/jobs.go b/api/types/jobs.go index 0eeacef..e11c8fd 100644 --- a/api/types/jobs.go +++ b/api/types/jobs.go @@ -3,12 +3,53 @@ package types import ( "encoding/json" "fmt" + "maps" "slices" "time" "github.com/masa-finance/tee-worker/pkg/util" ) +// note, this could be combined with job type in a future PR / refactor... +type Source string + +func (j Source) String() string { + return string(j) +} + +// To add a new RouterType and/or JobType, you will need to add the JobType to tee-types and the Source below, and add the new mapping to the SourceFor function. This is necessary basically because of Twitter, which has 3 JobTypes but a single Router. +const ( + TwitterSource Source = "twitter" + WebSource Source = "web" + TiktokSource Source = "tiktok" + RedditSource Source = "reddit" + LinkedInSource Source = "linkedin" + TelemetrySource Source = "telemetry" + UnknownSource Source = "" +) + +const UnknownJob = JobType("") + +var sourceMap = map[JobType]Source{ + TwitterJob: TwitterSource, + WebJob: WebSource, + TiktokJob: TiktokSource, + RedditJob: RedditSource, + LinkedInJob: LinkedInSource, + TelemetryJob: TelemetrySource, + UnknownJob: UnknownSource, +} + +var Sources = slices.Compact(slices.Sorted(maps.Values(sourceMap))) + +func SourceFor(j JobType) Source { + source, ok := sourceMap[j] + if ok { + return source + } + return UnknownSource +} + type JobType string type Capability string From 7c8abdd265adf5d1632c7a416631e831a35fdcfe Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 20 Oct 2025 07:59:23 +0200 Subject: [PATCH 131/136] chore: add result response --- api/types/jobs.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/api/types/jobs.go b/api/types/jobs.go index e11c8fd..8fc6631 100644 --- a/api/types/jobs.go +++ b/api/types/jobs.go @@ -274,3 +274,8 @@ type Key struct { type KeyResponse struct { Status string `json:"status"` } + +type ResultResponse struct { + UUID string `json:"uuid"` + Error string `json:"error"` +} From b0a69385b9e93a281a726ae3ae0ea3103aa51a7e Mon Sep 17 00:00:00 2001 From: grantdfoster Date: Mon, 20 Oct 2025 08:02:02 +0200 Subject: [PATCH 132/136] chore: adds document --- api/types/jobs.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/api/types/jobs.go b/api/types/jobs.go index 8fc6631..a854d88 100644 --- a/api/types/jobs.go +++ b/api/types/jobs.go @@ -279,3 +279,20 @@ type ResultResponse struct { UUID string `json:"uuid"` Error string `json:"error"` } + +// Document represents a document stored in the vector store. We need to put it in this package because of circular dependencies. +type Document struct { + Id string `json:"id"` + Source Source `json:"source"` + Content string `json:"content"` + Metadata map[string]any `json:"metadata"` + Embedding []float32 `json:"embedding,omitempty"` + Score float32 `json:"score,omitempty"` // For similarity search results + UpdatedAt time.Time `json:"updated_at"` + // SearchText is used only for embedding/indexing and SHOULD NOT be serialized or stored. 
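+ // The json:"-" tag on the field below enforces this at the marshalling layer.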
+ SearchText string `json:"-"`
+}
+
+func (d Document) String() string {
+ return fmt.Sprintf("%s/%s\n%s\n%s", d.Source, d.Id, d.Metadata, d.Content)
+}

From a4d25bd3bd1e1f0cc56725612c3ac401fd3bcd42 Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Mon, 20 Oct 2025 08:06:05 +0200
Subject: [PATCH 133/136] chore: collection stats

---
 api/types/jobs.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/api/types/jobs.go b/api/types/jobs.go
index a854d88..8274d56 100644
--- a/api/types/jobs.go
+++ b/api/types/jobs.go
@@ -296,3 +296,9 @@ type Document struct {
 func (d Document) String() string {
 return fmt.Sprintf("%s/%s\n%s\n%s", d.Source, d.Id, d.Metadata, d.Content)
 }
+
+// CollectionStats represents collection statistics from Milvus
+type CollectionStats struct {
+ CollectionName string `json:"collection_name,omitempty"`
+ RowCount uint `json:"row_count"`
+}

From f96526c69a36845df27f8291d729b88aeafe2774 Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Mon, 20 Oct 2025 08:21:51 +0200
Subject: [PATCH 134/136] chore: add to jobs

---
 api/types/jobs.go | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/api/types/jobs.go b/api/types/jobs.go
index 8274d56..c12382a 100644
--- a/api/types/jobs.go
+++ b/api/types/jobs.go
@@ -10,6 +10,26 @@ import (
 "github.com/masa-finance/tee-worker/pkg/util"
 )
 
+type JobStatus string
+
+func (j JobStatus) String() string {
+ return string(j)
+}
+
+const (
+ JobStatusNotSaved JobStatus = "done(not saved)"
+ JobStatusSaved JobStatus = "done(saved)"
+ JobStatusDone JobStatus = "done"
+ JobStatusActive JobStatus = "in progress"
+ JobStatusReceived JobStatus = "received"
+ JobStatusError JobStatus = "error"
+ JobStatusRetryError JobStatus = "error(retrying)"
+)
+
+func (j JobStatus) IsDone() bool {
+ return j == JobStatusSaved || j == JobStatusDone || j == JobStatusNotSaved
+}
+
 // note, this could be combined with job type in a future PR / refactor...
 type Source string

From 7b8cfb981f855cf5a061959bb07f716625819c73 Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Mon, 20 Oct 2025 18:42:14 +0200
Subject: [PATCH 135/136] chore: bring over indexer job result

---
 api/types/jobs.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/api/types/jobs.go b/api/types/jobs.go
index c12382a..726c672 100644
--- a/api/types/jobs.go
+++ b/api/types/jobs.go
@@ -322,3 +322,10 @@ type CollectionStats struct {
 CollectionName string `json:"collection_name,omitempty"`
 RowCount uint `json:"row_count"`
 }
+
+// IndexerJobResult is the struct that is stored in the NATS KV store
+type IndexerJobResult struct {
+ Status JobStatus `json:"status"`
+ Docs []Document `json:"docs,omitempty"`
+ Error string `json:"error"`
+}

From ffeb93f450176f3b60f4c568aa50e5cf607dcc98 Mon Sep 17 00:00:00 2001
From: grantdfoster
Date: Tue, 21 Oct 2025 00:22:02 +0200
Subject: [PATCH 136/136] chore: bump version

---
 internal/versioning/version.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internal/versioning/version.go b/internal/versioning/version.go
index 3e5dc6d..8cd7f8b 100644
--- a/internal/versioning/version.go
+++ b/internal/versioning/version.go
@@ -5,5 +5,5 @@ var (
 
 // XXX: Bump this value only when there are protocol changes that make the oracle
 // incompatible between versions!
- TEEWorkerVersion = `delta`
+ TEEWorkerVersion = `epsilon`
 )
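
Usage note (illustrative; not part of the patch series): every JobParameters
implementation added in PATCH 130 follows the same lifecycle -- construct the
params, Validate them against a SearchConfig, then hand Arguments to the
worker. Below is a minimal sketch under assumptions: the import paths come
from the diffs above, while the argument keys ("type", "query", "max_results")
and the capability string "searchbyquery" are inferred from the json tags and
test names in this series, not from confirmed API documentation.

	package main

	import (
		"fmt"

		"github.com/masa-finance/tee-worker/api/jobs"
		"github.com/masa-finance/tee-worker/api/types"
	)

	func main() {
		cfg := &jobs.SearchConfig{DefaultMaxResults: 20, MinMaxResults: 1, MaxMaxResults: 100}

		// GenericParams routes validation through the per-job-type unmarshaller,
		// so malformed arguments are rejected before a job is ever submitted.
		var p jobs.JobParameters = jobs.GenericParams{
			JobType: types.TwitterJob,
			Args:    map[string]any{"type": "searchbyquery", "query": "#AI", "max_results": 10},
		}

		if err := p.Validate(cfg); err != nil {
			fmt.Println("invalid job:", err)
			return
		}

		// Arguments re-marshals through the typed struct, applying defaults.
		fmt.Println(p.Type(), p.Arguments(cfg))
	}

The same flow applies to the typed params (TwitterParams, RedditParams, and so
on); GenericParams simply trades compile-time typing for the double unmarshal
noted in the generic.go comment.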