8 changes: 6 additions & 2 deletions .gitignore
@@ -74,9 +74,13 @@ snippets.txt
dist/
bp-todo.md

# Test files
.masa/.env
.masa/*

# TEE
tee/private.pem
.aider*

.masa/*
# LLML
.aider*
GEMINI.md
9 changes: 6 additions & 3 deletions Makefile
@@ -61,11 +61,11 @@ docker-build-test: tee/private.pem
@docker build --target=dependencies --build-arg baseimage=builder --secret id=private_key,src=./tee/private.pem -t $(TEST_IMAGE) -f Dockerfile .

ci-test:
-	@go test -coverprofile=coverage/coverage.txt -covermode=atomic -v $(TEST_ARGS)
+	go test -coverprofile=coverage/coverage.txt -covermode=atomic -v $(TEST_ARGS)

.PHONY: test
test: docker-build-test
-	@docker run --user root $(ENV_FILE_ARG) -e LOG_LEVEL=debug -v $(PWD)/coverage:/app/coverage --rm --workdir /app $(TEST_IMAGE) go test -coverprofile=coverage/coverage.txt -covermode=atomic -v $(TEST_ARGS)
+	docker run --user root $(ENV_FILE_ARG) -e LOG_LEVEL=debug -v $(PWD)/coverage:/app/coverage --rm --workdir /app $(TEST_IMAGE) go test -coverprofile=coverage/coverage.txt -covermode=atomic -v $(TEST_ARGS)

test-capabilities: docker-build-test
@docker run --user root $(ENV_FILE_ARG) -e LOG_LEVEL=debug -v $(PWD)/coverage:/app/coverage --rm --workdir /app $(TEST_IMAGE) go test -coverprofile=coverage/coverage-capabilities.txt -covermode=atomic -v ./internal/capabilities
@@ -79,8 +79,11 @@ test-twitter: docker-build-test
test-tiktok: docker-build-test
@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/tiktok_transcription_test.go ./internal/jobs/jobs_suite_test.go

test-reddit: docker-build-test
@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/reddit_test.go ./internal/jobs/redditapify/client_test.go ./api/types/reddit/reddit_suite_test.go

test-web: docker-build-test
@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/webscraper_test.go ./internal/jobs/jobs_suite_test.go

test-telemetry: docker-build-test
@docker run --user root $(ENV_FILE_ARG) -v $(PWD)/.masa:/home/masa -v $(PWD)/coverage:/app/coverage --rm --workdir /app -e DATA_DIR=/home/masa $(TEST_IMAGE) go test -v ./internal/jobs/telemetry_test.go ./internal/jobs/jobs_suite_test.go
117 changes: 111 additions & 6 deletions README.md
@@ -80,28 +80,32 @@ The worker automatically detects and exposes capabilities based on available configuration
- **Sub-capabilities**: `["transcription"]`
- **Requirements**: None (always available)

3. **`reddit`** - Reddit scraping services
- **Sub-capabilities**: `["scrapeurls", "searchposts", "searchusers", "searchcommunities"]`
- **Requirements**: `APIFY_API_KEY` environment variable

**Twitter Services (Configuration-Dependent):**

-3. **`twitter-credential`** - Twitter scraping with credentials
+4. **`twitter-credential`** - Twitter scraping with credentials
- **Sub-capabilities**: `["searchbyquery", "searchbyfullarchive", "searchbyprofile", "getbyid", "getreplies", "getretweeters", "gettweets", "getmedia", "gethometweets", "getforyoutweets", "getprofilebyid", "gettrends", "getfollowing", "getfollowers", "getspace"]`
- **Requirements**: `TWITTER_ACCOUNTS` environment variable

-4. **`twitter-api`** - Twitter scraping with API keys
+5. **`twitter-api`** - Twitter scraping with API keys
- **Sub-capabilities**: `["searchbyquery", "getbyid", "getprofilebyid"]` (basic), plus `["searchbyfullarchive"]` for elevated API keys
- **Requirements**: `TWITTER_API_KEYS` environment variable

-5. **`twitter`** - General Twitter scraping (uses best available auth)
+6. **`twitter`** - General Twitter scraping (uses best available auth)
- **Sub-capabilities**: Dynamic based on available authentication (combines capabilities from credential, API, and Apify depending on what's configured)
- **Requirements**: Either `TWITTER_ACCOUNTS`, `TWITTER_API_KEYS`, or `APIFY_API_KEY`
- **Priority**: For follower/following operations: Apify > Credentials. For search operations: Credentials > API.

-6. **`twitter-apify`** - Twitter scraping using Apify's API (requires `APIFY_API_KEY`)
+7. **`twitter-apify`** - Twitter scraping using Apify's API (requires `APIFY_API_KEY`)
- **Sub-capabilities**: `["getfollowers", "getfollowing"]`
- **Requirements**: `APIFY_API_KEY` environment variable

**Stats Service (Always Available):**

-7. **`telemetry`** - Worker monitoring and stats
+8. **`telemetry`** - Worker monitoring and stats
- **Sub-capabilities**: `["telemetry"]`
- **Requirements**: None (always available)
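
To make the detection rules concrete, here is an illustrative Go sketch of how the configuration-dependent capabilities above could be derived from the environment. It is not the repository's actual implementation (that lives in `internal/capabilities`), and the names of the two always-available scrapers are assumptions:

```go
package main

import (
	"fmt"
	"os"
)

// detectCapabilities sketches the rules listed above; the "web" and
// "tiktok" names are assumed for the two always-available services.
func detectCapabilities() []string {
	caps := []string{"web", "tiktok", "telemetry"}
	if os.Getenv("APIFY_API_KEY") != "" {
		caps = append(caps, "reddit", "twitter-apify")
	}
	if os.Getenv("TWITTER_ACCOUNTS") != "" {
		caps = append(caps, "twitter-credential")
	}
	if os.Getenv("TWITTER_API_KEYS") != "" {
		caps = append(caps, "twitter-api")
	}
	// The general "twitter" capability appears whenever any Twitter auth is configured.
	if os.Getenv("TWITTER_ACCOUNTS") != "" || os.Getenv("TWITTER_API_KEYS") != "" || os.Getenv("APIFY_API_KEY") != "" {
		caps = append(caps, "twitter")
	}
	return caps
}

func main() {
	fmt.Println(detectCapabilities())
}
```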

@@ -119,7 +123,7 @@ SIG=$(curl -s localhost:8080/job/generate \
-H "Content-Type: application/json" \
-H "Authorization: Bearer ${API_KEY}" \
-d '{
"type": "web-scraper",
"type": "web",
"arguments": {
"url": "https://example.com",
"depth": 1
@@ -201,6 +205,107 @@ Transcribes TikTok videos to text.
}
```

#### Reddit Job Types

There are four different types of Reddit searches:

- `scrapeurls`: Gets the content of one or more Reddit URLs
- `searchposts`: Searches posts and comments
- `searchusers`: Searches user profiles
- `searchcommunities`: Searches communities

**Parameters** (all are optional except where noted)

- `urls` (array of objects with `url` and `method` keys, required for `scrapeurls`): Each element contains a Reddit URL to scrape together with the HTTP method to use (default `"GET"`).
- `queries` (array of strings, required for all job types except `scrapeurls`): Each element is a search query.
- `sort` (string): What to order by. Possible values are `"relevance"`, `"hot"`, `"top"`, `"new"`, `"rising"` and `"comments"`.
- `include_nsfw` (boolean): Whether to include content tagged NSFW. Default is `false`.
- `skip_posts` (boolean): If `true`, `searchusers` will not return user posts. Default is `false`.
- `after` (string, ISO 8601 timestamp): Only return entries created after this date/time.
- `max_items` (nonnegative integer): How many items to load into the server cache (page through them using the cursor). Default is 10.
- `max_results` (nonnegative integer): Maximum number of results to return per page. Default is 10.
- `max_posts` (nonnegative integer): Maximum number of posts to return per page. Default is 10.
- `max_comments` (nonnegative integer): Maximum number of comments to return per page. Default is 10.
- `max_communities` (nonnegative integer): Maximum number of communities to return per page. Default is 2.
- `max_users` (nonnegative integer): Maximum number of users to return per page. Default is 2.
- `next_cursor` (string): Pagination cursor.
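
For readers who prefer types to prose, here is a minimal, hypothetical Go sketch of how these arguments could be modelled. The field names mirror the JSON parameters above; the actual definitions live elsewhere in the codebase and may differ.

```go
// RedditURL pairs a URL to scrape with the HTTP method to use.
type RedditURL struct {
	URL    string `json:"url"`
	Method string `json:"method,omitempty"` // defaults to "GET"
}

// RedditArguments mirrors the documented parameters (hypothetical sketch).
type RedditArguments struct {
	Type           string      `json:"type"`              // scrapeurls | searchposts | searchusers | searchcommunities
	URLs           []RedditURL `json:"urls,omitempty"`    // required for scrapeurls
	Queries        []string    `json:"queries,omitempty"` // required for the search job types
	Sort           string      `json:"sort,omitempty"`
	IncludeNSFW    bool        `json:"include_nsfw,omitempty"`
	SkipPosts      bool        `json:"skip_posts,omitempty"`
	After          string      `json:"after,omitempty"` // ISO 8601 timestamp
	MaxItems       uint        `json:"max_items,omitempty"`
	MaxResults     uint        `json:"max_results,omitempty"`
	MaxPosts       uint        `json:"max_posts,omitempty"`
	MaxComments    uint        `json:"max_comments,omitempty"`
	MaxCommunities uint        `json:"max_communities,omitempty"`
	MaxUsers       uint        `json:"max_users,omitempty"`
	NextCursor     string      `json:"next_cursor,omitempty"`
}
```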

##### Reddit Search Operations

**`scrapeurls`** - Scrape Reddit URLs

``` json
{
"type": "reddit",
"arguments": {
"type": "scrapeurls",
"urls": [
{
"url": "https://reddit.com/r/ArtificialIntelligence",
"method": "GET"
},
{
"url": "https://reddit.com/u/TheTelegraph"
}
],
"sort": "new",
"include_nsfw": true,
"max_items": 100
}
}
```

**`searchusers`** - Search Reddit users

``` json
{
"type": "reddit",
"arguments": {
"type": "searchusers",
"queries": [
"NASA",
"European Space Agency"
],
"sort": "relevance",
"skip_posts": true,
}
}
```

**`searchposts`** - Search Reddit posts

``` json
{
"type": "reddit",
"arguments": {
"type": "searchposts",
"queries": [
"NASA",
"European Space Agency"
],
"max_items": 100,
"max_results": 10,
"max_posts": 5
}
}
```

**`searchcommunities`** - Search Reddit communities

``` json
{
"type": "reddit",
"arguments": {
"type": "searchcommunities",
"queries": [
"Artificial Intelligence"
],
"max_items": 100,
"max_results": 10,
}
}
```
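
As a worked example, here is a hedged Go sketch of submitting one of the Reddit jobs above through the `/job/generate` endpoint, mirroring the curl flow shown earlier (same endpoint and headers; error handling kept minimal):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"os"
)

func main() {
	// Same payload shape as the searchposts example above.
	body := []byte(`{
		"type": "reddit",
		"arguments": {
			"type": "searchposts",
			"queries": ["NASA"],
			"max_results": 10
		}
	}`)

	req, err := http.NewRequest(http.MethodPost, "http://localhost:8080/job/generate", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+os.Getenv("API_KEY"))

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	sig, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(sig)) // the signature used for the subsequent job submission
}
```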

#### Twitter Job Types

Twitter scraping is available through four job types:
7 changes: 4 additions & 3 deletions api/types/encrypted.go
@@ -2,6 +2,7 @@ package types

import (
"encoding/json"
"fmt"

"github.com/masa-finance/tee-worker/pkg/tee"
)
@@ -14,17 +15,17 @@ type EncryptedRequest struct {
func (payload EncryptedRequest) Unseal() (string, error) {
jobRequest, err := tee.Unseal(payload.EncryptedRequest)
if err != nil {
return "", err
return "", fmt.Errorf("error while unsealing the encrypted request: %w", err)
}

job := Job{}
if err := json.Unmarshal(jobRequest, &job); err != nil {
return "", err
return "", fmt.Errorf("error while unmarshalling the job request: %w", err)
}

dat, err := tee.UnsealWithKey(job.Nonce, payload.EncryptedResult)
if err != nil {
return "", err
return "", fmt.Errorf("error while unsealing the job result: %w", err)
}

return string(dat), nil
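
The change from returning bare errors to wrapping them with `fmt.Errorf` and `%w` adds context to the message while keeping the underlying cause matchable. A self-contained sketch of why that matters (the sentinel error here is hypothetical, standing in for whatever `tee.Unseal` returns):

```go
package main

import (
	"errors"
	"fmt"
)

// errSealBroken is a hypothetical sentinel used only for this demonstration.
var errSealBroken = errors.New("seal broken")

func unseal() error {
	// Mirrors the wrapping pattern adopted in Unseal above.
	return fmt.Errorf("error while unsealing the encrypted request: %w", errSealBroken)
}

func main() {
	err := unseal()
	fmt.Println(err)                           // message now carries context
	fmt.Println(errors.Is(err, errSealBroken)) // true: %w preserves the underlying cause
}
```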
17 changes: 17 additions & 0 deletions api/types/job.go
@@ -31,6 +31,10 @@ type Job struct {
Timeout time.Duration `json:"timeout"`
}

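// String implements fmt.Stringer, giving a concise, log-friendly summary of the job.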
func (j Job) String() string {
return fmt.Sprintf("UUID: %s Type: %s Arguments: %s", j.UUID, j.Type, j.Arguments)
}

var letterRunes = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+")

func randStringRunes(n int) string {
@@ -202,3 +206,16 @@ func (jc JobConfiguration) GetTwitterConfig() TwitterScraperConfig {
SkipLoginVerification: jc.GetBool("skip_login_verification", false),
}
}

// RedditConfig represents the configuration needed for Reddit scraping via Apify
type RedditConfig struct {
ApifyApiKey string
}

// GetRedditConfig constructs a RedditConfig directly from the JobConfiguration
// This eliminates the need for JSON marshaling/unmarshaling
func (jc JobConfiguration) GetRedditConfig() RedditConfig {
return RedditConfig{
ApifyApiKey: jc.GetString("apify_api_key", ""),
}
}
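
A hedged, self-contained sketch of how a caller might use this accessor. `JobConfiguration` is reproduced here in simplified form so the example runs standalone; the repo's actual definition may differ, and only the behavior of `GetString` shown in the diff is assumed:

```go
package main

import "fmt"

// JobConfiguration stands in for the real type (assumed to be map-like).
type JobConfiguration map[string]any

func (jc JobConfiguration) GetString(key, def string) string {
	if v, ok := jc[key].(string); ok && v != "" {
		return v
	}
	return def
}

type RedditConfig struct {
	ApifyApiKey string
}

// GetRedditConfig reads the key straight from the configuration map,
// with no JSON round-trip, as in the diff above.
func (jc JobConfiguration) GetRedditConfig() RedditConfig {
	return RedditConfig{ApifyApiKey: jc.GetString("apify_api_key", "")}
}

func main() {
	jc := JobConfiguration{"apify_api_key": "apify_xxx"}
	cfg := jc.GetRedditConfig()
	fmt.Println(cfg.ApifyApiKey != "") // true: the key was picked up directly
}
```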