Fetch method store retrieve (#20)
* Add new type resource.FetchClient to represent the client used to scrape HTML
* Add new field FetchMethod to resource.WebPage, typed as resource.FetchClient
* Populate the field in TrafilaturaFetcher.applyExtractResult()
* Store/retrieve it in SQLStorage.Save() and SQLStorage.Fetch()
efixler committed May 5, 2024
1 parent a69d92a commit f4a3e4f
Showing 15 changed files with 338 additions and 164 deletions.
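
The resource package diff itself isn't among the hunks shown here, but the usages below (the int conversion in SQLStorage.Save(), %s formatting in the tests, and the DefaultClient and HeadlessChrome constants) imply a definition along these lines. This is a sketch, not the committed code; the string labels in particular are guesses:

package resource

// FetchClient identifies which client was used to fetch a page.
// Sketched from this commit's usages; the real definition is in the
// resource package diff, which is not shown in this excerpt.
type FetchClient int

const (
	DefaultClient  FetchClient = iota // plain net/http client
	HeadlessChrome                    // headless browser client
)

// A String method is assumed because the tests print FetchMethod with %s;
// these labels are illustrative.
func (f FetchClient) String() string {
	switch f {
	case DefaultClient:
		return "direct"
	case HeadlessChrome:
		return "headless-chrome"
	default:
		return "unknown"
	}
}
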
7 changes: 7 additions & 0 deletions fetch/client.go
@@ -8,6 +8,8 @@ import (
"net/http"
"path/filepath"
"time"

"github.com/efixler/scrape/resource"
)

const (
@@ -16,6 +18,7 @@ const (

type Client interface {
Get(url string, headers http.Header) (*http.Response, error)
Identifier() resource.FetchClient
}

type ClientOption func(*defaultClient) error
@@ -46,6 +49,10 @@ type defaultClient struct {
httpClient *http.Client
}

func (c defaultClient) Identifier() resource.FetchClient {
return resource.DefaultClient
}

func (c defaultClient) Get(url string, headers http.Header) (*http.Response, error) {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
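
With Identifier() on the interface, callers can record which client produced a response. A minimal usage sketch, assuming fetch.MustClient() returns a ready-to-use Client (it is the constructor the tests below rely on); the logging is illustrative:

package main

import (
	"log"

	"github.com/efixler/scrape/fetch"
)

func main() {
	client := fetch.MustClient()
	resp, err := client.Get("https://example.com", nil)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	// For the default client, Identifier() reports resource.DefaultClient.
	log.Printf("fetched with %s client", client.Identifier())
}
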
131 changes: 131 additions & 0 deletions fetch/trafilatura/apply_result_test.go
@@ -0,0 +1,131 @@
package trafilatura

import (
"errors"
nurl "net/url"
"slices"
"strings"
"testing"
"time"

"github.com/efixler/scrape/fetch"
"github.com/efixler/scrape/resource"
"github.com/markusmobius/go-trafilatura"
)

func basicTrafilaturaResult() trafilatura.ExtractResult {
return trafilatura.ExtractResult{
ContentText: "T content text",
Metadata: trafilatura.Metadata{
URL: "https://trafilatura.com/canonical",
Title: "T title",
Author: "author1;author2",
Hostname: "trafilatura.com",
Description: "T description",
Sitename: "T sitename",
Date: time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC),
Categories: []string{"T cat1", "T cat2"},
Tags: []string{"T tag1", "T tag2"},
Language: "fr",
Image: "https://trafilatura.com/image.jpg",
PageType: "T article",
License: "T CC-BY-SA",
},
}
}

func TestMergeTrafilaturaResult(t *testing.T) {
page := basicWebPage()
tfc, _ := New(fetch.MustClient())
tr := basicTrafilaturaResult()
tfc.applyExtractResult(&tr, &page)
if page.ContentText != tr.ContentText {
t.Errorf("ContentText mismatch: %s != %s", page.ContentText, tr.ContentText)
}
if page.CanonicalURL.String() != tr.Metadata.URL {
t.Errorf("CanonicalURL mismatch: %s != %s", page.CanonicalURL, tr.Metadata.URL)
}
if page.Title != tr.Metadata.Title {
t.Errorf("Title mismatch: %s != %s", page.Title, tr.Metadata.Title)
}
if strings.Join(page.Authors, ";") != tr.Metadata.Author {
t.Errorf("Authors mismatch: %v != %v", page.Authors, tr.Metadata.Author)
}
if page.Hostname != tr.Metadata.Hostname {
t.Errorf("Hostname mismatch: %s != %s", page.Hostname, tr.Metadata.Hostname)
}
if page.Description != tr.Metadata.Description {
t.Errorf("Description mismatch: %s != %s", page.Description, tr.Metadata.Description)
}
if page.Sitename != tr.Metadata.Sitename {
t.Errorf("Sitename mismatch: %s != %s", page.Sitename, tr.Metadata.Sitename)
}
if page.Date.Compare(tr.Metadata.Date) != 0 {
t.Errorf("Date mismatch: %s != %s", page.Date, tr.Metadata.Date)
}
if !slices.Equal(page.Categories, tr.Metadata.Categories) {
t.Errorf("Categories mismatch: %v != %v", page.Categories, tr.Metadata.Categories)
}
if !slices.Equal(page.Tags, tr.Metadata.Tags) {
t.Errorf("Tags mismatch: %v != %v", page.Tags, tr.Metadata.Tags)
}
if page.Language != tr.Metadata.Language {
t.Errorf("Language mismatch: %s != %s", page.Language, tr.Metadata.Language)
}
if page.Image != tr.Metadata.Image {
t.Errorf("Image mismatch: %s != %s", page.Image, tr.Metadata.Image)
}
if page.PageType != tr.Metadata.PageType {
t.Errorf("PageType mismatch: %s != %s", page.PageType, tr.Metadata.PageType)
}
if page.FetchMethod != resource.DefaultClient {
t.Errorf("FetchMethod should be set to default client, got: %s", page.FetchMethod)
}
}

func TestEmptyAuthorNotSaved(t *testing.T) {
page := basicWebPage()
page.Authors = nil
tfc, _ := New(fetch.MustClient())
tr := basicTrafilaturaResult()
tr.Metadata.Author = ""
tfc.applyExtractResult(&tr, &page)
if page.Authors == nil {
t.Errorf("Authors was nil, expected empty array")
}
if len(page.Authors) != 0 {
t.Errorf("Empty author should not be saved: %q", page.Authors)
}
}

// Returns a WebPage with all fields filled out. The caller can override
// fields as needed.
func basicWebPage() resource.WebPage {
requestedUrl, _ := nurl.Parse("https://example.com/requested")
canonicalUrl, _ := nurl.Parse("https://example.com/canonical")
fetchTime := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)
return resource.WebPage{
RequestedURL: requestedUrl,
CanonicalURL: canonicalUrl,
OriginalURL: "https://example.com/original",
// TTL: ttl, // skip ttl for now
FetchTime: &fetchTime,
Hostname: "example.com",
StatusCode: 200,
Error: errors.New("an error occurred"),
Title: "A title",
Description: "A description",
Sitename: "A sitename",
Authors: []string{"author1", "author2"},
Date: &fetchTime,
Categories: []string{"cat1", "cat2"},
Tags: []string{"tag1", "tag2"},
Language: "en",
Image: "https://example.com/image.jpg",
PageType: "article",
License: "CC-BY-SA",
ID: "1234",
Fingerprint: "fingerprint",
ContentText: "This is the content text",
}
}
32 changes: 31 additions & 1 deletion fetch/trafilatura/fetcher.go
@@ -6,6 +6,7 @@ import (
"log/slog"
"mime"
nurl "net/url"
"strings"

"github.com/efixler/scrape/fetch"
"github.com/efixler/scrape/resource"
@@ -104,10 +105,39 @@ func (f *TrafilaturaFetcher) Fetch(url *nurl.URL) (*resource.WebPage, error) {
// "text and comments are not long enough: 0 0"
return rval, err
}
rval.MergeTrafilaturaResult(result)
f.applyExtractResult(result, rval)
return rval, nil
}

func (f *TrafilaturaFetcher) applyExtractResult(
tr *trafilatura.ExtractResult,
r *resource.WebPage,
) {
r.ContentText = tr.ContentText
r.CanonicalURL, _ = nurl.Parse(tr.Metadata.URL)
r.Title = tr.Metadata.Title
r.Authors = make([]string, 0, 1)
authors := strings.Split(tr.Metadata.Author, ";")
for _, a := range authors {
if trimmed := strings.TrimSpace(a); trimmed != "" {
r.Authors = append(r.Authors, trimmed)
}
}
r.Hostname = tr.Metadata.Hostname
r.Description = tr.Metadata.Description
r.Sitename = tr.Metadata.Sitename
if !tr.Metadata.Date.IsZero() {
r.Date = &tr.Metadata.Date
}
r.Categories = tr.Metadata.Categories
r.Tags = tr.Metadata.Tags
r.License = tr.Metadata.License
r.Language = tr.Metadata.Language
r.Image = tr.Metadata.Image
r.PageType = tr.Metadata.PageType
r.FetchMethod = f.client.Identifier()
}

func (f *TrafilaturaFetcher) Close() error {
return nil
}
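
applyExtractResult replaces the previous MergeTrafilaturaResult call, and its author handling is what TestEmptyAuthorNotSaved above exercises: the semicolon-delimited Author string is split, whitespace-only entries are dropped, and an empty input produces an empty but non-nil slice. The loop, isolated for illustration:

package main

import (
	"fmt"
	"strings"
)

// splitAuthors mirrors the author loop in applyExtractResult.
func splitAuthors(author string) []string {
	authors := make([]string, 0, 1)
	for _, a := range strings.Split(author, ";") {
		if trimmed := strings.TrimSpace(a); trimmed != "" {
			authors = append(authors, trimmed)
		}
	}
	return authors
}

func main() {
	fmt.Printf("%q\n", splitAuthors("author1;author2")) // ["author1" "author2"]
	fmt.Printf("%q\n", splitAuthors(" a ; ;b"))         // ["a" "b"]
	fmt.Printf("%q\n", splitAuthors(""))                // [] (empty, not nil)
}
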
5 changes: 5 additions & 0 deletions internal/headless/fetch_client.go
@@ -7,6 +7,7 @@ import (
"github.com/efixler/headless"
"github.com/efixler/headless/browser"
"github.com/efixler/scrape/fetch"
"github.com/efixler/scrape/resource"
)

type client struct {
@@ -38,6 +39,10 @@ func NewChromeClient(ctx context.Context, userAgent string, maxConcurrent int) (
return c, nil
}

func (c client) Identifier() resource.FetchClient {
return resource.HeadlessChrome
}

func (c *client) Get(url string, headers http.Header) (*http.Response, error) {
tab, err := c.browser.AcquireTab()
if err != nil {
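
A hypothetical wiring of the headless client follows. NewChromeClient's signature comes from the hunk above; the import alias and the user agent string are assumptions, and this package is internal to the module:

package main

import (
	"context"
	"fmt"
	"log"

	// Assumed alias; internal packages are only importable within the module.
	headless "github.com/efixler/scrape/internal/headless"
)

func main() {
	ctx := context.Background()
	// Per the diff: context, user agent, and max concurrent tabs.
	client, err := headless.NewChromeClient(ctx, "scrape-example/1.0", 4)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(client.Identifier()) // resource.HeadlessChrome
}
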
2 changes: 1 addition & 1 deletion internal/server/routes.go
@@ -161,7 +161,7 @@ func extractWithFetcher(fetcher fetch.URLFetcher) http.HandlerFunc {
w.WriteHeader(http.StatusUnprocessableEntity)
}
}
page.FetchMethod = resource.Headless
page.FetchMethod = resource.HeadlessChrome
encoder := json.NewEncoder(w)
encoder.SetEscapeHTML(false)
if req.PrettyPrint {
12 changes: 6 additions & 6 deletions internal/server/routes_test.go
@@ -197,7 +197,7 @@ func TestHeadless503WhenUnavailable(t *testing.T) {
}

type mockUrlFetcher struct {
fetchMethod resource.FetchMethod
fetchMethod resource.FetchClient
}

func (m *mockUrlFetcher) Open(ctx context.Context) error { return nil }
@@ -216,26 +216,26 @@ func (m *mockUrlFetcher) Fetch(url *nurl.URL) (*resource.WebPage, error) {

func TestSingleHandler(t *testing.T) {
ss := &scrapeServer{
urlFetcher: &mockUrlFetcher{},
headlessFetcher: &mockUrlFetcher{fetchMethod: resource.Headless},
urlFetcher: &mockUrlFetcher{fetchMethod: resource.DefaultClient},
headlessFetcher: &mockUrlFetcher{fetchMethod: resource.HeadlessChrome},
}
tests := []struct {
name string
url string
handler http.HandlerFunc
expectMethod resource.FetchMethod
expectMethod resource.FetchClient
}{
{
name: "client",
url: "http://foo.bar",
handler: ss.singleHandler(),
expectMethod: resource.Client,
expectMethod: resource.DefaultClient,
},
{
name: "headless",
url: "http://example.com",
handler: ss.singleHeadlessHandler(),
expectMethod: resource.Headless,
expectMethod: resource.HeadlessChrome,
},
}

3 changes: 2 additions & 1 deletion internal/storage/mysql/create_test.go
@@ -21,7 +21,7 @@ func testDatabaseForCreate(t *testing.T) *Store {
// todo: enable alternate names when also creating
// the database.
t.Cleanup(func() {
if _, err := db.Exec("DROP DATABASE IF EXISTS scrape_test;"); err != nil {
if _, err := db.DB.Exec("DROP DATABASE IF EXISTS scrape_test;"); err != nil {
t.Logf("error dropping test mysql database: %q", err)
}
if err := db.Close(); err != nil {
@@ -32,6 +32,7 @@
}

func TestCreate(t *testing.T) {
// t.Skip("skipping mysql create test")
db := testDatabaseForCreate(t)
err := db.Open(context.Background())
if err != nil {
1 change: 1 addition & 0 deletions internal/storage/mysql/options.go
@@ -112,6 +112,7 @@ func defaultConfig() Config {
cfg.WriteTimeout = DefaultWriteTimeout // I/O write timeout
cfg.ParseTime = true
cfg.MultiStatements = true
cfg.Params = map[string]string{"autocommit": "true"}
return Config{
Config: *cfg,
queryTimeout: DefaultQueryTimeout,
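
Params is how go-sql-driver/mysql carries arbitrary DSN parameters: mysql.Config.FormatDSN() renders the new entry as autocommit=true, and the updated test DSN in the next file adds the same parameter by hand. A standalone sketch:

package main

import (
	"fmt"

	"github.com/go-sql-driver/mysql"
)

func main() {
	cfg := mysql.NewConfig()
	cfg.Net = "tcp"
	cfg.Addr = "127.0.0.1:3306"
	cfg.User = "root"
	cfg.ParseTime = true
	cfg.MultiStatements = true
	cfg.Params = map[string]string{"autocommit": "true"}
	// Prints something like:
	// root@tcp(127.0.0.1:3306)/?multiStatements=true&parseTime=true&autocommit=true
	fmt.Println(cfg.FormatDSN())
}
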
6 changes: 3 additions & 3 deletions internal/storage/mysql_loader_test.go
@@ -16,7 +16,7 @@ import (

const (
testSchema = "scrape_test"
dbURL = "root:@tcp(127.0.0.1:3306)/?collation=utf8mb4_0900_ai_ci&multiStatements=true&parseTime=true&readTimeout=30s&timeout=10s&writeTimeout=30s"
dbURL = "root:@tcp(127.0.0.1:3306)/?collation=utf8mb4_0900_ai_ci&multiStatements=true&parseTime=true&readTimeout=30s&timeout=10s&writeTimeout=30s&autocommit=1;"
)

//go:embed mysql/create.sql
@@ -55,8 +55,8 @@ func getTestDatabase(t *testing.T) *SQLStorage {
}
t.Cleanup(func() {
q := fmt.Sprintf("DROP DATABASE %v;", dbConfig.TargetSchema)
if _, err := db.Exec(q); err != nil {
t.Logf("error dropping mysql test database: %v", err)
if _, err := db.DB.Exec(q); err != nil {
t.Logf("error dropping mysql test database %q: %v", dbConfig.TargetSchema, err)
}

})
13 changes: 10 additions & 3 deletions internal/storage/storage.go
@@ -26,10 +26,10 @@ const (
)

const (
qSave = `REPLACE INTO urls (id, url, parsed_url, fetch_time, expires, metadata, content_text) VALUES (?, ?, ?, ?, ?, ?, ?);`
qSave = `REPLACE INTO urls (id, url, parsed_url, fetch_time, expires, metadata, content_text, fetch_method) VALUES (?, ?, ?, ?, ?, ?, ?, ?);`
qSaveId = `REPLACE INTO id_map (requested_id, canonical_id) VALUES (?, ?)`
qLookupId = `SELECT canonical_id FROM id_map WHERE requested_id = ?`
qFetch = `SELECT url, parsed_url, fetch_time, expires, metadata, content_text FROM urls WHERE id = ?`
qFetch = `SELECT url, parsed_url, fetch_time, expires, metadata, content_text, fetch_method FROM urls WHERE id = ?`
qDelete = `DELETE FROM urls WHERE id = ?`
qClear = `DELETE FROM urls; DELETE FROM id_map;`
// qClearId = `DELETE FROM id_map where canonical_id = ?`
@@ -65,12 +65,16 @@ func (s *SQLStorage) Save(uptr *resource.WebPage) (uint64, error) {
}
expireTime, _ := uptr.ExpireTime()
key := Key(uptr.CanonicalURL)
// We need the copy here because uptr might be getting returned
// to a client concurrently and the skipMap can be applied inadvertently
// in both places
ucopy := *uptr
ucopy.SkipWhenMarshaling(
resource.CanonicalURL,
resource.ContentText,
resource.OriginalURL,
resource.FetchTime,
resource.FetchMethod,
)
metadata, err := ucopy.MarshalJSON()
if err != nil {
Expand All @@ -84,6 +88,7 @@ func (s *SQLStorage) Save(uptr *resource.WebPage) (uint64, error) {
expireTime.Unix(),
string(metadata),
uptr.ContentText,
int(uptr.FetchMethod),
}

stmt, err := s.Statement(save, func(ctx context.Context, db *sql.DB) (*sql.Stmt, error) {
@@ -166,8 +171,9 @@ func (s SQLStorage) Fetch(url *nurl.URL) (*resource.WebPage, error) {
expiryEpoch int64
metadata string
contentText string
fetchMethod resource.FetchClient
)
err = rows.Scan(&canonicalUrl, &parsedUrl, &fetchEpoch, &expiryEpoch, &metadata, &contentText)
err = rows.Scan(&canonicalUrl, &parsedUrl, &fetchEpoch, &expiryEpoch, &metadata, &contentText, &fetchMethod)
if err != nil {
return nil, err
}
Expand All @@ -191,6 +197,7 @@ func (s SQLStorage) Fetch(url *nurl.URL) (*resource.WebPage, error) {
ttl := exptime.Sub(fetchTime)
page.TTL = ttl
page.ContentText = contentText
page.FetchMethod = fetchMethod
return page, nil
}

Expand Down
