Colibri

Colibri is an extensible web crawling and scraping framework for Go, used to crawl and extract structured data from the web.

See webextractor.

Installation

 $ go get github.com/gonzxlez/colibri

Do

// Do makes an HTTP request based on the rules.
func (c *Colibri) Do(rules *Rules) (resp Response, err error)

var rawRules = []byte(`{...}`) // Raw Rules ~ JSON 

c := colibri.New()
c.Client = ...    // Required
c.Delay = ...     // Optional
c.RobotsTxt = ... // Optional
c.Parser = ...    // Optional

var rules colibri.Rules
err := json.Unmarshal(rawRules, &rules)
if err != nil {
	panic(err)
} 

resp, err := c.Do(&rules)
if err != nil {
	panic(err)
}

fmt.Println("URL:", resp.URL())
fmt.Println("Status code:", resp.StatusCode())
fmt.Println("Content-Type:", resp.Header().Get("Content-Type"))

Extract

// Extract makes the HTTP request and parses the content of the response based on the rules.
func (c *Colibri) Extract(rules *Rules) (output *Output, err error)

var rawRules = []byte(`{...}`) // Raw Rules ~ JSON 

c := colibri.New()
c.Client = ...    // Required
c.Delay = ...     // Optional
c.RobotsTxt = ... // Optional
c.Parser = ...    // Required

var rules colibri.Rules
err := json.Unmarshal(rawRules, &rules)
if err != nil {
	panic(err)
} 

output, err := c.Extract(&rules)
if err != nil {
	panic(err)
}

fmt.Println("URL:", output.Response.URL())
fmt.Println("Status code:", output.Response.StatusCode())
fmt.Println("Content-Type:", output.Response.Header().Get("Content-Type"))
fmt.Println("Data:", output.Data)

Raw Rules ~ JSON

{
	"Method": "string",
	"URL": "string",
	"Proxy": "string",
	"Header": {
		"string": "string",
		"string": ["string", "string", ...]
	},
	"Timeout": "number_millisecond",
	"Cookies": "bool",
	"IgnoreRobotsTxt": "bool",
	"Delay": "number_millisecond",
	"Redirects": "number",
	"Selectors": {...}
}

Selectors

{
	"Selectors": {
		"key_name": "expression"
	}
}

{
	"Selectors": {
		"title": "//head/title"
	}
}

{
	"Selectors": {
		"key_name":  {
			"Expr": "expression",
			"Type": "expression_type",
			"All": "bool",
			"Follow": "bool",
			"Method": "string",
			"Header": {...},
			"Proxy": "string",
			"Timeout": "number_millisecond",
			"Selectors": {...}
		}
	}
}

{
	"Selectors": {
		"title":  {
			"Expr": "//head/title",
			"Type": "xpath"
		}
	}
}

Nested selectors

{
	"Selectors": {
		"body":  {
			"Expr": "//body",
			"Type": "xpath",
			"Selectors": {
				"p": "//p"
			}
		}
	}
}

Find all

{
	"Selectors": {
		"a":  {
			"Expr": "//body/a",
			"Type": "xpath",
			"All": true
		}
	}
}

Follow URLs

{
	"Selectors": {
		"a":  {
			"Expr": "//body/a",
			"Type": "xpath",
			"All": true,
			"Follow": true,
			"Selectors": {
				"title": "//head/title"
			}
		}
	}
}

{
	"Selectors": {
		"a":  {
			"Expr": "//body/a",
			"Type": "xpath",
			"All": true,
			"Follow": true,
			"Proxy": "http://proxy-url.com:8080",
			"Cookies": true,
			"Selectors": {
				"title": "//head/title"
			}
		}
	}
}

Extra Fields

{
	"Selectors": {
		"title":  {
			"Expr": "//head/title",
			"Type": "xpath",
			
			"Required": true
		}
	}
}

Example

{
	"Method": "GET",
	"URL": "https://example.com",
	"Header": {
		"User-Agent": "test/0.1.0"
	},
	"Timeout": 5000,
	"Selectors": {
		"a":  {
			"Expr": "//body/a",
			"Type": "xpath",
			"All": true,
			"Follow": true,
			"Selectors": {
				"title": "//head/title"
			}
		}
	}
}

Name		Name	Last commit message	Last commit date
Latest commit History 1 Commit
parsers		parsers
webextractor		webextractor
LICENSE		LICENSE
README.md		README.md
colibri.go		colibri.go
colibri_test.go		colibri_test.go
errs.go		errs.go
go.mod		go.mod
go.sum		go.sum
rules.go		rules.go
selector.go		selector.go

License

gonzxlez/colibri

Folders and files

Latest commit

History

Repository files navigation

Colibri

Installation

Do

Extract

Raw Rules ~ JSON

Selectors

Nested selectors

Find all

Follow URLs

Extra Fields

Example

About

Resources

License

Stars

Watchers

Forks

Languages