Skip to content

gonzxlez/colibri

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

1 Commit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Colibri

Colibri is an extensible web crawling and scraping framework for Go, used to crawl and extract structured data on the web.

See webextractor.

Installation

 $ go get github.com/gonzxlez/colibri

Do

// Do makes an HTTP request based on the rules.
func (c *Colibri) Do(rules *Rules) (resp Response, err error) 
var rawRules = []byte(`{...}`) // Raw Rules ~ JSON 

c := colibri.New()
c.Client = ...    // Required
c.Delay = ...     // Optional
c.RobotsTxt = ... // Optional
c.Parser = ...    // Optional

var rules colibri.Rules
err := json.Unmarshal(rawRules, &rules)
if err != nil {
	panic(err)
} 

resp, err := c.Do(&rules)
if err != nil {
	panic(err)
}

fmt.Println("URL:", resp.URL())
fmt.Println("Status code:", resp.StatusCode())
fmt.Println("Content-Type:", resp.Header().Get("Content-Type"))

Extract

// Extract makes the HTTP request and parses the content of the response based on the rules.
func (c *Colibri) Extract(rules *Rules) (output *Output, err error)
var rawRules = []byte(`{...}`) // Raw Rules ~ JSON 

c := colibri.New()
c.Client = ...    // Required
c.Delay = ...     // Optional
c.RobotsTxt = ... // Optional
c.Parser = ...    // Required

var rules colibri.Rules
err := json.Unmarshal(rawRules, &rules)
if err != nil {
	panic(err)
} 

output, err := c.Extract(&rules)
if err != nil {
	panic(err)
}

fmt.Println("URL:", output.Response.URL())
fmt.Println("Status code:", output.Response.StatusCode())
fmt.Println("Content-Type:", output.Response.Header().Get("Content-Type"))
fmt.Println("Data:", output.Data)

Raw Rules ~ JSON

{
	"Method": "string",
	"URL": "string",
	"Proxy": "string",
	"Header": {
		"string": "string",
		"string": ["string", "string", ...]
	},
	"Timeout": "number_millisecond",
	"Cookies": "bool",
	"IgnoreRobotsTxt": "bool",
	"Delay": "number_millisecond",
	"Redirects": "number",
	"Selectors": {...}
}

Selectors

{
	"Selectors": {
		"key_name": "expression"
	}
}
{
	"Selectors": {
		"title": "//head/title"
	}
}
{
	"Selectors": {
		"key_name":  {
			"Expr": "expression",
			"Type": "expression_type",
			"All": "bool",
			"Follow": "bool",
			"Method": "string",
			"Header": {...},
			"Proxy": "string",
			"Timeout": "number_millisecond",
			"Selectors": {...}
		}
	}
}
{
	"Selectors": {
		"title":  {
			"Expr": "//head/title",
			"Type": "xpath"
		}
	}
}

Nested selectors

{
	"Selectors": {
		"body":  {
			"Expr": "//body",
			"Type": "xpath",
			"Selectors": {
				"p": "//p"
			}
		}
	}
}

Find all

{
	"Selectors": {
		"a":  {
			"Expr": "//body/a",
			"Type": "xpath",
			"All": true
		}
	}
}

Follow URLs

{
	"Selectors": {
		"a":  {
			"Expr": "//body/a",
			"Type": "xpath",
			"All": true,
			"Follow": true,
			"Selectors": {
				"title": "//head/title"
			}
		}
	}
}
{
	"Selectors": {
		"a":  {
			"Expr": "//body/a",
			"Type": "xpath",
			"All": true,
			"Follow": true,
			"Proxy": "http://proxy-url.com:8080",
			"Cookies": true,
			"Selectors": {
				"title": "//head/title"
			}
		}
	}
}

Extra Fields

{
	"Selectors": {
		"title":  {
			"Expr": "//head/title",
			"Type": "xpath",
			
			"Required": true
		}
	}
}

Example

{
	"Method": "GET",
	"URL": "https://example.com",
	"Header": {
		"User-Agent": "test/0.1.0"
	},
	"Timeout": 5000,
	"Selectors": {
		"a":  {
			"Expr": "//body/a",
			"Type": "xpath",
			"All": true,
			"Follow": true,
			"Selectors": {
				"title": "//head/title"
			}
		}
	}
}

About

Colibri is an extensible web crawling and scraping framework for Go, used to crawl and extract structured data on the web.

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages