/
options.go
147 lines (126 loc) · 3.6 KB
/
options.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/*
* @Author: thepoy
* @Email: thepoy@163.com
* @File Name: options.go
* @Created: 2021-07-23 08:58:31
* @Modified: 2022-02-25 10:53:26
*/
package predator
import (
"crypto/tls"
"strings"
"sync"
"github.com/go-predator/cache"
"github.com/go-predator/predator/log"
)
// CrawlerOption is a functional option applied to a Crawler at construction time.
type CrawlerOption func(*Crawler)
// SkipVerification disables TLS certificate verification for
// requests made over the `https` protocol.
func SkipVerification() CrawlerOption {
	return func(c *Crawler) {
		c.client.TLSConfig = &tls.Config{
			InsecureSkipVerify: true,
		}
	}
}
// WithLogger sets the crawler's logger. Passing nil installs a
// fallback logger that writes WARNING-and-above to the console.
func WithLogger(logger *log.Logger) CrawlerOption {
	l := logger
	if l == nil {
		l = log.NewLogger(log.WARNING, log.ToConsole(), 1)
	}
	return func(c *Crawler) {
		c.log = l
	}
}
// WithDefaultLogger installs the fallback logger (console output,
// WARNING level) by delegating to WithLogger with a nil logger.
func WithDefaultLogger() CrawlerOption {
	return WithLogger(nil)
}
// WithUserAgent sets the User-Agent string the crawler sends with
// its requests.
func WithUserAgent(ua string) CrawlerOption {
	return func(c *Crawler) { c.UserAgent = ua }
}
// WithRawCookie parses a raw Cookie header value (e.g.
// "name1=value1; name2=value2") into a map and applies it via
// WithCookies.
//
// Each fragment is split on the FIRST '=' only, so cookie values that
// themselves contain '=' (e.g. base64 payloads) are preserved intact.
// Fragments without any '=' are skipped instead of causing an
// index-out-of-range panic, as the original two-way Split did.
func WithRawCookie(cookie string) CrawlerOption {
	cookies := make(map[string]string)
	for _, pair := range strings.Split(cookie, "; ") {
		kv := strings.SplitN(pair, "=", 2)
		if len(kv) != 2 {
			// Malformed fragment with no '='; ignore rather than panic.
			continue
		}
		cookies[kv[0]] = kv[1]
	}
	return WithCookies(cookies)
}
// WithCookies sets the cookies sent with the crawler's requests.
func WithCookies(cookies map[string]string) CrawlerOption {
	return func(c *Crawler) { c.cookies = cookies }
}
// WithConcurrency enables concurrent crawling with a goroutine pool
// of the given size. blockPanic is stored on the pool; presumably it
// controls how panics inside pooled tasks are handled — confirm in
// the pool implementation.
//
// Panics if the pool cannot be created (e.g. an invalid count).
func WithConcurrency(count uint64, blockPanic bool) CrawlerOption {
	return func(c *Crawler) {
		pool, err := NewPool(count)
		if err != nil {
			panic(err)
		}
		pool.blockPanic = blockPanic
		c.goPool = pool
		c.wg = &sync.WaitGroup{}
	}
}
// RetryConditions reports whether the given response should be treated
// as a failed request and therefore retried.
type RetryConditions func(r Response) bool
// WithRetry sets how many times a failed request is retried, and the
// condition that classifies a response as a failure.
func WithRetry(count uint32, cond RetryConditions) CrawlerOption {
	return func(c *Crawler) {
		c.retryConditions = cond
		c.retryCount = count
	}
}
// WithProxy routes requests through a single proxy, expressed as a
// one-element proxy pool.
func WithProxy(proxyURL string) CrawlerOption {
	return func(c *Crawler) {
		pool := make([]string, 0, 1)
		pool = append(pool, proxyURL)
		c.proxyURLPool = pool
	}
}
// WithProxyPool routes requests through a pool of proxies.
func WithProxyPool(proxyURLs []string) CrawlerOption {
	return func(c *Crawler) { c.proxyURLPool = proxyURLs }
}
// WithComplementProxyPool registers a callback used to replenish the
// proxy pool.
func WithComplementProxyPool(f ComplementProxyPool) CrawlerOption {
	return func(c *Crawler) { c.complementProxyPool = f }
}
// WithCache enables response caching, optionally compressing cached
// responses. It initializes the given Cache immediately when the
// option is applied, and panics if initialization fails.
//
// If cacheCondition is nil, only responses with a 2xx status code are
// cached.
//
// When issuing POST requests with caching enabled, pass zero or more
// cache fields that uniquely identify the request body. If none are
// given, the entire request body is used as the cache identity; since
// Go maps are unordered, the same body may then produce different
// cache keys, so a request can end up cached more than once, or fail
// to hit an existing cache entry.
//
// Fixed: the variadic parameter was misspelled "cacheFileds"; renamed
// to cacheFields (call-compatible — Go callers pass positionally).
func WithCache(cc Cache, compressed bool, cacheCondition CacheCondition, cacheFields ...CacheField) CrawlerOption {
	return func(c *Crawler) {
		cc.Compressed(compressed)
		err := cc.Init()
		if err != nil {
			panic(err)
		}
		c.cache = cc
		if cacheCondition == nil {
			// Default policy: cache successful (2xx) responses only.
			cacheCondition = func(r Response) bool {
				return r.StatusCode/100 == 2
			}
		}
		c.cacheCondition = cacheCondition
		if len(cacheFields) > 0 {
			c.cacheFields = cacheFields
		}
	}
}
// WithDefaultCache enables the default cache backend: SQLite, with
// compression disabled and the default cache condition.
func WithDefaultCache() CrawlerOption {
	return WithCache(new(cache.SQLiteCache), false, nil)
}
// EnableIPv6 lets the underlying client dial both IPv4 and IPv6
// addresses (dual-stack).
func EnableIPv6() CrawlerOption {
	return func(c *Crawler) { c.client.DialDualStack = true }
}