forked from richyen/walker
-
Notifications
You must be signed in to change notification settings - Fork 1
/
walker.yaml
192 lines (151 loc) · 7.46 KB
/
walker.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# The Walker Configuration File
#
# Every configurable Walker parameter is listed and documented here.
#
# Walker was made to run with sensible defaults. The values set here are the
# defaults. If no walker.yaml file is provided, or if keys are left out, then
# these values will be used.
#
# NOTE: Durations (e.g. http_timeout) are written in the format understood by
# Go's time.ParseDuration call. To quote its documentation:
#
# A duration string is a possibly signed sequence of decimal numbers,
# each with optional fraction and a unit suffix, such as "300ms", "-1.5h"
# or "2h45m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m",
# "h".
#
# Note that hour, 'h', is the largest time unit supported.
# Fetcher configuration
fetcher:
  # Maximum number of entries to hold when we cache domain name resolutions
  max_dns_cache_entries: 20000

  # Configure the User-Agent header
  user_agent: Walker (http://github.com/iParadigms/walker)

  # Configure which formats this crawler Accepts
  accept_formats: ["text/html", "text/*"]

  # Which links to accept based on protocol (a.k.a. scheme)
  accept_protocols: ["http", "https"]

  # Maximum size of http content
  max_http_content_size_bytes: 20971520  # 20MB

  # For the purpose of parsing out links for crawling, walker looks at the
  # following tags:
  #   - a, area, form, frame, iframe, script, link, img, object, embed, and meta
  # It ignores several by default.
  ignore_tags: [script, img, link]

  # The maximum number of links to parse from a page for further crawling.
  max_links_per_page: 1000

  # How many simultaneous fetchers will your crawlmanager run
  num_simultaneous_fetchers: 10

  # If true, walker will not crawl domains that resolve in private IP ranges
  blacklist_private_ips: true

  # The duration that the complete HTTP GET is allowed to run before being
  # canceled. Zero indicates no timeout.
  http_timeout: 30s

  # If true, walker will honor the website author's
  # <meta name="ROBOTS" content="noindex"> tags
  honor_meta_noindex: true

  # If true, walker will honor the website author's
  # <meta name="ROBOTS" content="nofollow"> tags
  honor_meta_nofollow: false

  # A list of regex patterns to exclude from the crawl. If a link matches a
  # pattern in this list, but not one in the include_link_patterns
  # list, then it is excluded.
  exclude_link_patterns: []

  # A list of regex patterns that override excludes listed in
  # exclude_link_patterns
  include_link_patterns: []

  # Crawl delay duration to use when unspecified by robots.txt.
  default_crawl_delay: 1s

  # Max crawl delay accepted. To compute the actual crawl delay, walker will
  # use the minimum of max_crawl_delay and the Crawl-Delay value read out of a
  # site's robots.txt file.
  max_crawl_delay: 5m

  # List of session ids to purge from a URL during normalization. If X is in
  # purge_sid_list, then both http://a.com/path;X=----- and
  # http://a.com/path?X=---- will be turned into http://a.com/path
  purge_sid_list: ["jsessionid", "phpsessid", "aspsessionid"]

  # How long until Cassandra will expire a token on the active_fetchers table
  active_fetchers_ttl: 15m

  # Expert option: Controls the length of time an in-memory cache of
  # active_fetchers stays valid. This number should be greater than zero, and
  # less than or equal to 1. The cache time is active_fetchers_ttl *
  # active_fetchers_cacheratio.
  active_fetchers_cacheratio: 0.75

  # Expert option: Controls the length of time between active_fetchers
  # keep-alive updates. This number should be greater than zero, and less
  # than 1. The keep-alive period is active_fetchers_ttl *
  # active_fetchers_keepratio.
  active_fetchers_keepratio: 0.75

  # Controls the http Keep-Alive setting when fetching pages. Can be "always"
  # (to always keep a connection alive), "never" (to never keep a connection
  # alive), or "threshold" (keep the connection alive if the target site's
  # robots.txt Crawl-Delay is less than http_keep_alive_threshold).
  http_keep_alive: "always"

  # If http_keep_alive is set to "threshold", this is the Crawl-Delay value
  # below which connections are kept alive (see above). Otherwise, this
  # variable is unused.
  http_keep_alive_threshold: 15s

  # The maximum path length that is considered a good url. URLs with paths
  # longer than this will be completely ignored: i.e. not crawled, not
  # inserted into the datastore, etc. This variable helps users prevent cycles
  # (where a web-page refers to itself, only with a slightly longer, and hence
  # distinct, URI). Set this variable <= 0 to ignore URI path length.
  max_path_length: 2048
# Dispatcher configuration
dispatcher:
  # Maximum number of links added to segments table per dispatch (must be >0)
  num_links_per_segment: 500

  # refresh_percentage is the percentage of links added per dispatch that have
  # already been crawled. So refresh_percentage = 25 means that 25% of the
  # links added to segments on the next dispatch will be refreshed (i.e.
  # already crawled) links. This value must be >= 0 and <= 100.
  refresh_percentage: 25

  # How many concurrent dispatching threads will be run at once (must be >0)
  num_concurrent_domains: 1

  # A duration specifying the minimum amount of time that must pass between
  # re-crawling a specific link.
  min_link_refresh_time: 0s

  # Once the dispatcher has iterated all domains and dispatched them, it will
  # wait this long before iterating again.
  dispatch_interval: 10s

  # If this variable is true, the dispatcher will change links in the
  # datastore that are not normalized (according to the current normalization
  # configuration).
  correct_link_normalization: false
# Cassandra configuration for the datastore.
# Generally these are used to create a gocql.ClusterConfig object
# (https://godoc.org/github.com/gocql/gocql#ClusterConfig).
#
cassandra:
  hosts: ["localhost"]
  timeout: 2s
  cql_version: "3.0.0"
  proto_version: 2
  port: 9042
  num_conns: 2
  num_streams: 128
  discover_hosts: false
  max_prepared_stmts: 1000

  # keyspace shouldn't generally need to be changed; it is mainly changed in
  # testing as an extra layer of safety.
  keyspace: "walker"

  # replication_factor is used when defining the initial keyspace.
  # For production clusters we recommend 3 replicas.
  replication_factor: 1

  # Whether to dynamically add new-found domains (or their links) to the crawl
  # (a broad crawl) or discard them, assuming desired domains are manually
  # seeded.
  add_new_domains: false

  # The number of entries to keep in the cassandra datastore's LRU cache of
  # domains, preventing us from querying too frequently to see if we already
  # have them.
  added_domains_cache_size: 20000

  # If this is set to true, walker will store the body of the HTTP response
  # along with the link.
  store_response_body: false

  # If this is set to true, walker will store the HTTP headers of the response
  # along with the link.
  store_response_headers: false

  # How many times to retry a cassandra query before the query resolves in
  # error
  num_query_retries: 3

  # The priority new domains will be added with.
  default_domain_priority: 1
# Console specific config
console:
  port: 3000
  template_directory: console/templates
  public_folder: console/public

  # The maximum priority that console will accept when configuring domain
  # priority. Set this <= 0 to have no maximum.
  max_allowed_domain_priority: 100