# walker.yaml

# The Walker Configuration File
#
# Every configurable Walker parameter is listed and documented here.
#
# Walker was made to run with sensible defaults. The values set here are the
# defaults. If no walker.yaml file is provided, or if keys are left out, then
# these values will be used.
#
# NOTE: Durations (e.g. http_timeout) are written in the format understood by
# Go's time.ParseDuration call. To quote its documentation:
#
#   A duration string is a possibly signed sequence of decimal numbers,
#   each with optional fraction and a unit suffix, such as "300ms", "-1.5h"
#   or "2h45m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m",
#   "h".
#
# Note that hour, 'h', is the largest time unit supported.

# Fetcher configuration
fetcher:
    # Upper bound on the number of cached domain name resolutions.
    max_dns_cache_entries: 20000

    # Value of the User-Agent header.
    user_agent: "Walker (http://github.com/iParadigms/walker)"

    # Configure which formats this crawler accepts.
    accept_formats:
        - "text/html"
        - "text/*"

    # Which links to accept, based on protocol (a.k.a. scheme).
    accept_protocols:
        - "http"
        - "https"

    # Maximum size of http content, in bytes.
    max_http_content_size_bytes: 20971520  # 20MB

    # When parsing out links for crawling, walker looks at the following tags:
    #   - a, area, form, frame, iframe, script, link, img, object, embed, and meta
    # Tags listed here are ignored; several are ignored by default.
    ignore_tags:
        - "script"
        - "img"
        - "link"

    # Maximum number of links parsed from a single page for further crawling.
    max_links_per_page: 1000

    # Number of simultaneous fetchers your crawlmanager will run.
    num_simultaneous_fetchers: 10

    # If true, walker will not crawl domains that resolve to private IP ranges.
    blacklist_private_ips: true

    # How long a complete http GET is allowed to run before being canceled.
    # Zero indicates no timeout.
    http_timeout: 30s

    # If true, walker will honor the website author's
    # <meta name="ROBOTS" content="noindex"> tags.
    honor_meta_noindex: true

    # If true, walker will honor the website author's
    # <meta name="ROBOTS" content="nofollow"> tags.
    honor_meta_nofollow: false

    # Regex patterns to exclude from the crawl. A link that matches a pattern
    # in this list, but none in include_link_patterns, is excluded.
    exclude_link_patterns: []

    # Regex patterns that override the excludes listed in
    # exclude_link_patterns.
    include_link_patterns: []

    # Crawl delay to use when robots.txt does not specify one.
    default_crawl_delay: 1s

    # Largest crawl delay accepted. The actual crawl delay is the minimum of
    # max_crawl_delay and the Crawl-Delay value read out of a site's
    # robots.txt file.
    max_crawl_delay: 5m

    # Session ids to purge from a URL during normalization. If X is in
    # purge_sid_list, then both http://a.com/path;X=----- and
    # http://a.com/path?X=---- will be turned into http://a.com/path
    purge_sid_list:
        - "jsessionid"
        - "phpsessid"
        - "aspsessionid"

    # How long until Cassandra expires a token on the active_fetchers table.
    active_fetchers_ttl: 15m

    # Expert option: controls how long an in-memory cache of active_fetchers
    # stays valid. Must be greater than zero and less than or equal to 1.
    # The cache time is active_fetchers_ttl * active_fetchers_cacheratio.
    active_fetchers_cacheratio: 0.75

    # Expert option: controls the time between active_fetchers keep-alive
    # updates. Must be greater than zero and less than 1. The keep-alive
    # period is active_fetchers_ttl * active_fetchers_keepratio.
    active_fetchers_keepratio: 0.75

    # Controls the http Keep-Alive setting when fetching pages. One of:
    #   "always"    - always keep a connection alive
    #   "never"     - never keep a connection alive
    #   "threshold" - keep the connection alive if the target site's robots
    #                 Crawl-Delay is less than http_keep_alive_threshold
    http_keep_alive: "always"

    # Only used when http_keep_alive is "threshold" (see above); otherwise
    # this variable is unused.
    http_keep_alive_threshold: 15s

    # The maximum path length that is considered a good URL. URLs with longer
    # paths are completely ignored: not crawled, not inserted into the
    # datastore, etc. This helps users prevent cycles (where a web page refers
    # to itself, only with a slightly longer, and hence distinct, URI).
    # Set this variable <= 0 to ignore URI path length.
    max_path_length: 2048

# Dispatcher configuration
dispatcher:
    # Maximum number of links added to the segments table per dispatch
    # (must be > 0).
    num_links_per_segment: 500

    # Percentage of the links added per dispatch that have already been
    # crawled. For example, refresh_percentage = 25 means that 25% of the links
    # added to segments on the next dispatch will be refreshed (i.e. already
    # crawled) links. Must be >= 0 and <= 100.
    refresh_percentage: 25

    # Number of concurrent dispatching threads run at once (must be > 0).
    num_concurrent_domains: 1

    # Minimum amount of time that must pass between re-crawls of a specific
    # link.
    min_link_refresh_time: 0s

    # After the dispatcher has iterated over all domains and dispatched them,
    # it waits this long before iterating again.
    dispatch_interval: 10s

    # If true, the dispatcher will change links in the datastore that are not
    # normalized (according to the current normalization configuration).
    correct_link_normalization: false

# Cassandra configuration for the datastore.
# Generally these are used to create a gocql.ClusterConfig object
# (https://godoc.org/github.com/gocql/gocql#ClusterConfig).
cassandra:
    # NOTE(review): the connection settings below presumably correspond to the
    # similarly named gocql.ClusterConfig fields -- confirm against the gocql
    # documentation.
    hosts:
        - "localhost"
    timeout: 2s
    cql_version: "3.0.0"
    proto_version: 2
    port: 9042
    num_conns: 2
    num_streams: 128
    discover_hosts: false
    max_prepared_stmts: 1000

    # The keyspace shouldn't generally need to be changed; it is mainly
    # changed in testing as an extra layer of safety.
    keyspace: "walker"

    # Used when defining the initial keyspace.
    # For production clusters we recommend 3 replicas.
    replication_factor: 1

    # Whether to dynamically add newly found domains (or their links) to the
    # crawl (a broad crawl), or to discard them on the assumption that the
    # desired domains are manually seeded.
    add_new_domains: false

    # Number of entries kept in the cassandra datastore's LRU cache of domains.
    # The cache prevents us from querying too frequently to see if we already
    # have them.
    added_domains_cache_size: 20000

    # If true, walker will store the body of the HTTP response along with the
    # link.
    store_response_body: false

    # If true, walker will store the headers of the HTTP response along with
    # the link.
    store_response_headers: false

    # How many times to retry a cassandra query before the query resolves in
    # error.
    num_query_retries: 3

    # The priority that new domains are added with.
    default_domain_priority: 1

# Console specific config
console:
    port: 3000
    template_directory: "console/templates"
    public_folder: "console/public"

    # The maximum priority that the console will accept when configuring domain
    # priority. Set this <= 0 to have no maximum.
    max_allowed_domain_priority: 100