-
Notifications
You must be signed in to change notification settings - Fork 109
/
crawly.ex
137 lines (114 loc) · 4.32 KB
/
crawly.ex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
defmodule Crawly do
  @moduledoc """
  Crawly is a fast high-level web crawling & scraping framework for Elixir.
  """

  @type with_opt :: {:with, nil | module()}
  @type request_opt :: {:request_options, list(Crawly.Request.option())}
  @type headers_opt :: {:headers, list(Crawly.Request.header())}
  @type parsed_item_result :: Crawly.ParsedItem.t()
  @type parsed_items :: list(any())
  @type pipeline_state :: %{optional(atom()) => any()}

  # Middlewares that are never applied to requests built by `fetch/2`.
  @ignored_middlewares [
    Crawly.Middlewares.DomainFilter,
    Crawly.Middlewares.RobotsTxt
  ]

  @doc """
  Fetches a given url. This function is mainly used for the spiders development
  when you need to get individual pages and parse them.

  The fetched URL is converted to a request, and the request is piped through
  the middlewares specified in a config (with the exception of
  `Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt`).

  ### Options

    * `:with` - a spider module used to parse the response (default: `nil`).
    * `:headers` - headers passed to `Crawly.Request.new/3` (default: `[]`).
    * `:request_options` - options passed to `Crawly.Request.new/3`
      (default: `[]`).

  Without the `:with` option the fetcher response is returned as is.

  ### Fetching with a spider

  To fetch a response from a url with a spider, define your spider, and pass
  the module name to the `:with` option.

      iex> Crawly.fetch("https://www.example.com", with: MySpider)
      {%HTTPoison.Response{...}, %{...}, [...], %{...}}

  Using the `:with` option will return a 4 item tuple:

    1. The HTTPoison response
    2. The result returned from the `parse_item/1` callback
    3. The list of items that have been processed by the declared item
       pipelines, in the order they were returned by `parse_item/1`. Items for
       which a pipeline returned `false` are dropped from this list.
    4. The pipeline state, included for debugging purposes.

  If the module given in `:with` does not export `parse_item/1`,
  `{:error, :spider_not_found}` is returned.
  """
  @spec fetch(url, opts) ::
          HTTPoison.Response.t()
          | {HTTPoison.Response.t(), parsed_item_result, parsed_items,
             pipeline_state}
          | {:error, :spider_not_found}
        when url: binary(),
             opts: [
               with_opt
               | request_opt
               | headers_opt
             ]
  def fetch(url, opts \\ []) do
    %{with: spider, headers: headers, request_options: request_options} =
      Enum.into(opts, %{with: nil, request_options: [], headers: []})

    middlewares =
      Crawly.Utils.get_settings(:middlewares, spider, []) --
        @ignored_middlewares

    request0 =
      url
      |> Crawly.Request.new(headers, request_options)
      |> Map.put(:middlewares, middlewares)

    # Assertive match: the middlewares are expected to hand back a request map.
    {%{} = request, _middlewares_state} =
      Crawly.Utils.pipe(middlewares, request0, %{})

    {fetcher, client_options} =
      Crawly.Utils.get_settings(
        :fetcher,
        spider,
        {Crawly.Fetchers.HTTPoisonFetcher, []}
      )

    {:ok, response} = fetcher.fetch(request, client_options)

    case spider do
      # no spider provided, return response as is
      nil -> response
      # spider provided, parse the response and run the item pipelines
      _ -> process_with_spider(response, spider)
    end
  end

  @doc """
  Parses a given response with a given spider. Allows to quickly see the outcome
  of the given :parse_item implementation.

  Returns whatever `spider.parse_item/1` returns, or
  `{:error, :spider_not_found}` when `spider` cannot be loaded or does not
  export `parse_item/1`.
  """
  @spec parse(response, spider) :: result | {:error, :spider_not_found}
        when response: Crawly.Response.t(),
             spider: atom(),
             result: Crawly.ParsedItem.t()
  def parse(response, spider) do
    # `function_exported?/3` does not load modules, so make sure the spider is
    # loaded first; otherwise a compiled but not yet loaded spider is missed.
    if Code.ensure_loaded?(spider) and
         function_exported?(spider, :parse_item, 1) do
      spider.parse_item(response)
    else
      {:error, :spider_not_found}
    end
  end

  @doc """
  Returns a list of known modules which implements Crawly.Spider behaviour.

  Should not be used for spider management. Use functions defined in
  `Crawly.Engine` for that.
  """
  @spec list_spiders() :: [module()]
  def list_spiders(), do: Crawly.Utils.list_spiders()

  # Sends the response through the spider's `parse_item/1` callback and pipes
  # every parsed item through the configured item pipelines.
  defp process_with_spider(response, spider) do
    case parse(response, spider) do
      {:error, _reason} = error ->
        error

      parsed_result ->
        # Default to no pipelines, mirroring the `:middlewares` lookup.
        pipelines = Crawly.Utils.get_settings(:pipelines, spider, [])

        {pipeline_result, pipeline_state} =
          parsed_result
          |> Map.get(:items, [])
          |> run_item_pipelines(pipelines)

        {response, parsed_result, pipeline_result, pipeline_state}
    end
  end

  # Pipes each item through the pipelines, threading the pipeline state from
  # one item to the next. Returns the surviving items in their original order
  # together with the final pipeline state.
  defp run_item_pipelines(items, pipelines) do
    {processed, final_state} =
      Enum.reduce(items, {[], %{}}, fn item, {acc, state} ->
        case Crawly.Utils.pipe(pipelines, item, state) do
          # dropped
          {false, new_state} -> {acc, new_state}
          {piped, new_state} -> {[piped | acc], new_state}
        end
      end)

    # Items were prepended while reducing, so restore the parse order.
    {Enum.reverse(processed), final_state}
  end
end