An attempt to make a general parser based on ideas in PR #150
oltarasenko committed Dec 29, 2020
1 parent b5675e8 commit f1a6df2
Showing 2 changed files with 48 additions and 0 deletions.
29 changes: 29 additions & 0 deletions lib/crawly/parsers/links_extractor.ex
@@ -0,0 +1,29 @@
defmodule Crawly.Parsers.RequestsExtractor do
  @moduledoc """
  A parser helper that simplifies extracting links from a response
  and turning them into requests.
  """

  alias Crawly.Spider.Parse

  def parse(parse_struct) do
    %Parse{response: response, selector: selector} = parse_struct

    # Extract the href attributes matching the selector and convert
    # them into requests with absolute URLs.
    requests =
      response.body
      |> Floki.parse_document!()
      |> Floki.find(selector)
      |> Floki.attribute("href")
      |> Crawly.Utils.build_absolute_urls(response.request_url)
      |> Crawly.Utils.requests_from_urls()

    # Append the newly extracted requests to those already accumulated
    # in the parsed item.
    new_parsed_requests = requests ++ parse_struct.parsed_item.requests

    new_parsed_item = %Crawly.ParsedItem{
      parse_struct.parsed_item
      | requests: new_parsed_requests
    }

    %Parse{parse_struct | parsed_item: new_parsed_item}
  end
end
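
For context, a minimal sketch of how this parser might be exercised on its own, outside a configured parse pipeline. The %Crawly.Response{} fields and the empty %Crawly.ParsedItem{} defaults are assumptions inferred from how parse/1 uses them above, and MySpider is a hypothetical spider module.

# A minimal sketch, assuming %Crawly.Response{} exposes :body and :request_url
# (as used in parse/1 above) and %Crawly.ParsedItem{} defaults to empty lists.
alias Crawly.Spider.Parse

parse_state = %Parse{
  response: %Crawly.Response{
    body: "<a href=\"/blog/post-1\">Post 1</a>",
    request_url: "https://example.com/blog"
  },
  spider_name: MySpider,            # hypothetical spider module
  selector: "a",                    # CSS selector passed to Floki.find/2
  parsed_item: %Crawly.ParsedItem{}
}

%Parse{parsed_item: %Crawly.ParsedItem{requests: requests}} =
  Crawly.Parsers.RequestsExtractor.parse(parse_state)

# requests now holds Crawly.Request structs with absolute URLs,
# e.g. https://example.com/blog/post-1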
19 changes: 19 additions & 0 deletions lib/crawly/parsers/parse.ex
@@ -0,0 +1,19 @@
defmodule Crawly.Spider.Parse do
  @moduledoc """
  The struct piped, as the parse state, through a spider's declared list of
  parsers (each of which implements the pipeline behaviour).

  The response is loaded into this struct and piped through the parse
  pipeline when the `:parse` setting key is set.
  """

  defstruct response: nil,
            spider_name: nil,
            selector: nil,
            parsed_item: %Crawly.ParsedItem{}

  @type t :: %__MODULE__{
          spider_name: atom(),
          response: Crawly.Response.t(),
          selector: String.t(),
          parsed_item: Crawly.ParsedItem.t()
        }
end
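
The moduledoc above describes this struct being piped through a spider's list of parsers. A minimal sketch of how such a pipeline could fold the parsers over the struct follows; the run_parse_pipeline/3 function, the shape of the parsers list, and the selector value are illustrative assumptions, not Crawly's actual API.

# A sketch of the folding step the moduledoc describes; names and the
# parsers-list shape are hypothetical, not taken from Crawly itself.
defmodule ParsePipelineSketch do
  alias Crawly.Spider.Parse

  # Each parser module is expected to take a %Parse{} and return an
  # updated %Parse{}, so the list can simply be reduced over the struct.
  def run_parse_pipeline(response, spider_name, parsers) do
    initial = %Parse{
      response: response,
      spider_name: spider_name,
      selector: "a",                     # illustrative selector
      parsed_item: %Crawly.ParsedItem{}
    }

    Enum.reduce(parsers, initial, fn parser, state -> parser.parse(state) end)
  end
end

# Usage:
#   ParsePipelineSketch.run_parse_pipeline(response, MySpider,
#     [Crawly.Parsers.RequestsExtractor])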
