diff --git a/lib/crawly/parsers/links_extractor.ex b/lib/crawly/parsers/links_extractor.ex
new file mode 100644
index 00000000..21fe9910
--- /dev/null
+++ b/lib/crawly/parsers/links_extractor.ex
@@ -0,0 +1,42 @@
+defmodule Crawly.Parsers.RequestsExtractor do
+  @moduledoc """
+  Links extractor parser helper, which simplifies the process
+  of links extraction.
+  """
+
+  alias Crawly.Spider.Parse
+
+  @doc """
+  Extracts follow-up requests from the response held in `parse_struct`.
+
+  The response body is parsed with Floki, the elements matching the struct's
+  `selector` are looked up, and their `href` values are turned into absolute
+  URLs against `response.request_url` and then into requests via
+  `Crawly.Utils`. The new requests are placed in front of the requests
+  already present in `parsed_item`.
+
+  Returns the updated parse struct. `Floki.parse_document!/1` raises when the
+  body cannot be parsed.
+  """
+  @spec parse(Parse.t()) :: Parse.t()
+  def parse(
+        %Parse{
+          response: response,
+          selector: selector,
+          parsed_item: %Crawly.ParsedItem{} = parsed_item
+        } = parse_struct
+      ) do
+    requests =
+      response.body
+      |> Floki.parse_document!()
+      |> Floki.find(selector)
+      |> Floki.attribute("href")
+      |> Crawly.Utils.build_absolute_urls(response.request_url)
+      |> Crawly.Utils.requests_from_urls()
+
+    # Newly extracted requests come first, ahead of those already collected.
+    new_parsed_item = %{parsed_item | requests: requests ++ parsed_item.requests}
+
+    %{parse_struct | parsed_item: new_parsed_item}
+  end
+end
diff --git a/lib/crawly/parsers/parse.ex b/lib/crawly/parsers/parse.ex
new file mode 100644
index 00000000..f144ecaa
--- /dev/null
+++ b/lib/crawly/parsers/parse.ex
@@ -0,0 +1,30 @@
+defmodule Crawly.Spider.Parse do
+  @moduledoc """
+  The struct that is piped through a spider's declared list of parsers
+  (which implement the pipeline behaviour) as the parse state.
+
+  The response is loaded into this struct and piped through a parse
+  pipeline if the `:parse` setting key is set.
+
+  ## Fields
+
+    * `:response` - the response being parsed (`nil` until one is loaded)
+    * `:spider_name` - the spider name as an atom (`nil` by default)
+    * `:selector` - the selector handed to parsers (`nil` by default)
+    * `:parsed_item` - the `Crawly.ParsedItem` accumulated so far
+  """
+
+  defstruct response: nil,
+            spider_name: nil,
+            selector: nil,
+            parsed_item: %Crawly.ParsedItem{}
+
+  # `:response` and `:selector` default to `nil`, so the type must admit `nil`
+  # for a bare `%Crawly.Spider.Parse{}` to satisfy its own spec.
+  @type t :: %__MODULE__{
+          spider_name: atom(),
+          response: Crawly.Response.t() | nil,
+          selector: String.t() | nil,
+          parsed_item: %Crawly.ParsedItem{}
+        }
+end