An attempt to make a general parser based on ideas in PR #150
oltarasenko committed Dec 29, 2020
1 parent b5675e8 commit f1a6df2
Showing 2 changed files with 48 additions and 0 deletions.
29 changes: 29 additions & 0 deletions lib/crawly/parsers/links_extractor.ex
@@ -0,0 +1,29 @@
defmodule Crawly.Parsers.RequestsExtractor do
  @moduledoc """
  A parser helper that simplifies extracting links from a response
  and turning them into requests.
  """

  alias Crawly.Spider.Parse

  def parse(parse_struct) do
    %Parse{response: response, selector: selector} = parse_struct

    # Extract the href attributes matching the selector and convert
    # them into requests with absolute URLs.
    requests =
      response.body
      |> Floki.parse_document!()
      |> Floki.find(selector)
      |> Floki.attribute("href")
      |> Crawly.Utils.build_absolute_urls(response.request_url)
      |> Crawly.Utils.requests_from_urls()

    # Append the newly extracted requests to those already accumulated
    # in the parsed item.
    new_parsed_requests = requests ++ parse_struct.parsed_item.requests

    new_parsed_item = %Crawly.ParsedItem{
      parse_struct.parsed_item
      | requests: new_parsed_requests
    }

    %Parse{parse_struct | parsed_item: new_parsed_item}
  end
end
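
For context, a minimal sketch of how this parser might be exercised on its own, outside a configured parse pipeline. The %Crawly.Response{} fields and the empty %Crawly.ParsedItem{} defaults are assumptions inferred from how parse/1 uses them above, and MySpider is a hypothetical spider module.

# A minimal sketch, assuming %Crawly.Response{} exposes :body and :request_url
# (as used in parse/1 above) and %Crawly.ParsedItem{} defaults to empty lists.
alias Crawly.Spider.Parse

parse_state = %Parse{
  response: %Crawly.Response{
    body: "<a href=\"/blog/post-1\">Post 1</a>",
    request_url: "https://example.com/blog"
  },
  spider_name: MySpider,            # hypothetical spider module
  selector: "a",                    # CSS selector passed to Floki.find/2
  parsed_item: %Crawly.ParsedItem{}
}

%Parse{parsed_item: %Crawly.ParsedItem{requests: requests}} =
  Crawly.Parsers.RequestsExtractor.parse(parse_state)

# requests now holds Crawly.Request structs with absolute URLs,
# e.g. https://example.com/blog/post-1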
19 changes: 19 additions & 0 deletions lib/crawly/parsers/parse.ex
@@ -0,0 +1,19 @@
defmodule Crawly.Spider.Parse do
  @moduledoc """
  The struct piped, as the parse state, through a spider's declared list of
  parsers (each of which implements the pipeline behaviour).

  The response is loaded into this struct and piped through the parse
  pipeline when the `:parse` setting key is set.
  """

  defstruct response: nil,
            spider_name: nil,
            selector: nil,
            parsed_item: %Crawly.ParsedItem{}

  @type t :: %__MODULE__{
          spider_name: atom(),
          response: Crawly.Response.t(),
          selector: String.t(),
          parsed_item: Crawly.ParsedItem.t()
        }
end
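
The moduledoc above describes this struct being piped through a spider's list of parsers. A minimal sketch of how such a pipeline could fold the parsers over the struct follows; the run_parse_pipeline/3 function, the shape of the parsers list, and the selector value are illustrative assumptions, not Crawly's actual API.

# A sketch of the folding step the moduledoc describes; names and the
# parsers-list shape are hypothetical, not taken from Crawly itself.
defmodule ParsePipelineSketch do
  alias Crawly.Spider.Parse

  # Each parser module is expected to take a %Parse{} and return an
  # updated %Parse{}, so the list can simply be reduced over the struct.
  def run_parse_pipeline(response, spider_name, parsers) do
    initial = %Parse{
      response: response,
      spider_name: spider_name,
      selector: "a",                     # illustrative selector
      parsed_item: %Crawly.ParsedItem{}
    }

    Enum.reduce(parsers, initial, fn parser, state -> parser.parse(state) end)
  end
end

# Usage:
#   ParsePipelineSketch.run_parse_pipeline(response, MySpider,
#     [Crawly.Parsers.RequestsExtractor])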
