From aeaeef74cbddfd5f3a7b4fa1a25a2de645c956ce Mon Sep 17 00:00:00 2001 From: ziinc Date: Mon, 18 May 2020 17:49:05 +0800 Subject: [PATCH 01/10] Added with option, moved all optional parameters to list opt --- lib/crawly.ex | 77 +++++++++++++++++++++++++++++++++++--------- test/crawly_test.exs | 35 ++++++++++++++++++-- 2 files changed, 94 insertions(+), 18 deletions(-) diff --git a/lib/crawly.ex b/lib/crawly.ex index 930de76a..d23f42cc 100644 --- a/lib/crawly.ex +++ b/lib/crawly.ex @@ -9,32 +9,74 @@ defmodule Crawly do The fetched URL is being converted to a request, and the request is piped through the middlewares specidied in a config (with the exception of - `Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt` these 2 are - ignored) + `Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt`) + + Provide a spider with the `:with` option to fetch a given webpage using that spider. + """ - @spec fetch(url, headers, options) :: HTTPoison.Response.t() + + @spec fetch(url, opts) :: HTTPoison.Response.t() when url: binary(), - headers: [], - options: [] - def fetch(url, headers \\ [], options \\ []) do - request0 = Crawly.Request.new(url, headers, options) + opts: list() + def fetch(url, opts \\ []) do + opts = Enum.into(opts, %{with: nil, request_options: [], headers: []}) + + request0 = + Crawly.Request.new(url, opts[:headers], opts[:request_options]) + |> Map.put( + :middlewares, + Crawly.Utils.get_settings(:middlewares, opts[:with], []) + ) + ignored_middlewares = [ Crawly.Middlewares.DomainFilter, Crawly.Middlewares.RobotsTxt ] - middlewares = request0.middlewares -- ignored_middlewares - {request, _} = Crawly.Utils.pipe(middlewares, request0, %{}) + new_middlewares = request0.middlewares -- ignored_middlewares + + request0 = + Map.put( + request0, + :middlewares, + new_middlewares + ) + + {%{} = request, _} = Crawly.Utils.pipe(request0.middlewares, request0, %{}) - {fetcher, client_options} = Application.get_env( - :crawly, - :fetcher, - {Crawly.Fetchers.HTTPoisonFetcher, []} - ) + {fetcher, client_options} = + Crawly.Utils.get_settings( + :fetcher, + opts[:with], + {Crawly.Fetchers.HTTPoisonFetcher, []} + ) {:ok, response} = fetcher.fetch(request, client_options) - response + + case opts[:with] do + nil -> + # no spider provided, return response as is + response + + _ -> + # spider provided, send response through parse_item callback, pipe through the pipelines + with parsed_result <- parse(response, opts[:with]), + pipelines <- + Crawly.Utils.get_settings( + :pipelines, + opts[:with] + ), + items <- Map.get(parsed_result, :items, []), + pipeline_result <- + Enum.reduce(items, [], fn item, acc -> + {piped, _state} = Crawly.Utils.pipe(pipelines, item, %{}) + + [acc | piped] + end) do + {response, parsed_result, pipeline_result} + end + end end @doc """ @@ -49,13 +91,16 @@ defmodule Crawly do case Kernel.function_exported?(spider, :parse_item, 1) do false -> {:error, :spider_not_found} + true -> spider.parse_item(response) end end @doc """ - Returns a list of known modules which implements Crawly.Spider behaviour + Returns a list of known modules which implements Crawly.Spider behaviour. + + Should not be used for spider management. Use functions defined in `Crawly.Engine` for that. 
""" @spec list_spiders() :: [module()] def list_spiders(), do: Crawly.Utils.list_spiders() diff --git a/test/crawly_test.exs b/test/crawly_test.exs index b9f6b33b..a45a8414 100644 --- a/test/crawly_test.exs +++ b/test/crawly_test.exs @@ -2,7 +2,38 @@ defmodule CrawlyTest do use ExUnit.Case doctest Crawly - test "greets the world" do - assert :test == :test + setup do + :meck.new(CrawlyTestSpider) + + :meck.expect(CrawlyTestSpider, :parse_items, fn resp -> + %{ + items: ["hello"], + requests: [ + Crawly.Utils.request_from_url("https://www.example.com/test") + ] + } + end) + + on_exit(fn -> + :meck.unload(CrawlyTestSpider) + end) + end + + test "fetch/1 is able to fetch a given url using global config, returns a response" do + assert %HTTPoison.Response{} = Crawly.fetch("https://example.com") + end + + test "fetch/2 with :with option provided returns the response, parsed_item result, and processed ParsedItems" do + assert {%HTTPoison.Response{}, parsed_items_res, parsed_items} = + Crawly.fetch("http://example.com", with: CrawlyTestSpider) + + assert %{ + items: items, + requests: requests + } = parsed_items_res + + assert is_list(parsed_items) + assert length(parsed_items) == 1 + assert ["hello"] = parsed_items end end From 7944ec2962c1e0ad5c7565c2dddf0a26b1be3bc6 Mon Sep 17 00:00:00 2001 From: Ziinc Date: Mon, 2 Nov 2020 14:01:02 +0800 Subject: [PATCH 02/10] fixed failing test, added mock for settings override in mocked spider --- lib/crawly.ex | 26 +++++++++++++++++++------- test/crawly_test.exs | 31 +++++++++++++++++++------------ 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/lib/crawly.ex b/lib/crawly.ex index d23f42cc..d11f4ff3 100644 --- a/lib/crawly.ex +++ b/lib/crawly.ex @@ -15,10 +15,17 @@ defmodule Crawly do """ + @type with_opt :: {:with, nil | module()} + @type request_opt :: {:request_options, list(Crawly.Request.option())} + @type headers_opt :: {:headers, list(Crawly.Request.header())} @spec fetch(url, opts) :: HTTPoison.Response.t() when url: binary(), - opts: list() + opts: [ + with_opt + | request_opt + | headers_opt + ] def fetch(url, opts \\ []) do opts = Enum.into(opts, %{with: nil, request_options: [], headers: []}) @@ -68,13 +75,18 @@ defmodule Crawly do opts[:with] ), items <- Map.get(parsed_result, :items, []), - pipeline_result <- - Enum.reduce(items, [], fn item, acc -> - {piped, _state} = Crawly.Utils.pipe(pipelines, item, %{}) - - [acc | piped] + {pipeline_result, pipeline_state} <- + Enum.reduce(items, {[], %{}}, fn item, {acc, state} -> + {piped, state} = Crawly.Utils.pipe(pipelines, item, state) + + if piped == false do + # dropped + {acc, state} + else + {[piped | acc], state} + end end) do - {response, parsed_result, pipeline_result} + {response, parsed_result, pipeline_result, pipeline_state} end end end diff --git a/test/crawly_test.exs b/test/crawly_test.exs index a45a8414..f6d30d01 100644 --- a/test/crawly_test.exs +++ b/test/crawly_test.exs @@ -3,37 +3,44 @@ defmodule CrawlyTest do doctest Crawly setup do - :meck.new(CrawlyTestSpider) + :meck.new(CrawlyTestSpider, [:non_strict]) - :meck.expect(CrawlyTestSpider, :parse_items, fn resp -> + :meck.expect(CrawlyTestSpider, :parse_item, fn _resp -> %{ - items: ["hello"], + items: [%{content: "hello"}], requests: [ Crawly.Utils.request_from_url("https://www.example.com/test") ] } end) + :meck.expect(CrawlyTestSpider, :override_settings, fn -> + [pipelines: [Crawly.Pipelines.JSONEncoder]] + end) + on_exit(fn -> - :meck.unload(CrawlyTestSpider) + :meck.unload() end) + + {:ok, spider_module: 
CrawlyTestSpider} end test "fetch/1 is able to fetch a given url using global config, returns a response" do assert %HTTPoison.Response{} = Crawly.fetch("https://example.com") end - test "fetch/2 with :with option provided returns the response, parsed_item result, and processed ParsedItems" do - assert {%HTTPoison.Response{}, parsed_items_res, parsed_items} = - Crawly.fetch("http://example.com", with: CrawlyTestSpider) + test "fetch/2 with :with option provided returns the response, parsed_item result, and processed ParsedItems", + %{spider_module: spider_module} do + assert {%HTTPoison.Response{}, parsed_item_res, parsed_items, + pipeline_state} = + Crawly.fetch("http://example.com", with: spider_module) assert %{ - items: items, + items: [_], requests: requests - } = parsed_items_res + } = parsed_item_res - assert is_list(parsed_items) - assert length(parsed_items) == 1 - assert ["hello"] = parsed_items + assert [encoded] = parsed_items + assert encoded =~ "hello" end end From beebfeea677b6024f89a62a7acccadb1d1d7da90 Mon Sep 17 00:00:00 2001 From: Ziinc Date: Mon, 2 Nov 2020 14:09:06 +0800 Subject: [PATCH 03/10] added typespecs and docs --- lib/crawly.ex | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/lib/crawly.ex b/lib/crawly.ex index d11f4ff3..7eff4973 100644 --- a/lib/crawly.ex +++ b/lib/crawly.ex @@ -13,13 +13,31 @@ defmodule Crawly do Provide a spider with the `:with` option to fetch a given webpage using that spider. + ### Fetching with a spider + To fetch a response from a url with a spider, define your spider, and pass the module name to the `:with` option. + iex> Crawly.fetch("https://www.example.com", with: MySpider) + {%HTTPoison.Response{...}, %{...}, [...], %{...}} + + Using the `:with` option will return a 4 item tuple: + + 1. The HTTPoison response + 2. The result returned from the `parse_item/1` callback + 3. The list of items that have been processed by the declared item pipelines. + 4. The pipeline state, included for debugging purposes. """ @type with_opt :: {:with, nil | module()} @type request_opt :: {:request_options, list(Crawly.Request.option())} @type headers_opt :: {:headers, list(Crawly.Request.header())} - @spec fetch(url, opts) :: HTTPoison.Response.t() + @type parsed_item_result :: Crawly.ParsedItem.t() + @type parsed_items :: list(any()) + @type pipeline_state :: %{optional(atom()) => any()} + + @spec fetch(url, opts) :: + HTTPoison.Response.t() + | {HTTPoison.Response.t(), parsed_item_result, parsed_items, + pipeline_state} when url: binary(), opts: [ with_opt From a3af9b8615ed8fb19a183c8b7165ae4f70029a30 Mon Sep 17 00:00:00 2001 From: Ziinc Date: Mon, 2 Nov 2020 14:09:41 +0800 Subject: [PATCH 04/10] corrected typo in docs --- lib/crawly.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/crawly.ex b/lib/crawly.ex index 7eff4973..fae97481 100644 --- a/lib/crawly.ex +++ b/lib/crawly.ex @@ -8,7 +8,7 @@ defmodule Crawly do when you need to get individual pages and parse them. The fetched URL is being converted to a request, and the request is piped - through the middlewares specidied in a config (with the exception of + through the middlewares specified in a config (with the exception of `Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt`) Provide a spider with the `:with` option to fetch a given webpage using that spider. 
From 675e1c42d61890ac637e23427688b2ed085c8cbc Mon Sep 17 00:00:00 2001
From: Ziinc
Date: Wed, 11 Nov 2020 23:11:16 +0800
Subject: [PATCH 05/10] remove doctest execution for Crawly module

---
 test/crawly_test.exs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/crawly_test.exs b/test/crawly_test.exs
index f6d30d01..7dbd86db 100644
--- a/test/crawly_test.exs
+++ b/test/crawly_test.exs
@@ -1,6 +1,5 @@
 defmodule CrawlyTest do
   use ExUnit.Case
-  doctest Crawly
 
   setup do
     :meck.new(CrawlyTestSpider, [:non_strict])
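
Usage overview for the series: the sketch below shows how the new `:with` option introduced by these patches is intended to be used end to end. The `MySpider` module is hypothetical — it simply mirrors the mocked `CrawlyTestSpider` from the test suite (one parsed item, one follow-up request, and a `Crawly.Pipelines.JSONEncoder` override) — and the 4-element return shape follows the docs added in PATCH 03.

    defmodule MySpider do
      use Crawly.Spider

      def base_url(), do: "https://www.example.com"

      def init(), do: [start_urls: ["https://www.example.com"]]

      def parse_item(_response) do
        # Return a Crawly.ParsedItem-shaped map, as the mocked spider does.
        %{
          items: [%{content: "hello"}],
          requests: [Crawly.Utils.request_from_url("https://www.example.com/test")]
        }
      end

      # Spider-level settings override, picked up via Crawly.Utils.get_settings/3.
      def override_settings(), do: [pipelines: [Crawly.Pipelines.JSONEncoder]]
    end

    # Without :with, fetch/2 still returns only the HTTPoison response:
    %HTTPoison.Response{} = Crawly.fetch("https://www.example.com")

    # With :with, the response is parsed by parse_item/1 and each item is
    # piped through the spider's pipelines, yielding the 4-element tuple:
    {response, parsed_item_result, pipeline_result, pipeline_state} =
      Crawly.fetch("https://www.example.com", with: MySpider)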