diff --git a/lib/crawly.ex b/lib/crawly.ex index 288f2d1f..adcac66f 100644 --- a/lib/crawly.ex +++ b/lib/crawly.ex @@ -5,14 +5,27 @@ defmodule Crawly do @doc """ Fetches a given url. This function is mainly used for the spiders development - when you need to get individual pages and parse them + when you need to get individual pages and parse them. + + The fetched URL is being converted to a request, and the request is piped + through the middlewares specified in a config (with the exception of + `Crawly.Middlewares.DomainFilter` and `Crawly.Middlewares.RobotsTxt`, which are + ignored) + """ @spec fetch(url, headers, options) :: HTTPoison.Response.t() when url: binary(), headers: [], options: [] def fetch(url, headers \\ [], options \\ []) do - request = Crawly.Request.new(url, headers, options) + request0 = Crawly.Request.new(url, headers, options) + ignored_middlewares = [ + Crawly.Middlewares.DomainFilter, + Crawly.Middlewares.RobotsTxt + ] + middlewares = request0.middlewares -- ignored_middlewares + + {request, _} = Crawly.Utils.pipe(middlewares, request0, %{}) {fetcher, client_options} = Application.get_env( :crawly, @@ -20,6 +33,24 @@ defmodule Crawly do {Crawly.Fetchers.HTTPoisonFetcher, []} ) - fetcher.fetch(request, client_options) + {:ok, response} = fetcher.fetch(request, client_options) + response + end + + @doc """ + Parses a given response with a given spider. Allows to quickly see the outcome + of the given :parse_item implementation. + """ + @spec parse(response, spider) :: {:ok, result} + when response: Crawly.Response.t(), + spider: atom(), + result: Crawly.ParsedItem.t() + def parse(response, spider) do + case Kernel.function_exported?(spider, :parse_item, 1) do + false -> + {:error, :spider_not_found} + true -> + spider.parse_item(response) + end end end