Skip to content

Commit

Permalink
Add new fetch and parse functions
Browse files Browse the repository at this point in the history
Change fetch function so it sends requests through all middlewares
Add parse function which allows to see how given page is going to be
parsed by a given spider.
  • Loading branch information
oltarasenko committed Feb 17, 2020
1 parent 2d530e0 commit 0ac9c7f
Showing 1 changed file with 34 additions and 3 deletions.
37 changes: 34 additions & 3 deletions lib/crawly.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,52 @@ defmodule Crawly do

@doc """
Fetches a given url. This function is mainly used for the spiders development
when you need to get individual pages and parse them.

The fetched URL is converted to a request, and the request is piped
through the middlewares specified in the config (with the exception of
`Crawly.Middlewares.DomainFilter` and `Crawly.Middlewares.RobotsTxt`, which
are ignored).

The request is then sent with the fetcher configured under the `:fetcher` key
of the `:crawly` application environment (defaulting to
`Crawly.Fetchers.HTTPoisonFetcher` with no client options).

Raises a `MatchError` if the fetcher does not return `{:ok, response}`.
"""
@spec fetch(url, headers, options) :: HTTPoison.Response.t()
      when url: binary(),
           headers: list(),
           options: list()
def fetch(url, headers \\ [], options \\ []) do
  request0 = Crawly.Request.new(url, headers, options)

  # These middlewares are skipped so that a one-off fetch is not filtered out.
  ignored_middlewares = [
    Crawly.Middlewares.DomainFilter,
    Crawly.Middlewares.RobotsTxt
  ]

  middlewares = request0.middlewares -- ignored_middlewares

  # NOTE(review): if a remaining middleware can drop the request, `request`
  # may not be a request struct here - confirm against Crawly.Utils.pipe/3.
  {request, _} = Crawly.Utils.pipe(middlewares, request0, %{})

  {fetcher, client_options} =
    Application.get_env(
      :crawly,
      :fetcher,
      {Crawly.Fetchers.HTTPoisonFetcher, []}
    )

  # Fetch exactly once and unwrap the successful response.
  {:ok, response} = fetcher.fetch(request, client_options)
  response
end

@doc """
Parses a given response with a given spider. Allows you to quickly see the
outcome of the spider's `parse_item/1` implementation.

Returns whatever `spider.parse_item(response)` returns, or
`{:error, :spider_not_found}` when `spider` cannot be loaded or does not
export `parse_item/1`.
"""
@spec parse(response, spider) :: result | {:error, :spider_not_found}
      when response: Crawly.Response.t(),
           spider: atom(),
           result: Crawly.ParsedItem.t()
def parse(response, spider) do
  # `function_exported?/3` does not load modules, so make sure the spider is
  # loaded first; otherwise a valid but not-yet-loaded spider would be
  # reported as missing.
  if Code.ensure_loaded?(spider) and function_exported?(spider, :parse_item, 1) do
    spider.parse_item(response)
  else
    {:error, :spider_not_found}
  end
end
end

0 comments on commit 0ac9c7f

Please sign in to comment.