Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try crawly with a modern Elixir #221

Merged
merged 4 commits into from
Sep 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ jobs:
build:
docker:
# specify the version here
- image: circleci/elixir:1.10
- image: elixir:1.14

working_directory: ~/repo
steps:
Expand Down
2 changes: 1 addition & 1 deletion .tool-versions
Original file line number Diff line number Diff line change
@@ -1 +1 @@
elixir 1.10.3-otp-23
elixir 1.14.0
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ historical archival.

## Requirements

1. Elixir `~> 1.10`
1. Elixir `~> 1.14`
2. Works on GNU/Linux, Windows, macOS X, and BSD.


Expand Down
4 changes: 4 additions & 0 deletions examples/quickstart/.formatter.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Used by "mix format"
[
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
]
26 changes: 26 additions & 0 deletions examples/quickstart/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# The directory Mix will write compiled artifacts to.
/_build/

# If you run "mix test --cover", coverage assets end up here.
/cover/

# The directory Mix downloads your dependencies sources to.
/deps/

# Where third-party dependencies like ExDoc output generated docs.
/doc/

# Ignore .fetch files in case you like to edit your project deps locally.
/.fetch

# If the VM crashes, it generates a dump, let's ignore it too.
erl_crash.dump

# Also ignore archive artifacts (built via "mix archive.build").
*.ez

# Ignore package tarball (built via "mix hex.build").
quickstart-*.tar

# Temporary files, for example, from tests.
/tmp/
21 changes: 21 additions & 0 deletions examples/quickstart/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Quickstart

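A minimal example project showing how to use Crawly: the `BooksToScrape` spider crawls <https://books.toscrape.com/> and extracts the title and price of each book.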

## Installation

If [available in Hex](https://hex.pm/docs/publish), the package can be installed
by adding `quickstart` to your list of dependencies in `mix.exs`:

```elixir
def deps do
[
{:quickstart, "~> 0.1.0"}
]
end
```

Documentation can be generated with [ExDoc](https://github.com/elixir-lang/ex_doc)
and published on [HexDocs](https://hexdocs.pm). Once published, the docs can
be found at <https://hexdocs.pm/quickstart>.

19 changes: 19 additions & 0 deletions examples/quickstart/config/config.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import Config

# Crawly settings for the quickstart example project.
# NOTE(review): the exact meaning and units of the closespider_* and
# concurrent_requests_per_domain options are defined by Crawly, not by this
# file — confirm against Crawly's configuration documentation before tuning.
config :crawly,
closespider_timeout: 10,
concurrent_requests_per_domain: 8,
closespider_itemcount: 100,

# Middleware modules, listed in order; the last entry passes
# "Crawly Bot" as the only user agent string.
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
{Crawly.Middlewares.UserAgent, user_agents: ["Crawly Bot"]}
],
# Item pipeline modules, listed in order. The options reference the
# :url, :title and :price fields and use :title as the item id;
# the final step is configured with the "jl" extension and the /tmp folder.
pipelines: [
{Crawly.Pipelines.Validate, fields: [:url, :title, :price]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder,
{Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
]
18 changes: 18 additions & 0 deletions examples/quickstart/lib/quickstart.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
defmodule Quickstart do
  @moduledoc """
  Top-level module of the `Quickstart` example project.
  """

  @doc """
  Returns the atom `:world`.

  ## Examples

      iex> Quickstart.hello()
      :world

  """
  def hello, do: :world
end
20 changes: 20 additions & 0 deletions examples/quickstart/lib/quickstart/application.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
defmodule Quickstart.Application do
  # OTP application callback module for the quickstart example.
  # Background on OTP applications: https://hexdocs.pm/elixir/Application.html
  @moduledoc false

  use Application

  # Options for the top-level supervisor. Other strategies and supported
  # options are described at https://hexdocs.pm/elixir/Supervisor.html
  @supervisor_opts [strategy: :one_for_one, name: Quickstart.Supervisor]

  @impl true
  def start(_type, _args) do
    # The supervisor currently has no children. A worker would be added to
    # the list as `{Quickstart.Worker, arg}`, which starts it by calling
    # Quickstart.Worker.start_link(arg).
    Supervisor.start_link([], @supervisor_opts)
  end
end
40 changes: 40 additions & 0 deletions examples/quickstart/lib/quickstart/books_spider.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
defmodule BooksToScrape do
  @moduledoc """
  Example spider for <https://books.toscrape.com/>.

  For every parsed page it builds one item per `.product_pod` element and one
  follow-up request per `.next a` link found in the document.
  """

  use Crawly.Spider

  @impl Crawly.Spider
  def base_url, do: "https://books.toscrape.com/"

  @impl Crawly.Spider
  def init, do: [start_urls: ["https://books.toscrape.com/"]]

  @impl Crawly.Spider
  def parse_item(response) do
    # Assertive match: a body that Floki cannot parse raises here.
    {:ok, document} = Floki.parse_document(response.body)

    # One item per product card; pages without cards yield an empty list.
    items =
      for product <- Floki.find(document, ".product_pod") do
        %{
          title: product |> Floki.find("h3 a") |> Floki.attribute("title") |> Floki.text(),
          price: product |> Floki.find(".product_price .price_color") |> Floki.text(),
          url: response.request_url
        }
      end

    # The href values of the "next" links, as found in the page.
    next_hrefs =
      document
      |> Floki.find(".next a")
      |> Floki.attribute("href")

    # Each href is made absolute against the URL of the request that
    # produced this response, then turned into a request.
    next_requests =
      for href <- next_hrefs do
        href
        |> Crawly.Utils.build_absolute_url(response.request.url)
        |> Crawly.Utils.request_from_url()
      end

    %{items: items, requests: next_requests}
  end
end
29 changes: 29 additions & 0 deletions examples/quickstart/mix.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
defmodule Quickstart.MixProject do
  use Mix.Project

  @version "0.1.0"

  # Mix project definition for the quickstart example.
  def project do
    [
      app: :quickstart,
      version: @version,
      elixir: "~> 1.14",
      start_permanent: Mix.env() == :prod,
      deps: deps()
    ]
  end

  # OTP application settings. Run "mix help compile.app" to learn about applications.
  def application, do: [extra_applications: [:logger], mod: {Quickstart.Application, []}]

  # Dependencies; crawly is taken from the path two directories up.
  # Run "mix help deps" to learn about dependencies.
  defp deps do
    [
      {:crawly, path: "../.."},
      {:floki, "~> 0.26.0"}
    ]
  end
end
25 changes: 25 additions & 0 deletions examples/quickstart/mix.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
%{
"certifi": {:hex, :certifi, "2.9.0", "6f2a475689dd47f19fb74334859d460a2dc4e3252a3324bd2111b8f0429e7e21", [:rebar3], [], "hexpm", "266da46bdb06d6c6d35fde799bcb28d36d985d424ad7c08b5bb48f5b5cdd4641"},
"cowboy": {:hex, :cowboy, "2.9.0", "865dd8b6607e14cf03282e10e934023a1bd8be6f6bacf921a7e2a96d800cd452", [:make, :rebar3], [{:cowlib, "2.11.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "2c729f934b4e1aa149aff882f57c6372c15399a20d54f65c8d67bef583021bde"},
"cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"},
"cowlib": {:hex, :cowlib, "2.11.0", "0b9ff9c346629256c42ebe1eeb769a83c6cb771a6ee5960bd110ab0b9b872063", [:make, :rebar3], [], "hexpm", "2b3e9da0b21c4565751a6d4901c20d1b4cc25cbb7fd50d91d2ab6dd287bc86a9"},
"elixir_uuid": {:hex, :elixir_uuid, "1.2.1", "dce506597acb7e6b0daeaff52ff6a9043f5919a4c3315abb4143f0b00378c097", [:mix], [], "hexpm", "f7eba2ea6c3555cea09706492716b0d87397b88946e6380898c2889d68585752"},
"floki": {:hex, :floki, "0.26.0", "4df88977e2e357c6720e1b650f613444bfb48c5acfc6a0c646ab007d08ad13bf", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "e7b66ce7feef5518a9cd9fc7b52dd62a64028bd9cb6d6ad282a0f0fc90a4ae52"},
"gollum": {:hex, :new_gollum, "0.4.0", "89e3e2fc5abd032455341c4a03bcef7042b8d08e02c51df24b99a1a0a1ad69b1", [:mix], [{:httpoison, "~> 1.7", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm", "85c68465e8678637638656945677062a4e7086e91a04d5c4bca1027321c74582"},
"hackney": {:hex, :hackney, "1.18.1", "f48bf88f521f2a229fc7bae88cf4f85adc9cd9bcf23b5dc8eb6a1788c662c4f6", [:rebar3], [{:certifi, "~>2.9.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a4ecdaff44297e9b5894ae499e9a070ea1888c84afdd1fd9b7b2bc384950128e"},
"html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
"httpoison": {:hex, :httpoison, "1.8.2", "9eb9c63ae289296a544842ef816a85d881d4a31f518a0fec089aaa744beae290", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "2bb350d26972e30c96e2ca74a1aaf8293d61d0742ff17f01e0279fef11599921"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mime": {:hex, :mime, "2.0.3", "3676436d3d1f7b81b5a2d2bd8405f412c677558c81b1c92be58c00562bb59095", [:mix], [], "hexpm", "27a30bf0db44d25eecba73755acf4068cbfe26a4372f9eb3e4ea3a45956bff6b"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
"parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"},
"plug": {:hex, :plug, "1.13.6", "187beb6b67c6cec50503e940f0434ea4692b19384d47e5fdfd701e93cadb4cc2", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "02b9c6b9955bce92c829f31d6284bf53c591ca63c4fb9ff81dfd0418667a34ff"},
"plug_cowboy": {:hex, :plug_cowboy, "2.5.2", "62894ccd601cf9597e2c23911ff12798a8a18d237e9739f58a6b04e4988899fe", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "ea6e87f774c8608d60c8d34022a7d073bd7680a0a013f049fc62bf35efea1044"},
"plug_crypto": {:hex, :plug_crypto, "1.2.3", "8f77d13aeb32bfd9e654cb68f0af517b371fb34c56c9f2b58fe3df1235c1251a", [:mix], [], "hexpm", "b5672099c6ad5c202c45f5a403f21a3411247f164e4a8fab056e5cd8a290f4a2"},
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm", "fec8660eb7733ee4117b85f55799fd3833eb769a6df71ccf8903e8dc5447cfce"},
"ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm", "bdb0d2471f453c88ff3908e7686f86f9be327d065cc1ec16fa4540197ea04680"},
"telemetry": {:hex, :telemetry, "1.1.0", "a589817034a27eab11144ad24d5c0f9fab1f58173274b1e9bae7074af9cbee51", [:rebar3], [], "hexpm", "b727b2a1f75614774cff2d7565b64d0dfa5bd52ba517f16543e6fc7efcc0df48"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"},
}
8 changes: 8 additions & 0 deletions examples/quickstart/test/quickstart_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
defmodule QuickstartTest do
  use ExUnit.Case

  # Runs the `iex>` examples embedded in the documentation of Quickstart.
  doctest Quickstart

  test "greets the world" do
    greeting = Quickstart.hello()

    assert greeting == :world
  end
end
1 change: 1 addition & 0 deletions examples/quickstart/test/test_helper.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Starts ExUnit, the test framework used by the test modules of this project.
ExUnit.start()
3 changes: 2 additions & 1 deletion lib/crawly/data_storage/data_storage_worker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ defmodule Crawly.DataStorage.Worker do
{false, new_state} ->
new_state

{_new_item, new_state} ->
{new_item, new_state} ->
Logger.debug("Stored item: #{inspect(new_item)}")
%Worker{new_state | stored_items: state.stored_items + 1}
end

Expand Down
4 changes: 1 addition & 3 deletions lib/crawly/pipelines/write_to_file.ex
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,7 @@ defmodule Crawly.Pipelines.WriteToFile do
catch
error, reason ->
Logger.error(
"Could not write item: #{inspect(item)} to io: #{inspect(io)}\n#{
Exception.format(error, reason, __STACKTRACE__)
}"
"Could not write item: #{inspect(item)} to io: #{inspect(io)}\n#{Exception.format(error, reason, __STACKTRACE__)}"
)
end
end
Expand Down
12 changes: 3 additions & 9 deletions lib/crawly/utils.ex
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,11 @@ defmodule Crawly.Utils do
"#{inspect(module)}.run(#{inspect(item)}, #{inspect(state)})"

_ ->
"#{inspect(module)}.run(#{inspect(item)}, #{inspect(state)}, #{
inspect(args)
})"
"#{inspect(module)}.run(#{inspect(item)}, #{inspect(state)}, #{inspect(args)})"
end

Logger.error(
"Pipeline crash by call: #{call}\n#{
Exception.format(error, reason, __STACKTRACE__)
}"
"Pipeline crash by call: #{call}\n#{Exception.format(error, reason, __STACKTRACE__)}"
)

{item, state}
Expand Down Expand Up @@ -239,9 +235,7 @@ defmodule Crawly.Utils do
{module, []}

x ->
raise "Invalid format: A #{setting} setting cannot be defined in the form `{#{
inspect(x)
}}`. Only the forms `{module, options}` and `module` are valid"
raise "Invalid format: A #{setting} setting cannot be defined in the form `{#{inspect(x)}}`. Only the forms `{module, options}` and `module` are valid"
end
end
end
12 changes: 3 additions & 9 deletions lib/crawly/worker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,7 @@ defmodule Crawly.Worker do
else
{:error, reason} ->
Logger.debug(
"Crawly worker could not process the request to #{
inspect(request.url)
} reason: #{inspect(reason)}"
"Crawly worker could not process the request to #{inspect(request.url)} reason: #{inspect(reason)}"
)
end

Expand Down Expand Up @@ -125,9 +123,7 @@ defmodule Crawly.Worker do
catch
error, reason ->
Logger.debug(
"Could not parse item, error: #{inspect(error)}, reason: #{
inspect(reason)
}"
"Could not parse item, error: #{inspect(error)}, reason: #{inspect(reason)}"
)

Logger.debug(Exception.format(:error, error, __STACKTRACE__))
Expand All @@ -146,9 +142,7 @@ defmodule Crawly.Worker do
}) do
{false, _} ->
Logger.debug(
"Dropped parse item from parser pipeline, url: #{response.request_url}, spider_name: #{
inspect(spider_name)
}"
"Dropped parse item from parser pipeline, url: #{response.request_url}, spider_name: #{inspect(spider_name)}"
)

throw(:dropped_parse_item)
Expand Down
4 changes: 2 additions & 2 deletions mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ defmodule Crawly.Mixfile do
app: :crawly,
version: @version,
name: "Crawly",
elixir: "~> 1.7",
elixir: "~> 1.14",
package: package(),
test_coverage: [tool: ExCoveralls],
start_permanent: Mix.env() == :prod,
Expand Down Expand Up @@ -55,7 +55,7 @@ defmodule Crawly.Mixfile do
{:ex_doc, ">= 0.0.0", only: :dev, runtime: false},
{:earmark, "~> 1.2", only: :dev},
{:meck, "~> 0.9", only: :test},
{:excoveralls, "~> 0.10", only: :test},
{:excoveralls, "~> 0.14.6", only: :test},
{:logger_file_backend, "~> 0.0.11", only: [:test, :dev]}
]
end
Expand Down
Loading