Spider and Configuration generators
The current commit introduces two mix commands that are meant to speed up
spider and configuration creation for Crawly projects.

This is one of the features used in Scrapy; it allows you to create spiders
almost instantly with just one command. It simplifies my work, and hopefully it
will be helpful for others as well!

Mix files are not included in coverage reports.
oltarasenko committed Mar 7, 2023
1 parent 9eb23e1 commit 4bbf3c7
Showing 5 changed files with 221 additions and 2 deletions.
15 changes: 13 additions & 2 deletions README.md
@@ -79,9 +79,15 @@ historical archival.
%{items: items, requests: next_requests}
end
end

```

**New in 0.15.0 (not released yet):**

> It's possible to use a generator command to speed up spider creation, so you
get a generated file with all the needed callbacks in place:
`mix crawly.gen.spider --filepath ./lib/crawly_example/books_to_scrape.ex --spidername BooksToScrape`
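
A trimmed sketch of what the generated file contains (the task copies `priv/spider_template.ex` and substitutes the module name given via `--spidername`; the `parse_item/1` body is shortened here):

```elixir
# Trimmed sketch of the generated module (full template: priv/spider_template.ex).
defmodule BooksToScrape do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://books.toscrape.com/"

  @impl Crawly.Spider
  def init(), do: [start_urls: ["https://books.toscrape.com/index.html"]]

  @impl Crawly.Spider
  def parse_item(_response), do: %Crawly.ParsedItem{items: [], requests: []}
end
```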


4. Configure Crawly

By default, Crawly does not require any configuration, but you will need one to fine-tune your crawls:
@@ -110,6 +116,11 @@ historical archival.

```

**New in 0.15.0 (not released yet):**

> You can generate an example config with the following command:
`mix crawly.gen.config`
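
A quick sanity check (illustrative, not part of the commit): once the generated `config/config.exs` is compiled into your project, the settings are visible via `Application.get_all_env/1`:

```elixir
# Run inside `iex -S mix` after `mix crawly.gen.config` has created config/config.exs.
Application.get_all_env(:crawly)
#=> [closespider_timeout: 10, concurrent_requests_per_domain: 8, ...]
```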

5. Start the Crawl:

```bash
@@ -198,4 +209,4 @@ limitations under the License.
2. Update version in quickstart (README.md, this file)
3. Commit and create a new tag: `git commit && git tag 0.xx.0 && git push origin master --follow-tags`
4. Build docs: `mix docs`
5. Publish hex release: `mix hex.publish`
5 changes: 5 additions & 0 deletions coveralls.json
@@ -0,0 +1,5 @@
{
"skip_files": [
"lib/mix"
]
}
79 changes: 79 additions & 0 deletions lib/mix/tasks/crawly.gen.config.ex
@@ -0,0 +1,79 @@
defmodule Mix.Tasks.Crawly.Gen.Config do
  @moduledoc """
  Generate a Crawly configuration.

  A small helper that adds an example Crawly section to config/config.exs,
  creating the file if it does not exist yet.
  """
  @shortdoc "Generate example Crawly config"

  use Mix.Task

  @impl Mix.Task
  @spec run([binary]) :: :ok
  def run(_args \\ []) do
    config_path = "config/config.exs"

    case File.read(config_path) do
      {:ok, contents} ->
        if String.contains?(contents, "config :crawly") do
          Mix.shell().error("The config already has a :crawly section. Ignoring.")
        else
          config_first_line = "import Config"

          new_content =
            String.replace(contents, config_first_line, crawly_config_template())

          File.write!(config_path, new_content)
          Mix.shell().info("Done!")
        end

      {:error, reason} ->
        Mix.shell().info("No config file: #{inspect(reason)} -> creating a new one")

        create_config_file(config_path)
        Mix.shell().info("Done!")
    end
  end

  defp create_config_file(path) do
    File.mkdir("./config")
    File.write(path, crawly_config_template())
  end

  defp crawly_config_template() do
    """
    import Config

    config :crawly,
      closespider_timeout: 10,
      concurrent_requests_per_domain: 8,
      closespider_itemcount: 100,
      middlewares: [
        Crawly.Middlewares.DomainFilter,
        Crawly.Middlewares.UniqueRequest,
        {Crawly.Middlewares.UserAgent, user_agents: ["Crawly Bot", "Google"]}
      ],
      pipelines: [
        # An item is expected to have all fields defined in the fields list
        {Crawly.Pipelines.Validate, fields: [:url]},
        # Use the following field as the unique item identifier; the pipeline
        # drops items that share the same URL
        {Crawly.Pipelines.DuplicatesFilter, item_id: :url},
        Crawly.Pipelines.JSONEncoder,
        {Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
      ]
    """
  end
end
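
A small usage sketch (assuming it is run from within a Mix project that depends on Crawly; not part of the commit): the task can also be invoked programmatically, and the generated file can be read back with `Config.Reader` to confirm the `:crawly` section is in place:

```elixir
# Invoke the generator programmatically (equivalent to `mix crawly.gen.config`).
Mix.Task.run("crawly.gen.config")

# Read the generated file back; Config.Reader.read!/1 returns a keyword list per app.
"config/config.exs"
|> Config.Reader.read!()
|> Keyword.fetch!(:crawly)
|> Keyword.fetch!(:middlewares)
```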
89 changes: 89 additions & 0 deletions lib/mix/tasks/crawly.gen.spider.ex
@@ -0,0 +1,89 @@
defmodule Mix.Tasks.Crawly.Gen.Spider do
  @moduledoc """
  Generate a Crawly spider template.

  Reduces a bit of the boilerplate by generating a spider template for a given
  website.

  --filepath   - path of the file where the spider is generated (required)
  --spidername - name of the spider module (required)
  --help       - show this message
  """
  @shortdoc "Generate Crawly spider template"

  use Mix.Task

  @impl Mix.Task
  @spec run([binary]) :: :ok
  def run(args \\ []) do
    args
    |> parse_args()
    |> response()
  end

  defp response({:error, message}) do
    Mix.shell().error("#{inspect(message)}")
    help()
  end

  defp response({opts, _word}) do
    if opts[:help] do
      help()
    else
      generate_spider(opts)
    end
  end

  defp generate_spider(opts) do
    # Fetch both required options regardless of the order they were given in.
    with {:ok, filepath} <- Keyword.fetch(opts, :filepath),
         {:ok, spidername} <- Keyword.fetch(opts, :spidername),
         false <- File.exists?(filepath) do
      path = Path.join(:code.priv_dir(:crawly), "./spider_template.ex")
      {:ok, spider_template} = File.read(path)

      spider_template =
        String.replace(spider_template, "SpiderTemplate", spidername)

      :ok = File.write(filepath, spider_template)
      Mix.shell().info("Done!")
    else
      :error ->
        Mix.shell().error("Missing required arguments.\n")
        help()

      true ->
        Mix.shell().error("The spider already exists. Choose another filename")
    end
  end

  defp parse_args(args) do
    {opts, word, errors} =
      OptionParser.parse(
        args,
        strict: [filepath: :string, spidername: :string, help: :boolean]
      )

    case errors do
      [] ->
        {opts, List.to_string(word)}

      errors ->
        {:error, "Unknown options: #{inspect(errors)}"}
    end
  end

  defp help() do
    Mix.shell().info("""
    The generator creates a spider template for a given website.

    --filepath (required)   - path for the new file; if the file already exists, exit with an error
    --spidername (required) - name of the spider module
    --help                  - show this message
    """)
  end
end
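
For context, a short illustration (not part of the commit) of what `OptionParser.parse/2` hands back to `parse_args/1` for the invocation documented in the README:

```elixir
OptionParser.parse(
  ["--filepath", "./lib/crawly_example/books_to_scrape.ex", "--spidername", "BooksToScrape"],
  strict: [filepath: :string, spidername: :string, help: :boolean]
)
#=> {[filepath: "./lib/crawly_example/books_to_scrape.ex", spidername: "BooksToScrape"], [], []}
```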
35 changes: 35 additions & 0 deletions priv/spider_template.ex
@@ -0,0 +1,35 @@
defmodule SpiderTemplate do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://books.toscrape.com/"

  @impl Crawly.Spider
  def init() do
    [start_urls: ["https://books.toscrape.com/index.html"]]
  end

  @impl Crawly.Spider
  @doc """
  Extract items and requests to follow from the given response.
  """
  def parse_item(response) do
    # Extract item fields from the response here. Usually it's done this way:
    # {:ok, document} = Floki.parse_document(response.body)
    # item = %{
    #   title: document |> Floki.find("title") |> Floki.text(),
    #   url: response.request_url
    # }
    extracted_items = []

    # Extract requests to follow from the response. Don't forget that you should
    # supply request objects here. Usually it's done via:
    #
    # urls = document |> Floki.find(".pagination a") |> Floki.attribute("href")
    # requests = Crawly.Utils.requests_from_urls(urls)
    #
    # Don't forget that you need absolute URLs.
    next_requests = []

    %Crawly.ParsedItem{items: extracted_items, requests: next_requests}
  end
end
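
As a rough illustration of how the template's comments can be filled in (a sketch that assumes Floki is available as a dependency and relies on the pagination markup of books.toscrape.com; not part of the generated file):

```elixir
def parse_item(response) do
  {:ok, document} = Floki.parse_document(response.body)

  # Build one item from the page title and the requested URL.
  items = [
    %{
      title: document |> Floki.find("title") |> Floki.text(),
      url: response.request_url
    }
  ]

  # Follow pagination links, converting relative hrefs into absolute URLs.
  requests =
    document
    |> Floki.find(".pagination a")
    |> Floki.attribute("href")
    |> Enum.map(fn href ->
      to_string(URI.merge("https://books.toscrape.com/", href))
    end)
    |> Crawly.Utils.requests_from_urls()

  %Crawly.ParsedItem{items: items, requests: requests}
end
```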
