-
Notifications
You must be signed in to change notification settings - Fork 115
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The current commit introduces two mix commands which are supposed to speed up spider/configuration creation for Crawly projects. This is one of the features used in Scrapy, and it allows you to make spiders almost instantly with just one command. It simplifies my work, and hopefully, it will be helpful for others as well! Mix files are not included in coverage reports.
- Loading branch information
1 parent
9eb23e1
commit 4bbf3c7
Showing
5 changed files
with
221 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
"skip_files": [ | ||
"lib/mix" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
defmodule Mix.Tasks.Crawly.Gen.Config do
  @moduledoc """
  Generate Crawly configuration.

  A small helper that inserts a Crawly section into `config/config.exs`,
  creating the file (and the `config` directory) when it does not exist.
  """
  @shortdoc "Generate example crawly config"

  use Mix.Task

  @impl Mix.Task
  # Fixed @spec: Mix.shell().info/1 and error/1 return :ok, not binary().
  @spec run([binary]) :: :ok
  def run(_args \\ []) do
    config_path = "config/config.exs"

    case File.read(config_path) do
      {:ok, contents} ->
        insert_crawly_section(config_path, contents)

      {:error, reason} ->
        Mix.shell().info(
          "No config_file: #{inspect(reason)} -> creating new one"
        )

        create_config_file(config_path)
        Mix.shell().info("Done!")
    end
  end

  # Insert the Crawly template into existing config contents.
  #
  # Three cases:
  #   * a `config :crawly` section already exists -> refuse;
  #   * an `import Config` line exists -> replace it with the template
  #     (the template itself begins with `import Config`);
  #   * no anchor line -> report an error. The original code silently
  #     wrote the contents back unchanged and still printed "Done!".
  defp insert_crawly_section(config_path, contents) do
    cond do
      String.contains?(contents, "config :crawly") ->
        Mix.shell().error("Already has crawly section. Ignoring")

      String.contains?(contents, "import Config") ->
        new_content =
          String.replace(contents, "import Config", crawly_config_template())

        File.write!(config_path, new_content)
        Mix.shell().info("Done!")

      true ->
        Mix.shell().error(
          "Could not find `import Config` in #{config_path}. " <>
            "Please add the Crawly section manually."
        )
    end
  end

  # Create config/config.exs from scratch with the Crawly template.
  defp create_config_file(path) do
    # mkdir_p!/1 succeeds when the directory already exists; the
    # original used mkdir/1 and ignored its result, so a failure to
    # create the directory (or write the file) went unnoticed.
    File.mkdir_p!("config")
    File.write!(path, crawly_config_template())
  end

  # The example configuration written into the project. Kept verbatim
  # from the original task so generated projects are unchanged.
  defp crawly_config_template() do
    """
    import Config
    config :crawly,
      closespider_timeout: 10,
      concurrent_requests_per_domain: 8,
      closespider_itemcount: 100,
      middlewares: [
        Crawly.Middlewares.DomainFilter,
        Crawly.Middlewares.UniqueRequest,
        {Crawly.Middlewares.UserAgent, user_agents: ["Crawly Bot", "Google"]}
      ],
      pipelines: [
        # An item is expected to have all fields defined in the fields list
        {Crawly.Pipelines.Validate, fields: [:url]},
        # Use the following field as an item uniq identifier (pipeline) drops
        # items with the same urls
        {Crawly.Pipelines.DuplicatesFilter, item_id: :url},
        Crawly.Pipelines.JSONEncoder,
        {Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
      ]
    """
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
defmodule Mix.Tasks.Crawly.Gen.Spider do
  @moduledoc """
  Generate Crawly spider template.

  Reduce a bit of the boilerplate by providing spider generator function.
  The generator function is used to generate a spider template for a given website.

  --filepath - specify a filepath where the spider is supposed to be generated (required)
  --spidername - specify a name of the spider module (required)
  --help - show this message
  """
  @shortdoc "Generate Crawly Spider template"

  use Mix.Task

  @impl Mix.Task
  # Fixed @spec: the task returns :ok (from Mix.shell()), not binary().
  @spec run([binary]) :: :ok
  def run(args \\ []) do
    args
    |> parse_args()
    |> response()
  end

  # Option-parsing failed: show the reason, then usage.
  defp response({:error, message}) do
    Mix.shell().error("#{inspect(message)}")
    help()
  end

  defp response({opts, _word}) do
    if opts[:help] do
      help()
    else
      generate_spider(opts)
    end
  end

  # Bug fix: the original clause matched the literal keyword list
  # [filepath: ..., spidername: ...], which only succeeds when the user
  # passes --filepath BEFORE --spidername (OptionParser.parse/2 returns
  # switches in command-line order). Passing them in the other order
  # wrongly reported "Missing required arguments". Fetch the options
  # explicitly so either order works.
  defp generate_spider(opts) do
    filepath = Keyword.get(opts, :filepath)
    spidername = Keyword.get(opts, :spidername)

    if filepath == nil or spidername == nil do
      Mix.shell().error("Missing required arguments. \n")
      help()
    else
      write_spider(filepath, spidername)
    end
  end

  # Render priv/spider_template.ex with the requested module name and
  # write it to `filepath`, refusing to overwrite an existing file.
  defp write_spider(filepath, spidername) do
    if File.exists?(filepath) do
      Mix.shell().error("The spider already exists. Choose another filename")
    else
      path = Path.join(:code.priv_dir(:crawly), "./spider_template.ex")
      {:ok, spider_template} = File.read(path)

      spider_template =
        String.replace(spider_template, "SpiderTemplate", spidername)

      :ok = File.write(filepath, spider_template)
      Mix.shell().info("Done!")
    end
  end

  # Parse command-line switches; any unrecognized switch is turned into
  # an {:error, message} tuple handled by response/1.
  defp parse_args(args) do
    {opts, word, errors} =
      OptionParser.parse(
        args,
        strict: [filepath: :string, spidername: :string, help: :boolean]
      )

    case errors do
      [] ->
        {opts, List.to_string(word)}

      errors ->
        # Typo fix: message previously read "Unkown opions".
        {:error, "Unknown options: #{inspect(errors)}"}
    end
  end

  # Print usage information.
  defp help() do
    Mix.shell().info("""
    The generator function is used to generate a spider template for a given website.
    --filepath (required) - specify a path for a new file. If file already exists - exit with error
    --spidername (required) - specify a name of the spider module
    --help - show this message
    """)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
defmodule SpiderTemplate do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://books.toscrape.com/"

  @impl Crawly.Spider
  def init() do
    [start_urls: ["https://books.toscrape.com/index.html"]]
  end

  @impl Crawly.Spider
  @doc """
  Extract items and requests to follow from the given response
  """
  def parse_item(response) do
    # Extract item fields from the response here. A typical approach:
    #
    #   {:ok, document} = Floki.parse_document(response.body)
    #   item = %{
    #     title: document |> Floki.find("title") |> Floki.text()
    #     url: response.request_url
    #   }
    items = []

    # Build the follow-up requests here. These must be request structs,
    # produced from absolute urls, e.g.:
    #
    #   urls = document |> Floki.find(".pagination a") |> Floki.attribute("href")
    #   requests = Crawly.Utils.requests_from_urls(urls)
    requests = []

    %Crawly.ParsedItem{items: items, requests: requests}
  end
end