Spider and Configuration generators
The current commit introduces two mix commands that are meant to speed up
spider and configuration creation for Crawly projects.

This is one of the features used in Scrapy; it allows you to create spiders
almost instantly with just one command. It simplifies my work, and hopefully it
will be helpful for others as well!

Mix files are not included in coverage reports.
oltarasenko committed Mar 7, 2023
1 parent 9eb23e1 commit 4bbf3c7
Showing 5 changed files with 221 additions and 2 deletions.
15 changes: 13 additions & 2 deletions README.md
@@ -79,9 +79,15 @@ historical archival.
%{items: items, requests: next_requests}
end
end

```

**New in 0.15.0 (not released yet):**

> It's possible to use a generator command to speed up spider creation, so you
get a generated file with all the needed callbacks in place:
`mix crawly.gen.spider --filepath ./lib/crawly_example/books_to_scrape.ex --spidername BooksToScrape`
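
A trimmed sketch of what the generated file contains (the task copies `priv/spider_template.ex` and substitutes the module name given via `--spidername`; the `parse_item/1` body is shortened here):

```elixir
# Trimmed sketch of the generated module (full template: priv/spider_template.ex).
defmodule BooksToScrape do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://books.toscrape.com/"

  @impl Crawly.Spider
  def init(), do: [start_urls: ["https://books.toscrape.com/index.html"]]

  @impl Crawly.Spider
  def parse_item(_response), do: %Crawly.ParsedItem{items: [], requests: []}
end
```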


4. Configure Crawly

By default, Crawly does not require any configuration, but you will need one to fine-tune your crawls:
@@ -110,6 +116,11 @@ historical archival.

```

**New in 0.15.0 (not released yet):**

> You can generate an example config with the following command:
`mix crawly.gen.config`
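
A quick sanity check (illustrative, not part of the commit): once the generated `config/config.exs` is compiled into your project, the settings are visible via `Application.get_all_env/1`:

```elixir
# Run inside `iex -S mix` after `mix crawly.gen.config` has created config/config.exs.
Application.get_all_env(:crawly)
#=> [closespider_timeout: 10, concurrent_requests_per_domain: 8, ...]
```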

5. Start the Crawl:

```bash
@@ -198,4 +209,4 @@ limitations under the License.
2. Update version in quickstart (README.md, this file)
3. Commit and create a new tag: `git commit && git tag 0.xx.0 && git push origin master --follow-tags`
4. Build docs: `mix docs`
5. Publish hex release: `mix hex.publish`
5 changes: 5 additions & 0 deletions coveralls.json
@@ -0,0 +1,5 @@
{
"skip_files": [
"lib/mix"
]
}
79 changes: 79 additions & 0 deletions lib/mix/tasks/crawly.gen.config.ex
@@ -0,0 +1,79 @@
defmodule Mix.Tasks.Crawly.Gen.Config do
  @moduledoc """
  Generate a Crawly configuration.

  A small helper that adds an example Crawly section to config/config.exs,
  creating the file if it does not exist yet.
  """
  @shortdoc "Generate example Crawly config"

  use Mix.Task

  @impl Mix.Task
  @spec run([binary]) :: :ok
  def run(_args \\ []) do
    config_path = "config/config.exs"

    case File.read(config_path) do
      {:ok, contents} ->
        if String.contains?(contents, "config :crawly") do
          Mix.shell().error("The config already has a :crawly section. Ignoring.")
        else
          config_first_line = "import Config"

          new_content =
            String.replace(contents, config_first_line, crawly_config_template())

          File.write!(config_path, new_content)
          Mix.shell().info("Done!")
        end

      {:error, reason} ->
        Mix.shell().info("No config file: #{inspect(reason)} -> creating a new one")

        create_config_file(config_path)
        Mix.shell().info("Done!")
    end
  end

  defp create_config_file(path) do
    File.mkdir("./config")
    File.write(path, crawly_config_template())
  end

  defp crawly_config_template() do
    """
    import Config

    config :crawly,
      closespider_timeout: 10,
      concurrent_requests_per_domain: 8,
      closespider_itemcount: 100,
      middlewares: [
        Crawly.Middlewares.DomainFilter,
        Crawly.Middlewares.UniqueRequest,
        {Crawly.Middlewares.UserAgent, user_agents: ["Crawly Bot", "Google"]}
      ],
      pipelines: [
        # An item is expected to have all fields defined in the fields list
        {Crawly.Pipelines.Validate, fields: [:url]},
        # Use the following field as the unique item identifier; the pipeline
        # drops items that share the same URL
        {Crawly.Pipelines.DuplicatesFilter, item_id: :url},
        Crawly.Pipelines.JSONEncoder,
        {Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
      ]
    """
  end
end
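
A small usage sketch (assuming it is run from within a Mix project that depends on Crawly; not part of the commit): the task can also be invoked programmatically, and the generated file can be read back with `Config.Reader` to confirm the `:crawly` section is in place:

```elixir
# Invoke the generator programmatically (equivalent to `mix crawly.gen.config`).
Mix.Task.run("crawly.gen.config")

# Read the generated file back; Config.Reader.read!/1 returns a keyword list per app.
"config/config.exs"
|> Config.Reader.read!()
|> Keyword.fetch!(:crawly)
|> Keyword.fetch!(:middlewares)
```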
89 changes: 89 additions & 0 deletions lib/mix/tasks/crawly.gen.spider.ex
@@ -0,0 +1,89 @@
defmodule Mix.Tasks.Crawly.Gen.Spider do
  @moduledoc """
  Generate a Crawly spider template.

  Reduces a bit of the boilerplate by generating a spider template for a given
  website.

  --filepath   - path of the file where the spider is generated (required)
  --spidername - name of the spider module (required)
  --help       - show this message
  """
  @shortdoc "Generate Crawly spider template"

  use Mix.Task

  @impl Mix.Task
  @spec run([binary]) :: :ok
  def run(args \\ []) do
    args
    |> parse_args()
    |> response()
  end

  defp response({:error, message}) do
    Mix.shell().error("#{inspect(message)}")
    help()
  end

  defp response({opts, _word}) do
    if opts[:help] do
      help()
    else
      generate_spider(opts)
    end
  end

  defp generate_spider(opts) do
    # Fetch both required options regardless of the order they were given in.
    with {:ok, filepath} <- Keyword.fetch(opts, :filepath),
         {:ok, spidername} <- Keyword.fetch(opts, :spidername),
         false <- File.exists?(filepath) do
      path = Path.join(:code.priv_dir(:crawly), "./spider_template.ex")
      {:ok, spider_template} = File.read(path)

      spider_template =
        String.replace(spider_template, "SpiderTemplate", spidername)

      :ok = File.write(filepath, spider_template)
      Mix.shell().info("Done!")
    else
      :error ->
        Mix.shell().error("Missing required arguments.\n")
        help()

      true ->
        Mix.shell().error("The spider already exists. Choose another filename")
    end
  end

  defp parse_args(args) do
    {opts, word, errors} =
      OptionParser.parse(
        args,
        strict: [filepath: :string, spidername: :string, help: :boolean]
      )

    case errors do
      [] ->
        {opts, List.to_string(word)}

      errors ->
        {:error, "Unknown options: #{inspect(errors)}"}
    end
  end

  defp help() do
    Mix.shell().info("""
    The generator creates a spider template for a given website.

    --filepath (required)   - path for the new file; if the file already exists, exit with an error
    --spidername (required) - name of the spider module
    --help                  - show this message
    """)
  end
end
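
For context, a short illustration (not part of the commit) of what `OptionParser.parse/2` hands back to `parse_args/1` for the invocation documented in the README:

```elixir
OptionParser.parse(
  ["--filepath", "./lib/crawly_example/books_to_scrape.ex", "--spidername", "BooksToScrape"],
  strict: [filepath: :string, spidername: :string, help: :boolean]
)
#=> {[filepath: "./lib/crawly_example/books_to_scrape.ex", spidername: "BooksToScrape"], [], []}
```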
35 changes: 35 additions & 0 deletions priv/spider_template.ex
@@ -0,0 +1,35 @@
defmodule SpiderTemplate do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://books.toscrape.com/"

  @impl Crawly.Spider
  def init() do
    [start_urls: ["https://books.toscrape.com/index.html"]]
  end

  @impl Crawly.Spider
  @doc """
  Extract items and requests to follow from the given response.
  """
  def parse_item(response) do
    # Extract item fields from the response here. Usually it's done this way:
    # {:ok, document} = Floki.parse_document(response.body)
    # item = %{
    #   title: document |> Floki.find("title") |> Floki.text(),
    #   url: response.request_url
    # }
    extracted_items = []

    # Extract requests to follow from the response. Don't forget that you should
    # supply request objects here. Usually it's done via:
    #
    # urls = document |> Floki.find(".pagination a") |> Floki.attribute("href")
    # requests = Crawly.Utils.requests_from_urls(urls)
    #
    # Don't forget that you need absolute URLs.
    next_requests = []

    %Crawly.ParsedItem{items: extracted_items, requests: next_requests}
  end
end
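
As a rough illustration of how the template's comments can be filled in (a sketch that assumes Floki is available as a dependency and relies on the pagination markup of books.toscrape.com; not part of the generated file):

```elixir
def parse_item(response) do
  {:ok, document} = Floki.parse_document(response.body)

  # Build one item from the page title and the requested URL.
  items = [
    %{
      title: document |> Floki.find("title") |> Floki.text(),
      url: response.request_url
    }
  ]

  # Follow pagination links, converting relative hrefs into absolute URLs.
  requests =
    document
    |> Floki.find(".pagination a")
    |> Floki.attribute("href")
    |> Enum.map(fn href ->
      to_string(URI.merge("https://books.toscrape.com/", href))
    end)
    |> Crawly.Utils.requests_from_urls()

  %Crawly.ParsedItem{items: items, requests: requests}
end
```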
