
Commit

Merge d3ed3f2 into f2e0e93
Raphael Megzari committed Feb 12, 2020
2 parents f2e0e93 + d3ed3f2 commit d47c266
Showing 3 changed files with 54 additions and 12 deletions.
config/config.exs: 2 additions & 0 deletions
@@ -2,6 +2,8 @@ use Mix.Config

config :logger, backends: [:console], compile_time_purge_level: :info

+config :hackney, use_default_pool: false

if File.exists?("config/#{Mix.env}.exs") do
  import_config("#{Mix.env}.exs")
end
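With `use_default_pool: false`, hackney stops checking connections out of its shared `:default` pool, so each request opens a dedicated connection unless a pool is opted into explicitly. A minimal sketch of the two modes, assuming only hackney's documented options (the URL and pool name are illustrative):

    # One-off request: no pool is consulted once the default pool is disabled.
    {:ok, status, _headers, _ref} =
      :hackney.request(:get, "https://example.com", [], "", [])

    # A dedicated pool can still be named per request.
    :ok = :hackney_pool.start_pool(:crawler_pool, timeout: 15_000, max_connections: 50)
    {:ok, status, _headers, _ref} =
      :hackney.request(:get, "https://example.com", [], "", pool: :crawler_pool)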
lib/crawler.ex: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ defmodule Crawler do
  - a `Crawler.Store` that owns an ETS table for keeping internal data
  """
  def start(_type, _args) do
-    {:ok, _pid} = Store.init()
+    {:ok, _pid} = Store.start_link()
  end

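The application callback must return `{:ok, pid}`, which `Store.start_link/0` now satisfies directly, linking the store to the application process. A hedged sketch of the supervised alternative (not part of this commit; the child spec map is standard OTP):

    # Hypothetical: run the store under a supervisor instead of a bare link.
    def start(_type, _args) do
      children = [%{id: Crawler.Store, start: {Crawler.Store, :start_link, []}}]
      Supervisor.start_link(children, strategy: :one_for_one, name: Crawler.Supervisor)
    end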
lib/crawler/store.ex: 51 additions & 11 deletions
@@ -1,9 +1,14 @@
defmodule Crawler.Store do
  @moduledoc """
  An internal data store for information related to each crawl.
+  The ETS table lives inside a GenServer so that it is garbage collected
+  when the owning process terminates. Reads and writes do not go through
+  the process, however, so the process never becomes a bottleneck.
  """
+  use GenServer
+  @table :db

-  alias __MODULE__.DB
+  @type url :: String.t

  defmodule Page do
    @moduledoc """
@@ -16,15 +21,28 @@ defmodule Crawler.Store do
  @doc """
  Starts the `GenServer` that owns the ETS table.
  """
-  def init do
-    Registry.start_link(keys: :unique, name: DB)
+  def start_link do
+    GenServer.start_link(__MODULE__, nil, name: __MODULE__)
  end

+  def init(nil) do
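    # A :public :named_table means callers touch the table directly,
    # never the GenServer; the concurrency flags tune it for the
    # crawler's many parallel workers.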
+    :ets.new(@table, [
+      :set,
+      :public,
+      :named_table,
+      read_concurrency: true,
+      write_concurrency: true
+    ])
+
+    {:ok, nil}
+  end

  @doc """
  Finds a stored URL and returns its page data.
  """
+  @spec find(url) :: %Page{} | nil
  def find(url) do
-    case Registry.lookup(DB, url) do
+    case :ets.lookup(@table, url) do
      [{_, page}] -> page
      _ -> nil
    end
@@ -33,8 +51,9 @@
  @doc """
  Finds a stored URL and returns its page data only if it's processed.
  """
+  @spec find_processed(url) :: %Page{} | nil
  def find_processed(url) do
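    # The map is a partial pattern: match_object returns entries whose
    # page map contains processed: true, whatever its other fields hold.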
-    case Registry.match(DB, url, %{processed: true}) do
+    case :ets.match_object(@table, {url, %{processed: true}}) do
      [{_, page}] -> page
      _ -> nil
    end
@@ -43,21 +62,42 @@
  @doc """
  Adds a URL to the store.
  """
+  @spec add(url) :: {:ok, boolean}
  def add(url) do
-    Registry.register(DB, url, %Page{url: url})
+    {:ok, :ets.insert_new(@table, {url, %Page{url: url}})}
  end

+  @spec update(url, map) :: boolean
+  defp update(url, args) do
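    # Read-merge-write: two workers updating the same URL can race here,
    # and the last :ets.insert/2 wins.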
+    case find(url) do
+      nil -> false
+      page ->
+        page
+        |> Map.merge(args)
+        |> update!()
+    end
+  end
+
+  @spec update!(%Page{}) :: boolean
+  defp update!(page) do
+    :ets.insert(@table, {page.url, page})
+  end

  @doc """
  Adds the page data for a URL to the store.
  """
-  def add_page_data(url, body, opts) do
-    {_new, _old} = Registry.update_value(DB, url, & %{&1 | body: body, opts: opts})
-  end
+  def add_page_data(url, body, opts), do: update(url, %{body: body, opts: opts})

  @doc """
  Marks a URL as processed in the store.
  """
-  def processed(url) do
-    {_new, _old} = Registry.update_value(DB, url, & %{&1 | processed: true})
-  end
+  def processed(url), do: update(url, %{processed: true})

+  @doc """
+  Clears the store of all pages. Useful for periodic crawl tasks.
+  """
+  def reset() do
+    :ets.delete_all_objects(@table)
  end
end
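A quick sketch of the resulting API, with an illustrative URL (assumes the store was started by the application as in lib/crawler.ex):

    {:ok, true} = Crawler.Store.add("https://example.com/")
    nil = Crawler.Store.find_processed("https://example.com/")

    true = Crawler.Store.processed("https://example.com/")
    %Crawler.Store.Page{processed: true} =
      Crawler.Store.find_processed("https://example.com/")

    # Periodic crawls can start from a clean slate.
    true = Crawler.Store.reset()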
