Skip to content

Commit

Permalink
Merge 7498999 into deca4ea
Browse files Browse the repository at this point in the history
  • Loading branch information
oltarasenko committed May 16, 2020
2 parents deca4ea + 7498999 commit 8c51588
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ erl_crash.dump
.DS_Store
.idea/
crawly.iml
write_to_filetests/
2 changes: 1 addition & 1 deletion documentation/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ config :crawly,
{Crawly.Pipelines.Validate, fields: [:id, :date]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :id},
Crawly.Pipelines.JSONEncoder,
{Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"} # NEW IN 0.6.0
{Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp", include_timestamp: true}
]
```

Expand Down
53 changes: 41 additions & 12 deletions lib/crawly/pipelines/write_to_file.ex
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ defmodule Crawly.Pipelines.WriteToFile do
### Options
When no tuple-based options are passed, the pipeline falls back to the `:crawly` application config under `Crawly.Pipelines.WriteToFile` for the `:folder` and `:extension` keys.
- `:folder`, optional. The folder in which the file will be created. Defaults to system temp folder.
- `:folder`, optional. The folder in which the file will be created. Defaults to current project's folder.
If the provided folder does not exist, it is created.
- `:extension`, optional. The file extension in which the file will be created with. Defaults to `jl`.
- `:include_timestamp`, boolean, optional, true by default. When true, a timestamp is added to the filename.
### Example Declaration
```
pipelines: [
Expand Down Expand Up @@ -44,7 +45,11 @@ defmodule Crawly.Pipelines.WriteToFile do
state :: %{
optional(:write_to_file_fd) => pid | {:file_descriptor, atom, any}
},
opts :: [folder: String.t(), extension: String.t()]
opts :: [
folder: String.t(),
extension: String.t(),
include_timestamp: boolean()
]
) ::
{item :: any,
state :: %{write_to_file_fd: pid | {:file_descriptor, atom, any}}}
Expand All @@ -57,22 +62,35 @@ defmodule Crawly.Pipelines.WriteToFile do

# No active FD: the state carries no :write_to_file_fd yet, so this clause
# resolves the options, makes sure the output folder exists, opens the output
# file, writes the item and stores the descriptor in the state.
#
# Options (keyword list or map):
#   * :folder            - output folder; "./" when missing or nil
#   * :extension         - file extension; "jl" when missing or nil
#   * :include_timestamp - boolean; when true (the default) the current UTC
#                          timestamp is appended to the file name
#
# Returns {item, state} with :write_to_file_fd set to the opened device.
# Raises MatchError when the folder cannot be created or when write/2 does not
# return :ok (both results are asserted below).
def run(item, state, opts) do
  opts = Enum.into(opts, %{})

  # `||` is used instead of a Map.get/3 default so that an explicit `nil`
  # also falls back to the documented default value.
  folder = Map.get(opts, :folder) || "./"
  extension = Map.get(opts, :extension) || "jl"
  include_timestamp = Map.get(opts, :include_timestamp, true)

  :ok = maybe_create_folder(folder)

  spider = inspect(state.spider_name)

  filename =
    case include_timestamp do
      false ->
        "#{spider}.#{extension}"

      true ->
        # Spaces, dashes, colons and dots of the timestamp are replaced with
        # underscores to keep the file name free of separators.
        ts_string =
          NaiveDateTime.utc_now()
          |> NaiveDateTime.to_string()
          |> String.replace(~r/( |-|:|\.)/, "_")

        "#{spider}_#{ts_string}.#{extension}"
    end

  fd = open_fd(folder, filename)
  :ok = write(fd, item)
  {item, Map.put(state, :write_to_file_fd, fd)}
end

defp open_fd(spider_name, folder, extension) do
filename = "#{inspect(spider_name)}.#{extension}"

defp open_fd(folder, filename) do
# Open file descriptor to write items
{:ok, io_device} =
File.open(
Expand Down Expand Up @@ -108,4 +126,15 @@ defmodule Crawly.Pipelines.WriteToFile do
)
end
end

# Ensures `path` exists on disk. Returns :ok when something already exists at
# `path`; otherwise returns the result of File.mkdir_p/1 (which also creates
# any missing parent folders).
defp maybe_create_folder(path) do
  if File.exists?(path) do
    :ok
  else
    File.mkdir_p(path)
  end
end
end
119 changes: 117 additions & 2 deletions test/pipelines/write_to_file_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@ defmodule Pipelines.WriteToFileTest do
use ExUnit.Case, async: false

@binary "Some binary"
@test_path "./write_to_filetests/write_to_file_folder"

setup do
  # @test_path is a directory: File.rm/1 cannot delete directories, so
  # File.rm_rf/1 is used to guarantee every test starts without it.
  File.rm_rf(@test_path)

  # :meck.unload/0 unloads every mocked module, whichever ones a test mocked.
  on_exit(fn ->
    :meck.unload()
  end)
end

Expand Down Expand Up @@ -77,4 +79,117 @@ defmodule Pipelines.WriteToFileTest do

assert_receive @binary
end

# Verifies that the pipeline creates the target folder when it is missing.
test "Create a folder if write folder does not exist", _context do
  test_pid = self()

  # Start from a clean slate so the final assertion proves that the pipeline
  # (and not a previous test run) created the folder.
  File.rm_rf(@test_path)
  refute File.exists?(@test_path)

  :meck.expect(
    IO,
    :write,
    fn _, item ->
      send(test_pid, item)
      :ok
    end
  )

  # No real file is opened; the test process stands in for the IO device.
  :meck.expect(
    File,
    :open,
    fn _path, _opts ->
      {:ok, test_pid}
    end
  )

  pipelines = [
    {Crawly.Pipelines.WriteToFile, folder: @test_path, extension: "csv"}
  ]

  item = @binary

  state = %{spider_name: MySpider}

  # run the pipeline
  _result = Crawly.Utils.pipe(pipelines, item, state)

  assert File.dir?(@test_path)
end

# Verifies that, without an explicit :include_timestamp option, the name of
# the opened file contains the (underscore-separated) timestamp.
test "Timestamp is added to the file by default", _context do
  ts = "2020-05-13 09:06:22.668828"
  expected_ts = "2020_05_13_09_06_22_668828"
  test_pid = self()

  :meck.expect(
    IO,
    :write,
    fn _, _item ->
      :ok
    end
  )

  # Report the requested path to the test process, then answer with the
  # {:ok, device} shape the pipeline matches on after calling File.open.
  :meck.expect(
    File,
    :open,
    fn path, _opts ->
      send(test_pid, {:opened, path})
      {:ok, test_pid}
    end
  )

  # Freeze the timestamp string so the expected file name is deterministic.
  :meck.expect(
    NaiveDateTime,
    :to_string,
    fn _ -> ts end
  )

  pipelines = [
    {Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "csv"}
  ]

  item = @binary

  state = %{spider_name: MySpider}

  # run the pipeline
  _result = Crawly.Utils.pipe(pipelines, item, state)

  # The tagged tuple keeps unrelated messages from satisfying the assertion.
  assert_receive {:opened, path}, 500
  assert String.contains?(path, expected_ts)
end

# Verifies that `include_timestamp: false` yields a plain
# "<spider name>.<extension>" file name.
test "Timestamp is not added if relevant option disabled", _context do
  test_pid = self()

  :meck.expect(
    IO,
    :write,
    fn _, _item ->
      :ok
    end
  )

  # Report the requested path to the test process, then answer with the
  # {:ok, device} shape the pipeline matches on after calling File.open.
  :meck.expect(
    File,
    :open,
    fn path, _opts ->
      send(test_pid, {:opened, path})
      {:ok, test_pid}
    end
  )

  pipelines = [
    {Crawly.Pipelines.WriteToFile,
     folder: "/tmp", extension: "csv", include_timestamp: false}
  ]

  item = @binary

  state = %{spider_name: MySpider}

  # run the pipeline
  _result = Crawly.Utils.pipe(pipelines, item, state)
  assert_receive {:opened, "/tmp/MySpider.csv"}
end
end

0 comments on commit 8c51588

Please sign in to comment.