Skip to content

Commit

Permalink
Use SHA1(Entry) to distribute entries in sub dirs
Browse files Browse the repository at this point in the history
Previously, we wrote all entries to $BUCKET/encode($ENTRY).
The bucket subdirectory could become large enough to run into file
system limits on the number of files in a directory or file system
performance issues with finding entries in a large directory.

The new format takes the following form:

    $BUCKET/$AA/$BB/$CC/$DD/encode($ENTRY)

where $AA, ..., $DD correspond to the first four bytes of
SHA1(encode($ENTRY)) in hex.

Each directory level will contain at most 256 subdirectories. This keeps
the directory size well below file system limits. We chose two-character
subdirs so that filesystems with linear directory search algorithms
would be able to find entries quickly. The cost of the subdirectories
is inode usage. So for a given set of files we incur a 4x cost in
inode use.

We chose four levels as an unscientific compromise of inode cost and
size of fileset that will avoid large directories. With four levels,
we expect to be able to handle ~2^32 files consuming ~2^34 inodes.

By keeping the encoded entry as the basename of the file, we can support
bucket listing via recursive file globbing and end up with an on-disk
format that is a bit easier for a human to understand/debug.

Also worth noting is that we do not remove empty subdirectories when
entries are deleted. At this time we plan to leave this as known
behavior: inode usage will increase over time. Inode use can be
reclaimed by removing empty directories and this could be done
manually with the bookshelf service stopped. If this becomes a pain
point, we could establish optional behavior on restart to do the
cleaning. Removing the directories on-demand introduces a need for
locking which we'd prefer to avoid if possible.

Misc Dialyzer fix ups

* Remove unused fun head for entry_delete

* Add opscoderl_wm to analysis
  • Loading branch information
Seth Falcon committed Sep 10, 2013
1 parent c9870a1 commit a4bfaa1
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 41 deletions.
3 changes: 2 additions & 1 deletion Makefile
Expand Up @@ -9,7 +9,8 @@ DIALYZER_DEPS = deps/erlsom/ebin \
deps/iso8601/ebin \
deps/mini_s3/ebin \
deps/mochiweb/ebin \
deps/webmachine/ebin
deps/webmachine/ebin \
deps/opscoderl_wm/ebin

DEPS_PLT = bookshelf.plt

Expand Down
32 changes: 19 additions & 13 deletions src/bksw_io.erl
Expand Up @@ -37,6 +37,8 @@

%% Handle for an entry that is open for I/O. open_for_write/2 populates
%% every field and finish_write/1 consumes them.
-record(entryref, {fd :: file:io_device(),
%% Path of the file backing fd; for writes this is the temporary file
%% name produced by bksw_io_names:write_path/2.
path :: string() | binary(),
%% Bucket and entry names as passed to open_for_write/2. finish_write/1
%% uses them to compute the final on-disk entry path.
bucket :: binary(),
entry :: binary(),
%% Running MD5 context (erlang:md5_init/0), finalized in finish_write/1.
%% NOTE(review): presumably undefined for refs not opened for write - confirm.
ctx :: undefined | binary()}).

-include_lib("kernel/include/file.hrl").
Expand All @@ -61,7 +63,7 @@ bucket_list() ->
entry_list(Bucket) ->
BucketPath = bksw_io_names:bucket_path(Bucket),
%% As of R16, second arg to filelib:wildcard must be string
filter_entries(Bucket, filelib:wildcard("*", bksw_util:to_string(BucketPath))).
filter_entries(Bucket, filelib:wildcard("*/*/*/*/*", bksw_util:to_string(BucketPath))).

-spec bucket_exists(binary()) -> boolean().
bucket_exists(Bucket) ->
Expand Down Expand Up @@ -99,9 +101,7 @@ delete_bucket_dir(Bucket) ->
%% Delete the entry named Entry in Bucket: resolve its full on-disk path
%% and delegate to entry_delete/1.
entry_delete(Bucket, Entry) ->
    FullPath = bksw_io_names:entry_path(Bucket, Entry),
    entry_delete(FullPath).

-spec entry_delete(#object{} | binary()) -> boolean().
entry_delete(#object{path=Path}) ->
entry_delete(bksw_io_names:entry_path(Path));
-spec entry_delete(binary()) -> boolean().
entry_delete(FullPath) ->
case file:delete(FullPath) of
ok ->
Expand All @@ -118,16 +118,17 @@ entry_exists(Bucket, Path) ->

-spec open_for_write(binary(), binary()) -> {ok, #entryref{}} | {error, term()}.
open_for_write(Bucket, Entry) ->
EntryPath = bksw_io_names:entry_path(Bucket, Entry),
FileName = bksw_io_names:write_path(EntryPath),
FileName = bksw_io_names:write_path(Bucket, Entry),
filelib:ensure_dir(FileName),
case file:open(FileName, [exclusive, write, binary]) of
{ok, Fd} ->
%% Magic number to guard against file corruption
case file:write(Fd, ?MAGIC_NUMBER) of
ok ->
{ok, ?TOTAL_HEADER_SIZE_BYTES} = file:position(Fd, {bof, ?TOTAL_HEADER_SIZE_BYTES}),
{ok, #entryref{fd=Fd, path=FileName, ctx=erlang:md5_init()}};
{ok, #entryref{fd=Fd, path=FileName,
bucket=Bucket, entry=Entry,
ctx=erlang:md5_init()}};
Error ->
file:close(Fd),
Error
Expand Down Expand Up @@ -211,20 +212,25 @@ abort_write(#entryref{fd=Fd, path=Path}) ->
file:delete(Path).

-spec finish_write(#entryref{}) -> {ok, binary()} | {error, file:posix() | badarg}.
finish_write(#entryref{fd=Fd, path=Path, ctx=Ctx}) ->
finish_write(#entryref{fd=Fd, path=Path, bucket=Bucket, entry=Entry, ctx=Ctx}) ->
case file:sync(Fd) of
ok ->
Digest = erlang:md5_final(Ctx),
%% Seek to metadata section of file
{ok, ?MAGIC_NUMBER_SIZE_BYTES} = file:position(Fd, {bof, ?MAGIC_NUMBER_SIZE_BYTES}),
file:write(Fd, Digest),
file:close(Fd),
Entry = bksw_io_names:write_path_to_entry(Path),
case file:rename(Path, Entry) of
FinalPath = bksw_io_names:entry_path(Bucket, Entry),
case filelib:ensure_dir(FinalPath) of
ok ->
{ok, Digest};
Error ->
Error
case file:rename(Path, FinalPath) of
ok ->
{ok, Digest};
Error ->
Error
end;
DirError ->
DirError
end;
Error ->
file:close(Fd),
Expand Down
57 changes: 30 additions & 27 deletions src/bksw_io_names.erl
Expand Up @@ -23,9 +23,8 @@
entry_path/1,
entry_path/2,
parse_path/1,
write_path/2,
write_path/1,
write_path_to_entry/1]).
write_path/2
]).

-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
Expand All @@ -47,11 +46,24 @@ bucket_path(Bucket) when Bucket =/= <<>> ->

entry_path(BucketEntryPath) when BucketEntryPath =/= <<>> ->
Root = bksw_conf:disk_store(),
filename:join([Root, BucketEntryPath]).
EP = entry_path_sha(BucketEntryPath),
iolist_to_binary(filename:join([Root, EP])).

entry_path(Bucket, Entry) when Bucket =/= <<>> andalso Entry =/= <<>> ->
Root = bksw_conf:disk_store(),
filename:join([Root, encode(Bucket), encode(Entry)]).
EP = filename:join([Root, encode(Bucket), entry_path_sha(Entry)]),
iolist_to_binary(EP).

%% Build the bucket-relative path for an entry: four two-character
%% directory names taken from the first eight hex digits of the SHA of the
%% encoded entry, followed by the encoded entry itself as the basename.
entry_path_sha(Path) ->
    Encoded = encode(Path),
    [A1, A2, B1, B2, C1, C2, D1, D2 | _Rest] = sha_str(Encoded),
    SubDirs = [[A1, A2], [B1, B2], [C1, C2], [D1, D2]],
    filename:join(SubDirs ++ [Encoded]).

%% Return the SHA-1 digest of the iodata X as a flat, lowercase,
%% 40-character hex string.
%%
%% Uses crypto:hash/2 rather than crypto:sha/1: the latter is deprecated
%% as of R16 and removed in OTP 20, while crypto:hash/2 is available from
%% R15B02 onwards.
sha_str(X) ->
    sha_to_hex_str(crypto:hash(sha, X)).

%% Render a 20-byte SHA-1 digest as hex. The digest is read as one 160-bit
%% integer and printed base 16, zero-padded to 40 characters so leading
%% zero bytes are preserved. The result is flattened because callers
%% pattern-match on the individual characters.
sha_to_hex_str(<<SHA:160/big-unsigned-integer>>) ->
    lists:flatten(io_lib:format("~40.16.0b", [SHA])).

parse_path(Path) when is_binary(Path) ->
parse_path(binary_to_list(Path));
Expand All @@ -69,37 +81,28 @@ parse_path(Path) when is_list(Path) ->
{entry, decode(Bucket), decode(filename:basename(Path))}
end.

-spec write_path(string() | binary(), string() | binary()) -> binary().
write_path(Bucket, Path) ->
write_path(entry_path(Bucket, Path)).

-spec write_path(string() | binary()) -> string() | binary().
write_path(Entry) when is_binary(Entry) ->
list_to_binary(write_path(binary_to_list(Entry)));
write_path(Entry) when is_list(Entry) ->
Root = bksw_conf:disk_store(),
{T1, T2, T3} = erlang:now(),
FileName = lists:flatten([Entry, io_lib:format(".~p~p~p_bkwbuf", [T1, T2, T3])]),
case filelib:wildcard(FileName) of
[] ->
FileName;
[_] ->
write_path(Entry)
end.

write_path_to_entry(TempName) ->
filename:join([filename:dirname(TempName), filename:rootname(filename:basename(TempName))]).
UniqueExt = io_lib:format(".~p~p~p_bkwbuf", [T1, T2, T3]),
iolist_to_binary([Root, "/", encode(Bucket), "-", sha_str(encode(Path)), UniqueExt]).

-ifdef(TEST).
encode_decode_test() ->
?assertMatch(<<"testing%20123">>, encode(<<"testing 123">>)),
?assertMatch("testing%20123", encode("testing 123")).
?assertEqual(<<"testing%20123">>, encode(<<"testing 123">>)),
?assertEqual("testing%20123", encode("testing 123")).

bucket_path_test() ->
?assertMatch(<<"/tmp/foo">>, bucket_path(<<"foo">>)),
?assertMatch(<<"/tmp/hello%20world">>, bucket_path(<<"hello world">>)).
?assertEqual(<<"/tmp/foo">>, bucket_path(<<"foo">>)),
?assertEqual(<<"/tmp/hello%20world">>, bucket_path(<<"hello world">>)).

entry_path_test() ->
?assertMatch(<<"/tmp/foo/bar">>, entry_path(<<"foo">>, <<"bar">>)),
?assertMatch(<<"/tmp/foo/entry%20path">>, entry_path(<<"foo">>, <<"entry path">>)).
?assertEqual(<<"/tmp/foo/62/cd/b7/02/bar">>,
entry_path(<<"foo">>, <<"bar">>)),

?assertEqual(<<"/tmp/foo/74/a0/4a/95/entry%20path%2Fabc">>,
entry_path(<<"foo">>, <<"entry path/abc">>)).

parse_path_test() ->
?assertMatch({entry, "foo", "test entry"}, parse_path("/tmp/foo/test%20entry")),
Expand Down

0 comments on commit a4bfaa1

Please sign in to comment.