Skip to content

Commit

Permalink
[hack] Track the lockfile across renames / deletes
Browse files Browse the repository at this point in the history
Summary: We've been having some problems with multiple hh_servers
running on the same directory, and I think this may be the problem.
Since Unix file locks are associated with file descriptors, not actual
files, we have the following failure mode:

1. tmpwatch deletes the lockfile
2. hh_server 1 thinks it still holds the lock, since the file descriptor
   to the deleted file still exists
3. hh_server 2 starts up, creates a new lockfile with the same name, and
   successfully locks the file

Additionally, the previous code would periodically check if the lockfile
could be locked, and only re-grab the lock if it finds that it can no
longer lock it. I assume the intent of the code was to check if we
actually owned the lockfile, not whether we _could_ own it, since that
doesn't make sense at all.

Reviewed By: @dlreeves

Differential Revision: D2251973
  • Loading branch information
int3 authored and facebook-github-bot-5 committed Jul 16, 2015
1 parent 4d0555e commit ba11b86
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 15 deletions.
12 changes: 4 additions & 8 deletions hack/server/serverFunctors.ml
Expand Up @@ -172,14 +172,10 @@ end = struct
let root = ServerArgs.root genv.options in
let env = ref env in
while true do
if not (Lock.check root "lock") then begin
Hh_logger.log "Lost %s lock; reacquiring.\n" Program.name;
HackEventLogger.lock_lost root "lock";
if not (Lock.grab root "lock")
then
Hh_logger.log "Failed to reacquire lock; terminating.\n";
HackEventLogger.lock_stolen root "lock";
die()
if not (Lock.grab root "lock") then begin
Hh_logger.log "Lost lock; terminating.\n%!";
HackEventLogger.lock_stolen root "lock";
die()
end;
ServerPeriodical.call_before_sleeping();
let has_client = sleep_and_check socket in
Expand Down
1 change: 0 additions & 1 deletion hack/stubs/hackEventLogger.ml
Expand Up @@ -16,7 +16,6 @@ let load_script_done _ = ()
let load_failed _ = ()
let out_of_date _ = ()
let killed _ = ()
let lock_lost _ _ = ()
let lock_stolen _ _ = ()
let client_startup _ = ()
let client_begin_work _ = ()
Expand Down
35 changes: 29 additions & 6 deletions hack/utils/lock.ml
Expand Up @@ -25,6 +25,13 @@ let lock_name root file =
let root_part = Path.slash_escaped_string_of_path root in
Printf.sprintf "%s/%s.%s" tmp_dir root_part file

let register_lock lock_file =
Sys_utils.with_umask 0o111 begin fun () ->
let fd = Unix.descr_of_out_channel (open_out lock_file) in
lock_fds := SMap.add lock_file fd !lock_fds;
fd
end

(**
* Grab or check if a file lock is available.
*
Expand All @@ -34,13 +41,29 @@ let _operations root op file : bool =
try
let lock_file = lock_name root file in
let fd = match SMap.get lock_file !lock_fds with
| None ->
Sys_utils.with_umask 0o111 begin fun () ->
let fd = Unix.descr_of_out_channel (open_out lock_file) in
lock_fds := SMap.add lock_file fd !lock_fds;
| None -> register_lock lock_file
| Some fd ->
let st = Unix.fstat fd in
let identical_file =
try
(* Note: I'm carefully avoiding opening another fd to the
* lock_file when doing this check, because closing any file
* descriptor to a given file will release the locks on *all*
* file descriptors that point to that file. Fortunately, stat()
* gets us our information without opening a fd *)
let current_st = Unix.stat lock_file in
Unix.(st.st_dev = current_st.st_dev &&
st.st_ino = current_st.st_ino)
with _ ->
false
in
if not identical_file then begin
(* Looks like someone (tmpwatch?) deleted the lock file; just
* create another one *)
Unix.close fd;
register_lock lock_file
end else
fd
end
| Some fd -> fd
in
let _ = Unix.lockf fd op 1 in
true
Expand Down

0 comments on commit ba11b86

Please sign in to comment.