[hack] Track the lockfile across renames / deletes

Summary: We've been having some problems with multiple hh_servers running on the same directory, and I think this may be the problem. Since Unix file locks are associated with file descriptors, not actual files, we have the following failure mode: 1. tmpwatch deletes the lockfile 2. hh_server 1 thinks it still holds the lock, since the file descriptor to the deleted file still exists 3. hh_server 2 starts up, creates a new lockfile with the same name, and successfully locks the file Additionally, the previous code would periodically check if the lockfile could be locked, and only re-grab the lock if it finds that it can no longer lock it. I assume the intent of the code was to check if we actually owned the lockfile, not whether we _could_ own it, since that doesn't make sense at all. Reviewed By: @dlreeves Differential Revision: D2251973
facebook · Jul 16, 2015 · ba11b86 · ba11b86
1 parent 4d0555e
commit ba11b86
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 15 deletions.
diff --git a/hack/server/serverFunctors.ml b/hack/server/serverFunctors.ml
@@ -172,14 +172,10 @@ end = struct
     let root = ServerArgs.root genv.options in
     let env = ref env in
     while true do
-      if not (Lock.check root "lock") then begin
-        Hh_logger.log "Lost %s lock; reacquiring.\n" Program.name;
-        HackEventLogger.lock_lost root "lock";
-        if not (Lock.grab root "lock")
-        then
-          Hh_logger.log "Failed to reacquire lock; terminating.\n";
-          HackEventLogger.lock_stolen root "lock";
-          die()
+      if not (Lock.grab root "lock") then begin
+        Hh_logger.log "Lost lock; terminating.\n%!";
+        HackEventLogger.lock_stolen root "lock";
+        die()
       end;
       ServerPeriodical.call_before_sleeping();
       let has_client = sleep_and_check socket in

diff --git a/hack/stubs/hackEventLogger.ml b/hack/stubs/hackEventLogger.ml
@@ -16,7 +16,6 @@ let load_script_done _ = ()
 let load_failed _ = ()
 let out_of_date _ = ()
 let killed _ = ()
-let lock_lost _ _ = ()
 let lock_stolen _ _ = ()
 let client_startup _ = ()
 let client_begin_work _ = ()

diff --git a/hack/utils/lock.ml b/hack/utils/lock.ml
@@ -25,6 +25,13 @@ let lock_name root file =
     let root_part = Path.slash_escaped_string_of_path root in
     Printf.sprintf "%s/%s.%s" tmp_dir root_part file
 
+let register_lock lock_file =
+  Sys_utils.with_umask 0o111 begin fun () ->
+    let fd = Unix.descr_of_out_channel (open_out lock_file) in
+    lock_fds := SMap.add lock_file fd !lock_fds;
+    fd
+  end
+
 (**
  * Grab or check if a file lock is available.
  *
@@ -34,13 +41,29 @@ let _operations root op file : bool =
   try
     let lock_file = lock_name root file in
     let fd = match SMap.get lock_file !lock_fds with
-      | None ->
-          Sys_utils.with_umask 0o111 begin fun () ->
-            let fd = Unix.descr_of_out_channel (open_out lock_file) in
-            lock_fds := SMap.add lock_file fd !lock_fds;
+      | None -> register_lock lock_file
+      | Some fd ->
+          let st = Unix.fstat fd in
+          let identical_file =
+            try
+              (* Note: I'm carefully avoiding opening another fd to the
+               * lock_file when doing this check, because closing any file
+               * descriptor to a given file will release the locks on *all*
+               * file descriptors that point to that file. Fortunately, stat()
+               * gets us our information without opening a fd *)
+              let current_st = Unix.stat lock_file in
+              Unix.(st.st_dev = current_st.st_dev &&
+                st.st_ino = current_st.st_ino)
+            with _ ->
+              false
+          in
+          if not identical_file then begin
+            (* Looks like someone (tmpwatch?) deleted the lock file; just
+             * create another one *)
+            Unix.close fd;
+            register_lock lock_file
+          end else
             fd
-          end
-      | Some fd -> fd
     in
     let _ = Unix.lockf fd op 1 in
     true