Take a write lock before using Pantry database #4471
This makes two basic changes to the code:

* Avoids a connection pool, so the SQLite connection is now closed at the
end of each call to withStorage. This seems unfortunate; we would like to
be able to keep connections open. However, SQLite appears to treat such
long-lived connections as holding a write lock even when they are not in
the middle of a database transaction.

* Uses an explicit file lock around the two longest-running write
operations: migration and Hackage index population (a simplified sketch of
this pattern is included below, after the change summary). Ideally the
locking mechanism in SQLite would be sufficient for this, but (1) Hackage
population can take a long time, and (2) evidence has shown that SQLite's
locking doesn't behave as we expect it to.

This may be insufficient for preventing the bug described in #4471, in
which case we may need to apply withWriteLock to more calls. However,
I'd like to start off small to avoid the additional overhead.
snoyberg committed Apr 5, 2019
1 parent f1e1ef5 commit 39d526f
Showing 6 changed files with 91 additions and 27 deletions.
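
The second bullet of the commit message describes the approach taken in Pantry.SQLite below: wrap long write operations in an exclusive lock on a sidecar file next to the database, rather than relying on SQLite's own locking. Here is a minimal, self-contained sketch of that pattern, assuming only the withFileLock/withTryFileLock API from the filelock package that this commit adds as a dependency; the withDbWriteLock name and the ".write-lock" suffix are illustrative, not the actual Pantry code:

import System.FileLock (SharedExclusive (..), withFileLock, withTryFileLock)

-- Take an exclusive lock on a sidecar file next to the database before
-- running a long write operation, so other processes block here instead
-- of failing inside SQLite with SQLITE_BUSY.
withDbWriteLock :: FilePath -> IO a -> IO a
withDbWriteLock dbFile action = do
  let lockFile = dbFile ++ ".write-lock"
  -- Try without blocking first, purely so we can explain why we appear
  -- to hang when another process already holds the lock.
  mres <- withTryFileLock lockFile Exclusive $ \_lock -> action
  case mres of
    Just res -> pure res
    Nothing -> do
      putStrLn "Waiting for another process to release the database write lock..."
      withFileLock lockFile Exclusive $ \_lock -> action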
2 changes: 1 addition & 1 deletion src/Stack/Storage.hs
@@ -130,7 +130,7 @@ withStorage ::
     => ReaderT SqlBackend (RIO env) a
     -> RIO env a
 withStorage inner =
-    SQLite.withStorage inner =<< view (configL . to configStorage)
+    flip SQLite.withStorage_ inner =<< view (configL . to configStorage)
 
 -- | Key used to retrieve configuration or flag cache
 type ConfigCacheKey = Unique ConfigCacheParent
2 changes: 1 addition & 1 deletion subs/pantry/package.yaml
@@ -47,7 +47,6 @@ dependencies:
 - persistent
 - persistent-sqlite >= 2.8.2
 - persistent-template
-- resource-pool
 - Cabal >= 2.4
 - path-io
 - rio-orphans
@@ -62,6 +61,7 @@ dependencies:
 - resourcet
 - rio-prettyprint
 - mtl
+- filelock
 
 # FIXME remove when we drop store
 - integer-gmp
10 changes: 9 additions & 1 deletion subs/pantry/src/Pantry/Hackage.hs
@@ -79,7 +79,9 @@ updateHackageIndex
   :: (HasPantryConfig env, HasLogFunc env)
   => Maybe Utf8Builder -- ^ reason for updating, if any
   -> RIO env DidUpdateOccur
-updateHackageIndex mreason = gateUpdate $ do
+updateHackageIndex mreason = do
+  storage <- view $ pantryConfigL.to pcStorage
+  gateUpdate $ withWriteLock_ storage $ do
     for_ mreason logInfo
     pc <- view pantryConfigL
     let HackageSecurityConfig keyIds threshold url ignoreExpiry = pcHackageSecurity pc
@@ -121,6 +123,12 @@ updateHackageIndex mreason = gateUpdate $ do
     updateCache tarball
     logStickyDone "Package index cache populated"
   where
+    -- This is the one action in the Pantry codebase known to hold a
+    -- write lock on the database for an extended period of time. To
+    -- avoid failures due to SQLite locks failing, we take our own
+    -- lock outside of SQLite for this action.
+    --
+    -- See https://github.com/commercialhaskell/stack/issues/4471
     updateCache tarball = withStorage $ do
       -- Alright, here's the story. In theory, we only ever append to
       -- a tarball. Therefore, we can store the last place we
88 changes: 67 additions & 21 deletions subs/pantry/src/Pantry/SQLite.hs
@@ -1,47 +1,93 @@
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
 module Pantry.SQLite
-  ( P.Storage
+  ( Storage (..)
   , initStorage
-  , withStorage
   ) where
 
 import RIO hiding (FilePath)
-import qualified Pantry.Types as P
 import Database.Persist.Sqlite
 import RIO.Orphans ()
 import Path (Path, Abs, File, toFilePath, parent)
 import Path.IO (ensureDir)
-import Data.Pool (destroyAllResources)
-import Pantry.Types (PantryException (MigrationFailure))
+import Pantry.Types (PantryException (MigrationFailure), Storage (..))
+import System.FileLock (withFileLock, withTryFileLock, SharedExclusive (..))
 
 initStorage
   :: HasLogFunc env
   => Text
   -> Migration
   -> Path Abs File -- ^ storage file
-  -> (P.Storage -> RIO env a)
+  -> (Storage -> RIO env a)
   -> RIO env a
 initStorage description migration fp inner = do
   ensureDir $ parent fp
-  bracket
-    (createSqlitePoolFromInfo (sqinfo False) 1)
-    (liftIO . destroyAllResources) $ \pool -> do
-    migrates <- wrapMigrationFailure $ runSqlPool (runMigrationSilent migration) pool
-    forM_ migrates $ \mig -> logDebug $ "Migration executed: " <> display mig
-    bracket
-      (createSqlitePoolFromInfo (sqinfo True) 1)
-      (liftIO . destroyAllResources) $ \pool -> inner (P.Storage pool)
+
+  migrates <- withWriteLock fp $ wrapMigrationFailure $
+    withSqliteConnInfo (sqinfo True) $ runReaderT $
+      runMigrationSilent migration
+  forM_ migrates $ \mig -> logDebug $ "Migration executed: " <> display mig
+
+  inner $ Storage
+    { withStorage_ = withSqliteConnInfo (sqinfo False) . runSqlConn
+    , withWriteLock_ = withWriteLock fp
+    }
   where
     wrapMigrationFailure = handleAny (throwIO . MigrationFailure description fp)
-    sqinfo fk = set extraPragmas ["PRAGMA busy_timeout=2000;"]
-        $ set fkEnabled fk
+
+    sqinfo isMigration
+        = set extraPragmas ["PRAGMA busy_timeout=2000;"]
+
+        -- When doing a migration, we want to disable foreign key
+        -- checking, since the order in which tables are created by
+        -- the migration scripts may not respect foreign keys. The
+        -- rest of the time: enforce those foreign keys.
+        $ set fkEnabled (not isMigration)
+
+        -- This one is subtle. Enabling Write-Ahead Logging (WAL)
+        -- mode is persistent: the next usage of the database is
+        -- still in WAL mode until explicitly disabled. Therefore,
+        -- when we're already migrated the database, there's no
+        -- need to waste time reenabling WAL. Skipping this also
+        -- allows us to get slightly more meaningful error messages
+        -- if we run into the SQLITE_BUSY bug again.
+        $ set walEnabled isMigration
+
         $ mkSqliteConnectionInfo (fromString $ toFilePath fp)
 
-withStorage
+-- | Ensure that only one process is trying to write to the database
+-- at a time. See
+-- https://github.com/commercialhaskell/stack/issues/4471 and comments
+-- above.
+withWriteLock
   :: HasLogFunc env
-  => ReaderT SqlBackend (RIO env) a
-  -> P.Storage
+  => Path Abs File -- ^ SQLite database file
+  -> RIO env a
   -> RIO env a
-withStorage action (P.Storage pool) =
-  runSqlPool action pool
+withWriteLock dbFile inner = do
+  let lockFile = toFilePath dbFile ++ ".pantry-write-lock"
+  withRunInIO $ \run -> do
+    mres <- withTryFileLock lockFile Exclusive $ const $ run inner
+    case mres of
+      Just res -> pure res
+      Nothing -> do
+        run $ logInfo "Unable to get a write lock on the Pantry database, waiting..."
+        shouldStopComplainingVar <- newTVarIO False
+        let complainer = fix $ \loop -> do
+              delay <- registerDelay $ 60 * 1000 * 1000 -- 1 minute
+              shouldComplain <-
+                atomically $
+                  -- Delay has triggered, time to complain again
+                  (readTVar delay >>= checkSTM >> pure True) <|>
+                  -- Time to stop complaining, ignore that delay immediately
+                  (readTVar shouldStopComplainingVar >>= checkSTM >> pure False)
+              when shouldComplain $ do
+                run $ logWarn "Still waiting on the Pantry database write lock..."
+                loop
+            stopComplaining = atomically $ writeTVar shouldStopComplainingVar True
+            worker = withFileLock lockFile Exclusive $ const $ do
+              run $ logInfo "Acquired the Pantry database write lock"
+              stopComplaining
+              run inner
+        runConcurrently $ Concurrently complainer
+                       *> Concurrently (worker `finally` stopComplaining)
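
As an aside, the waiting branch of withWriteLock above combines withFileLock with an STM loop that repeats a warning every minute until the lock is acquired. Stripped of the RIO specifics, the same idiom looks roughly like this sketch; the withPeriodicNag helper is hypothetical, written against plain IO with the async and stm packages, and registerDelay needs the threaded RTS:

import Control.Applicative ((<|>))
import Control.Concurrent.Async (Concurrently (..))
import Control.Concurrent.STM
import Control.Monad (when)
import Data.Function (fix)

-- Run an action while printing a reminder at a fixed interval until the
-- action completes; the nagging loop stops as soon as doneVar is set.
withPeriodicNag :: Int -> String -> IO a -> IO a
withPeriodicNag intervalMicros message action = do
  doneVar <- newTVarIO False
  let nagger = fix $ \loop -> do
        delay <- registerDelay intervalMicros
        shouldNag <- atomically $
              -- the delay fired: time to nag again
              (readTVar delay >>= check >> pure True)
              -- the action finished: stop immediately
          <|> (readTVar doneVar >>= check >> pure False)
        when shouldNag $ putStrLn message >> loop
      worker = action <* atomically (writeTVar doneVar True)
  -- Concurrently runs both; if the worker throws, the nagger is cancelled.
  runConcurrently $ Concurrently nagger *> Concurrently worker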
2 changes: 1 addition & 1 deletion subs/pantry/src/Pantry/Storage.hs
@@ -241,7 +241,7 @@ withStorage
   => ReaderT SqlBackend (RIO env) a
   -> RIO env a
 withStorage action =
-  SQLite.withStorage action =<< view (P.pantryConfigL.to P.pcStorage)
+  flip SQLite.withStorage_ action =<< view (P.pantryConfigL.to P.pcStorage)
 
 getPackageNameId
   :: (HasPantryConfig env, HasLogFunc env)
14 changes: 12 additions & 2 deletions subs/pantry/src/Pantry/Types.hs
@@ -6,6 +6,7 @@
 {-# LANGUAGE DeriveDataTypeable #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RankNTypes #-}
 {-# LANGUAGE RecordWildCards #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE TupleSections #-}
@@ -142,7 +143,6 @@ import Network.HTTP.Types (Status, statusCode)
 import Data.Text.Read (decimal)
 import Path (Path, Abs, Dir, File, toFilePath, filename, (</>), parseRelFile)
 import Path.IO (resolveFile, resolveDir)
-import Data.Pool (Pool)
 import Data.List.NonEmpty (NonEmpty)
 import qualified Data.List.NonEmpty as NE
 
@@ -197,7 +197,17 @@ cabalFileName name =
 newtype Revision = Revision Word
   deriving (Generic, Show, Eq, NFData, Data, Typeable, Ord, Hashable, Display, PersistField, PersistFieldSql)
 
-newtype Storage = Storage (Pool SqlBackend)
+-- | Represents a SQL database connection. This used to be a newtype
+-- wrapper around a connection pool. However, when investigating
+-- <https://github.com/commercialhaskell/stack/issues/4471>, it
+-- appeared that holding a pool resulted in overly long write locks
+-- being held on the database. As a result, we now abstract away
+-- whether a pool is used, and the default implementation in
+-- "Pantry.Storage" does not use a pool.
+data Storage = Storage
+  { withStorage_ :: (forall env a. HasLogFunc env => ReaderT SqlBackend (RIO env) a -> RIO env a)
+  , withWriteLock_ :: (forall env a. HasLogFunc env => RIO env a -> RIO env a)
+  }
 
 -- | Configuration value used by the entire pantry package. Create one
 -- using @withPantryConfig@. See also @PantryApp@ for a convenience
