Skip to content

Commit

Permalink
Do an efficient incremental cache update
Browse files Browse the repository at this point in the history
Thanks to @phadej for the inspiration for this in his comment:
haskell/hackage-server#779 (comment)
  • Loading branch information
snoyberg committed Jul 17, 2018
1 parent 7aecad8 commit 33ef253
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 4 deletions.
53 changes: 49 additions & 4 deletions subs/pantry/src/Pantry/Hackage.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,22 @@ module Pantry.Hackage

import RIO
import Conduit
import Crypto.Hash.Conduit (sinkHash)
import Data.Conduit.Tar
import qualified RIO.Text as T
import Data.Text.Unsafe (unsafeTail)
import qualified RIO.ByteString as B
import qualified RIO.ByteString.Lazy as BL
import Pantry.Types
import Pantry.Storage
import Pantry.StaticSHA256
import Network.URI (parseURI)
import Network.HTTP.Client.TLS (getGlobalManager)
import Data.Time (getCurrentTime)
import RIO.FilePath ((</>))
import qualified Distribution.Text
import Distribution.Types.PackageName (mkPackageName)
import System.IO (SeekMode (..))

import qualified Hackage.Security.Client as HS
import qualified Hackage.Security.Client.Repository.Cache as HS
Expand Down Expand Up @@ -78,20 +81,62 @@ updateHackageIndex = do
HS.HasUpdates -> logInfo "Updated package index downloaded"

withStorage $ do
clearHackageRevisions
populateCache tarball `onException`
-- Alright, here's the story. In theory, we only ever append to
-- a tarball. Therefore, we can store the last place we
-- populated our cache from, and fast forward to that point. But
-- there are two issues with that:
--
-- 1. Hackage may rebase, in which case we need to recalculate
-- everything from the beginning. Unfortunately,
-- hackage-security doesn't let us know when that happens.
--
-- 2. Some paranoia about files on the filesystem getting
-- modified out from under us.
--
-- Therefore, we store both the last read-to index, _and_ the
-- SHA256 of all of the contents until that point. When updating
-- the cache, we calculate the new SHA256 of the whole file, and
-- the SHA256 of the previous read-to point. If the old hashes
-- match, we can do an efficient fast forward. Otherwise, we
-- clear the old cache and repopulate.
minfo <- loadLatestCacheUpdate
(offset, newHash, newSize) <- lift $ withBinaryFile tarball ReadMode $ \h -> do
logInfo "Calculating hashes to check for hackage-security rebases"
newSize <- fromIntegral <$> hFileSize h
(offset, newHash) <-
case minfo of
Nothing -> do
logInfo "No old cache found, populating cache from scratch"
newHash <- runConduit $ sourceHandle h .| sinkHash
pure (0, mkStaticSHA256FromDigest newHash)
Just (oldSize, oldHash) -> do
(oldHash', newHash) <- runConduit $ sourceHandle h .| getZipSink ((,)
<$> ZipSink (takeCE (fromIntegral oldSize) .| sinkHash)
<*> ZipSink sinkHash)
offset <-
if oldHash == mkStaticSHA256FromDigest oldHash'
then oldSize <$ logInfo "Updating preexisting cache, should be quick"
else 0 <$ logInfo "Package index was rebased, forcing a recache"
pure (offset, mkStaticSHA256FromDigest newHash)
pure (offset, newHash, newSize)

when (offset == 0) clearHackageRevisions
populateCache tarball (fromIntegral offset) `onException`
lift (logStickyDone "Failed populating package index cache")
storeCacheUpdate newSize newHash
logStickyDone "Package index cache populated"

-- | Populate the SQLite tables with Hackage index information.
populateCache
:: (HasPantryConfig env, HasLogFunc env)
=> FilePath -- ^ tarball
-> Integer -- ^ where to start processing from
-> ReaderT SqlBackend (RIO env) ()
populateCache fp = do
populateCache fp offset = withBinaryFile fp ReadMode $ \h -> do
lift $ logInfo "Populating package index cache ..."
counter <- newIORef (0 :: Int)
withSourceFile fp $ \src -> runConduit $ src .| untar (perFile counter)
hSeek h AbsoluteSeek offset
runConduit $ sourceHandle h .| untar (perFile counter)
where

perFile counter fi
Expand Down
35 changes: 35 additions & 0 deletions subs/pantry/src/Pantry/Storage.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ module Pantry.Storage
, storeHackageRevision
, loadHackagePackageVersions
, loadHackageCabalFile
, loadLatestCacheUpdate
, storeCacheUpdate
-- avoid warnings
, BlobTableId
, HackageId
Expand All @@ -29,6 +31,7 @@ import Database.Persist.TH
import RIO.Orphans ()
import Pantry.StaticSHA256
import qualified RIO.Map as Map
import RIO.Time (UTCTime, getCurrentTime)

share [mkPersist sqlSettings, mkMigrate "migrateAll"] [persistLowerCase|
BlobTable sql=blob
Expand All @@ -47,6 +50,10 @@ Hackage
revision Word
cabal BlobTableId
UniqueHackage name version revision
CacheUpdate
time UTCTime
size Word
hash StaticSHA256
|]

initStorage
Expand Down Expand Up @@ -157,3 +164,31 @@ loadHackageCabalFile name version cfi = do
withHackEnt = traverse $ \(Entity _ h) -> do
Just blob <- get $ hackageCabal h
pure $ blobTableContents blob

{-
CacheUpdate
time UTCTime
size Word
hash StaticSHA256
-}

loadLatestCacheUpdate
:: (HasPantryConfig env, HasLogFunc env)
=> ReaderT SqlBackend (RIO env) (Maybe (Word, StaticSHA256))
loadLatestCacheUpdate =
fmap go <$> selectFirst [] [Desc CacheUpdateTime]
where
go (Entity _ cu) = (cacheUpdateSize cu, cacheUpdateHash cu)

storeCacheUpdate
:: (HasPantryConfig env, HasLogFunc env)
=> Word
-> StaticSHA256
-> ReaderT SqlBackend (RIO env) ()
storeCacheUpdate size hash' = do
now <- getCurrentTime
insert_ CacheUpdate
{ cacheUpdateTime = now
, cacheUpdateSize = size
, cacheUpdateHash = hash'
}

0 comments on commit 33ef253

Please sign in to comment.