Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read dir performance improvements #2642

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
129 changes: 84 additions & 45 deletions core/src/Streamly/Internal/FileSystem/Dir.hs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ module Streamly.Internal.FileSystem.Dir
, readDirs
, readEither
, readEitherPaths
, readEitherChunks
, _readEitherChunks
, readEitherByteChunks

-- We can implement this in terms of readAttrsRecursive without losing
-- perf.
Expand Down Expand Up @@ -86,22 +89,22 @@ import Control.Monad.Catch (MonadCatch)
import Control.Monad.IO.Class (MonadIO(..))
import Data.Bifunctor (bimap)
import Data.Either (isRight, isLeft, fromLeft, fromRight)
import Data.Function ((&))
import Streamly.Data.Stream (Stream)
import Streamly.Internal.Data.Unfold (Step(..))
import Streamly.Internal.Data.Unfold.Type (Unfold(..))
import System.FilePath ((</>))
#if (defined linux_HOST_OS) || (defined darwin_HOST_OS)
import System.Posix (DirStream, openDirStream, readDirStream, closeDirStream)
#elif defined(mingw32_HOST_OS)
import Streamly.Internal.FileSystem.Path (Path)
#if defined(mingw32_HOST_OS) || defined(__MINGW32__)
import qualified System.Win32 as Win32
#else
#error "Unsupported architecture"
import Streamly.Internal.FileSystem.ReadDir
(openDirStream, readDirStreamEither, readEitherChunks, readEitherByteChunks)
import System.Posix.Directory (DirStream, closeDirStream)
#endif
import qualified Streamly.Internal.Data.Fold as Fold
import qualified Streamly.Internal.Data.Stream as S
import qualified Streamly.Data.Unfold as UF
import qualified Streamly.Internal.Data.Unfold as UF (mapM2, bracketIO)
import qualified Streamly.Data.Stream as S
import qualified System.Directory as Dir
import qualified Streamly.Internal.FileSystem.Path as Path

import Prelude hiding (read)

Expand Down Expand Up @@ -239,29 +242,32 @@ toStreamWithBufferOf chunkSize h = AS.concat $ toChunksWithBufferOf chunkSize h

-- XXX exception handling

#if (defined linux_HOST_OS) || (defined darwin_HOST_OS)
#if !defined(mingw32_HOST_OS) && !defined(__MINGW32__)
{-# INLINE streamReader #-}
streamReader :: MonadIO m => Unfold m DirStream FilePath
streamReader = Unfold step return
streamReader :: MonadIO m => Unfold m DirStream Path
streamReader = fmap (either id id) streamEitherReader

{-# INLINE streamEitherReader #-}
streamEitherReader :: MonadIO m =>
Unfold m DirStream (Either Path Path)
streamEitherReader = Unfold step return
where

step strm = do
-- XXX Use readDirStreamMaybe
file <- liftIO $ readDirStream strm
case file of
[] -> return Stop
_ -> return $ Yield file strm
r <- liftIO $ readDirStreamEither strm
case r of
Nothing -> return Stop
Just x -> return $ Yield x strm

#elif defined(mingw32_HOST_OS)
#else
openDirStream :: String -> IO (Win32.HANDLE, Win32.FindData)
openDirStream = Win32.findFirstFile

closeDirStream :: (Win32.HANDLE, Win32.FindData) -> IO ()
closeDirStream (h, _) = Win32.findClose h

{-# INLINE streamReader #-}
streamReader :: MonadIO m => Unfold m (Win32.HANDLE, Win32.FindData) FilePath
streamReader :: MonadIO m => Unfold m (Win32.HANDLE, Win32.FindData) Path
streamReader = Unfold step return

where
Expand All @@ -270,25 +276,25 @@ streamReader = Unfold step return
more <- liftIO $ Win32.findNextFile h fdat
if more
then do
file <- liftIO $ Win32.getFindDataFileName fdat
return $ Yield file (h, fdat)
filepath <- liftIO $ Win32.getFindDataFileName fdat
filename <- Path.fromString filepath
return $ Yield filename (h, fdat)
else return Stop
#endif

-- | Read a directory emitting a stream with names of the children. Filter out
-- "." and ".." entries.
--
-- /Internal/
--

{-# INLINE reader #-}
reader :: (MonadIO m, MonadCatch m) => Unfold m FilePath FilePath
reader :: (MonadIO m, MonadCatch m) => Unfold m Path Path
reader =
-- XXX Instead of using bracketIO for each iteration of the loop we should
-- instead yield a buffer of dir entries in each iteration and then use an
-- unfold and concat to flatten those entries. That should improve the
-- performance.
UF.bracketIO openDirStream closeDirStream streamReader
& UF.filter (\x -> x /= "." && x /= "..")

-- XXX We can use a more general mechanism to filter the contents of a
-- directory. We can just stat each child and pass on the stat information. We
Expand All @@ -301,92 +307,125 @@ reader =
-- /Internal/
--
{-# INLINE eitherReader #-}
eitherReader :: (MonadIO m, MonadCatch m) => Unfold m FilePath (Either FilePath FilePath)
eitherReader = UF.mapM2 classify reader

where

classify dir x = do
r <- liftIO $ Dir.doesDirectoryExist (dir ++ "/" ++ x)
return $ if r then Left x else Right x
eitherReader :: (MonadIO m, MonadCatch m) =>
Unfold m Path (Either Path Path)
eitherReader =
-- XXX The measured overhead of bracketIO is not noticeable, if it turns
-- out to be a problems for small filenames we can use getdents64 to use
-- chunked read to avoid the overhead.
UF.bracketIO openDirStream closeDirStream streamEitherReader

{-# INLINE eitherReaderPaths #-}
eitherReaderPaths ::(MonadIO m, MonadCatch m) => Unfold m FilePath (Either FilePath FilePath)
eitherReaderPaths ::(MonadIO m, MonadCatch m) => Unfold m Path (Either Path Path)
eitherReaderPaths =
UF.mapM2 (\dir -> return . bimap (dir </>) (dir </>)) eitherReader
let (</>) = Path.extendPath
in UF.mapM2 (\dir -> return . bimap (dir </>) (dir </>)) eitherReader

--
-- | Read files only.
--
-- /Internal/
--
{-# INLINE fileReader #-}
fileReader :: (MonadIO m, MonadCatch m) => Unfold m FilePath FilePath
fileReader :: (MonadIO m, MonadCatch m) => Unfold m Path Path
fileReader = fmap (fromRight undefined) $ UF.filter isRight eitherReader

-- | Read directories only. Filter out "." and ".." entries.
--
-- /Internal/
--
{-# INLINE dirReader #-}
dirReader :: (MonadIO m, MonadCatch m) => Unfold m FilePath FilePath
dirReader :: (MonadIO m, MonadCatch m) => Unfold m Path Path
dirReader = fmap (fromLeft undefined) $ UF.filter isLeft eitherReader

-- | Raw read of a directory.
--
-- /Pre-release/
{-# INLINE read #-}
read :: (MonadIO m, MonadCatch m) => FilePath -> Stream m FilePath
read :: (MonadIO m, MonadCatch m) => Path -> Stream m Path
read = S.unfold reader

{-# DEPRECATED toStream "Please use 'read' instead" #-}
{-# INLINE toStream #-}
toStream :: (MonadIO m, MonadCatch m) => String -> Stream m String
toStream :: (MonadIO m, MonadCatch m) => Path -> Stream m Path
toStream = read

-- | Read directories as Left and files as Right. Filter out "." and ".."
-- entries. The output contains the names of the directories and files.
--
-- /Pre-release/
{-# INLINE readEither #-}
readEither :: (MonadIO m, MonadCatch m) => FilePath -> Stream m (Either FilePath FilePath)
readEither :: (MonadIO m, MonadCatch m) => Path -> Stream m (Either Path Path)
readEither = S.unfold eitherReader

-- | Like 'readEither' but prefix the names of the files and directories with
-- the supplied directory path.
{-# INLINE readEitherPaths #-}
readEitherPaths :: (MonadIO m, MonadCatch m) => FilePath -> Stream m (Either FilePath FilePath)
readEitherPaths dir = fmap (bimap (dir </>) (dir </>)) $ readEither dir
readEitherPaths :: (MonadIO m, MonadCatch m) => Path -> Stream m (Either Path Path)
readEitherPaths dir =
let (</>) = Path.extendPath
in fmap (bimap (dir </>) (dir </>)) $ readEither dir

-- XXX For a fast custom implementation of traversal, the Right could be the
-- final array chunk including all files and dirs to be written to IO. The Left
-- could be list of dirs to be traversed.
{-# INLINE _readEitherChunks #-}
_readEitherChunks :: (MonadIO m, MonadCatch m) => [Path] -> Stream m (Either [Path] [Path])
_readEitherChunks dirs =
-- XXX Need to use a take to limit the group size. There will be separate
-- limits for dir and files groups.
S.groupsWhile grouper collector
$ S.unfoldMany eitherReaderPaths
$ S.fromList dirs

where

-- XXX We can use a refold "Either dirs files" and yield the one that fills
-- and pass the remainder to the next Refold.
grouper first next =
case first of
Left _ -> isLeft next
Right _ -> isRight next

collector = Fold.foldl' step (Right [])

step b x =
case x of
Left x1 ->
case b of
Right _ -> Left [x1] -- initial
_ -> either (\xs -> Left (x1:xs)) Right b
Right x1 -> fmap (x1:) b

{-# DEPRECATED toEither "Please use 'readEither' instead" #-}
{-# INLINE toEither #-}
toEither :: (MonadIO m, MonadCatch m) => FilePath -> Stream m (Either FilePath FilePath)
toEither :: (MonadIO m, MonadCatch m) => Path -> Stream m (Either Path Path)
toEither = readEither

-- | Read files only.
--
-- /Internal/
--
{-# INLINE readFiles #-}
readFiles :: (MonadIO m, MonadCatch m) => FilePath -> Stream m FilePath
readFiles :: (MonadIO m, MonadCatch m) => Path -> Stream m Path
readFiles = S.unfold fileReader

{-# DEPRECATED toFiles "Please use 'readFiles' instead" #-}
{-# INLINE toFiles #-}
toFiles :: (MonadIO m, MonadCatch m) => FilePath -> Stream m FilePath
toFiles :: (MonadIO m, MonadCatch m) => Path -> Stream m Path
toFiles = readFiles

-- | Read directories only.
--
-- /Internal/
--
{-# INLINE readDirs #-}
readDirs :: (MonadIO m, MonadCatch m) => FilePath -> Stream m FilePath
readDirs :: (MonadIO m, MonadCatch m) => Path -> Stream m Path
readDirs = S.unfold dirReader

{-# DEPRECATED toDirs "Please use 'readDirs' instead" #-}
{-# INLINE toDirs #-}
toDirs :: (MonadIO m, MonadCatch m) => String -> Stream m String
toDirs :: (MonadIO m, MonadCatch m) => Path -> Stream m Path
toDirs = readDirs

{-
Expand Down
2 changes: 2 additions & 0 deletions core/src/Streamly/Internal/FileSystem/Path.hs
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,8 @@ isSeparator = (== '/')
-- components in which case we cannot distinguish an absolute path from
-- relative.

-- XXX This can be generalized to an Array intersperse operation

-- | Like 'extendDir' but for the less restrictive 'Path' type which will always
-- create a syntactically valid 'Path' type but it may not be semantically valid
-- because we may append an absolute path or we may append to a file path.
Expand Down