diff --git a/src/Streamly/Internal/Unicode/Utf8/Eliminate.hs b/src/Streamly/Internal/Unicode/Utf8/Eliminate.hs index f8b570313b..ae134d8aeb 100644 --- a/src/Streamly/Internal/Unicode/Utf8/Eliminate.hs +++ b/src/Streamly/Internal/Unicode/Utf8/Eliminate.hs @@ -329,6 +329,7 @@ _isInfixOf a b = unsafePerformIO (Stream.isInfixOf (toStream a) (toStream b)) -- View patterns -------------------------------------------------------------------------------- +-- XXX Change >> to >>> once exposed -- | Return the suffix of the second string if its prefix -- matches the entire first string. -- @@ -381,6 +382,7 @@ _stripPrefix p t = commonPrefixes :: Utf8 -> Utf8 -> Maybe (Utf8,Utf8,Utf8) commonPrefixes = undefined +-- XXX Change >> to >>> once exposed -- | Return the prefix of the second string if its suffix -- matches the entire first string. -- diff --git a/src/Streamly/Internal/Unicode/Utf8/Type.hs b/src/Streamly/Internal/Unicode/Utf8/Type.hs index d7a59efd9f..684229e214 100644 --- a/src/Streamly/Internal/Unicode/Utf8/Type.hs +++ b/src/Streamly/Internal/Unicode/Utf8/Type.hs @@ -194,6 +194,22 @@ instance Show Utf8 where -- Streamly style APIs -------------------------------------------------------------------------------- +-- XXX From the review: +-- +-- This should be implemented as an Unfold m Char Word8 composed with the input +-- of Array.write. For that we would need to implement unfoldMany for folds: +-- +-- > unfoldMany :: Unfold m a b -> Fold m b c -> Fold m a c +-- +-- If we assume the argument fold to be non-terminating then it should be easy +-- to implement. That is, do not handle the done case; just error out in the +-- done case. +-- +-- Once we have that then we can use: +-- +-- > writeGeneric = Fold.unfoldMany readCharUtf8 A.write +-- +-- For readCharUtf8 see https://github.com/composewell/streamly/pull/1055/files {-# INLINE writeGeneric #-} writeGeneric :: forall m. 
MonadIO m => Unicode.InvalidAction -> Fold m Char Utf8 writeGeneric act = Fold.Fold step initial (return . Utf8 . Array.unsafeFreeze) @@ -322,6 +338,15 @@ read = undefined singleton :: Char -> Utf8 singleton x = pack [x] +-- XXX From the review: +-- +-- StreamD cons would be better here. We should also add a caveat that this +-- function should not be used to build a big array, i.e. you should not use +-- foldr cons empty, as that would perform badly with StreamD cons. But a single +-- operation like x cons xs would work much better with StreamD cons compared +-- to regular cons. +-- +-- You can also memcpy if that turns out to be faster than stream. -- | Adds a character to the front of a 'Utf8'. This function is more -- costly than its 'List' counterpart because it requires copying a new array. -- Performs replacement on invalid scalar values. @@ -356,6 +381,14 @@ append (Utf8 a) (Utf8 b) = Utf8 $ unsafePerformIO $ Array.splice a b head :: Utf8 -> Maybe Char head = unsafePerformIO . Stream.head . toStream +-- XXX From the review: +-- +-- We can use a length fold and a single char decoding fold in parallel on the +-- stream. Then we can use an array slice to get the tail of the array using the +-- length returned by the length fold. +-- +-- Alternatively, we could get the head char, find its encoded length and use +-- that to slice the array. -- | Returns the first character and rest of a 'Utf8', or 'Nothing' if -- empty. -- @@ -378,8 +411,15 @@ last = undefined -- /Time complexity:/ O(1) {-# INLINE_NORMAL tail #-} tail :: Utf8 -> Maybe Utf8 -tail = fmap fromStream . unsafePerformIO . Stream.tail . toStream +tail = fmap snd . uncons +-- XXX From the review +-- +-- If we can write a routine to decode utf8 in reverse then we can just decode +-- the last char from the end of the array and then slice it. +-- +-- Otherwise, use last on the stream, get the encoded length of the last char +-- and use that to slice it. 
-- | Returns all but the last character of a 'Utf8', or 'Nothing' if -- empty. -- @@ -410,6 +450,11 @@ null = Array.null . toArray isSingleton :: Utf8 -> Bool isSingleton = undefined +-- XXX From the review +-- +-- We could possibly determine the length faster by using a custom routine that +-- counts the starting byte of each char in the utf8 encoded bytes without +-- decoding the chars. -- | Returns the number of characters in a 'Utf8'. -- -- /Time complexity:/ O(n)