diff --git a/src/Streamly/Internal/Data/Array/Foreign.hs b/src/Streamly/Internal/Data/Array/Foreign.hs index e6e7f9746b..403fce33c3 100644 --- a/src/Streamly/Internal/Data/Array/Foreign.hs +++ b/src/Streamly/Internal/Data/Array/Foreign.hs @@ -33,6 +33,7 @@ module Streamly.Internal.Data.Array.Foreign Array -- * Construction + , A.nil -- Pure, From Static Memory (Unsafe) -- We can use fromPtrM#, fromCStringM# and fromAddrM# to create arrays from diff --git a/src/Streamly/Internal/Data/Array/Foreign/Type.hs b/src/Streamly/Internal/Data/Array/Foreign/Type.hs index 6e967f96f9..dfecbd72ca 100644 --- a/src/Streamly/Internal/Data/Array/Foreign/Type.hs +++ b/src/Streamly/Internal/Data/Array/Foreign/Type.hs @@ -21,6 +21,7 @@ module Streamly.Internal.Data.Array.Foreign.Type , unsafeThaw -- * Construction + , nil , splice , fromPtr diff --git a/src/Streamly/Internal/Unicode/Utf8.hs b/src/Streamly/Internal/Unicode/Utf8.hs index 420751ce71..9adc0f51f9 100644 --- a/src/Streamly/Internal/Unicode/Utf8.hs +++ b/src/Streamly/Internal/Unicode/Utf8.hs @@ -6,62 +6,160 @@ -- Stability : experimental -- Portability : GHC -- +-- This module mimics the API of the text package. Some documentation snippets +-- may have been taken from the text package. module Streamly.Internal.Unicode.Utf8 ( - -- * Type - Utf8 - -- * Creation and elimination - , pack - , unpack - , toArray + module Streamly.Internal.Unicode.Utf8.Type + , module Streamly.Internal.Unicode.Utf8.Transform + , module Streamly.Internal.Unicode.Utf8.Eliminate + , module Streamly.Internal.Unicode.Utf8.Generate + , module Streamly.Internal.Unicode.Utf8.Reduce + + -- * Folds + + -- ** Special folds + , concat + , concatMap + + -- * Substrings + + -- ** Breaking into lines and words + , lines + --, lines' + , words + , unlines + , unwords + + -- * Zipping + , zip + , zipWith + + -- -* Ordered + -- , sort + + -- -- * Low level operations + -- , copy + -- , unpackCString# + ) where +#include "inline.hs" + -------------------------------------------------------------------------------- -- Imports -------------------------------------------------------------------------------- -import Control.DeepSeq (NFData) -import Data.Word (Word8) -import Streamly.Internal.Data.Array.Foreign.Type (Array) +import Data.Char (isSpace) import System.IO.Unsafe (unsafePerformIO) -import qualified Streamly.Internal.Data.Array.Foreign as Array +import qualified Data.List as List import qualified Streamly.Internal.Data.Stream.IsStream as Stream -import qualified Streamly.Internal.Unicode.Stream as Unicode + +import Streamly.Internal.Unicode.Utf8.Type +import Streamly.Internal.Unicode.Utf8.Transform +import Streamly.Internal.Unicode.Utf8.Eliminate +import Streamly.Internal.Unicode.Utf8.Generate +import Streamly.Internal.Unicode.Utf8.Reduce + +import Prelude hiding + ( concat + , concatMap + , foldr + , lines + , null + , unlines + , unwords + , words + , zip + , zipWith + ) + +-- $setup +-- >>> :set -XOverloadedStrings +-- >>> import qualified Streamly.Internal.Unicode.Utf8 as Utf8 -------------------------------------------------------------------------------- --- Type +-- Special folds -------------------------------------------------------------------------------- --- | A space efficient, packed, unboxed Unicode container. -newtype Utf8 = - Utf8 (Array Word8) - deriving (NFData) +-- XXX We should write these APIs generalized on Array a and then just use those +-- for the Utf8 type. The generalized APIs would be more useful, they can go in +-- the Array module itself and can be used generally for arrays, you won't need +-- to transform arrays into stream and then back for such common operations. +-- | Concatenate a list of 'Utf8's. +-- +-- /Time complexity:/ O(n) +{-# INLINE concat #-} +concat :: [Utf8] -> Utf8 +concat ts = + case Prelude.filter (not . null) ts of + [] -> empty + [t] -> t + xs -> Prelude.foldl1 append xs + +-- | Map a function over a 'Utf8' that results in a 'Utf8', and +-- concatenate the results. +-- +-- /Time complexity:/ O(n) +{-# INLINE concatMap #-} +concatMap :: (Char -> Utf8) -> Utf8 -> Utf8 +concatMap f = concat . foldr ((:) . f) [] -------------------------------------------------------------------------------- --- Functions +-- Zipping -------------------------------------------------------------------------------- -{-# INLINE toArray #-} -toArray :: Utf8 -> Array Word8 -toArray (Utf8 arr) = arr +-- | 'zip' takes two 'Utf8's and returns a list of +-- corresponding pairs of bytes. If one input 'Utf8' is short, +-- excess elements of the longer 'Utf8' are discarded. This is +-- equivalent to a pair of 'unpack' operations. +-- +-- /Time complexity:/ O(n) +{-# INLINE zip #-} +zip :: Utf8 -> Utf8 -> [(Char,Char)] +zip a b = + unsafePerformIO + $ Stream.toList $ Stream.zipWith (,) (toStream a) (toStream b) +-- | 'zipWith' generalises 'zip' by zipping with the function +-- given as the first argument, instead of a tupling function. +-- Performs replacement on invalid scalar values. +-- +-- /Time complexity:/ O(n) +{-# INLINE zipWith #-} +zipWith :: (Char -> Char -> Char) -> Utf8 -> Utf8 -> Utf8 +zipWith f a b = fromStream (Stream.zipWith f (toStream a) (toStream b)) -{-# INLINEABLE pack #-} -pack :: String -> Utf8 -pack s = - Utf8 - $ unsafePerformIO - $ Array.fromStreamN len $ Unicode.encodeUtf8' $ Stream.fromList s +-- | Breaks a 'Utf8' up into a list of words, delimited by 'Char's +-- representing white space. +-- +-- /Time complexity:/ O(n) +{-# INLINE words #-} +words :: Utf8 -> [Utf8] +words = split isSpace - where +-- | Breaks a 'Utf8' up into a list of 'Utf8's at +-- newline 'Char's. The resulting strings do not contain newlines. +-- +-- /Time complexity:/ O(n) +{-# INLINE lines #-} +lines :: Utf8 -> [Utf8] +lines = split (== '\n') - len = length s +-- | Joins lines, after appending a terminating newline to +-- each. +-- +-- /Time complexity:/ O(n) +{-# INLINE unlines #-} +unlines :: [Utf8] -> Utf8 +unlines = concat . List.map (`snoc` '\n') -{-# INLINEABLE unpack #-} -unpack :: Utf8 -> String -unpack u = - unsafePerformIO - $ Stream.toList $ Unicode.decodeUtf8' $ Array.toStream $ toArray u +-- | Joins words using single space characters. +-- +-- /Time complexity:/ O(n) +{-# INLINE unwords #-} +unwords :: [Utf8] -> Utf8 +unwords = intercalate (singleton ' ') diff --git a/src/Streamly/Internal/Unicode/Utf8/Eliminate.hs b/src/Streamly/Internal/Unicode/Utf8/Eliminate.hs new file mode 100644 index 0000000000..9f84c91707 --- /dev/null +++ b/src/Streamly/Internal/Unicode/Utf8/Eliminate.hs @@ -0,0 +1,368 @@ +-- | +-- Module : Streamly.Internal.Unicode.Utf8.Eliminate +-- Copyright : (c) 2021 Composewell Technologies +-- License : BSD-3-Clause +-- Maintainer : streamly@composewell.com +-- Stability : experimental +-- Portability : GHC +-- +module Streamly.Internal.Unicode.Utf8.Eliminate + ( + -- * Folds + fold + , foldl + , foldl' + , foldl1 + , foldl1' + , foldr + , foldr1 + + -- ** Special folds + , any + , all + , maximum + , minimum + + -- * Predicates + + -- XXX Comment our further stuff to help with the GHCJS compilation + -- , isPrefixOf + -- , isSuffixOf + -- XXX isInfixOf takes too much memory to compile + -- , isInfixOf + + -- ** View patterns + + -- XXX Comment our further stuff to help with the GHCJS compilation + -- , stripPrefix + -- , stripSuffix + , commonPrefixes + + -- * Searching + , find + , elem + + -- * Indexing + -- $index + , index + , findIndex + , countChar + , count + ) +where + +#include "inline.hs" + +-------------------------------------------------------------------------------- +-- Imports +-------------------------------------------------------------------------------- + +import Control.Monad.IO.Class (MonadIO(..)) +import Streamly.Internal.Data.Fold (Fold) +import Streamly.Internal.Data.Stream.IsStream.Lift (hoist) +import System.IO.Unsafe (unsafePerformIO) + +import qualified Streamly.Internal.Data.Array.Foreign.Type as Array +import qualified Streamly.Internal.Data.Stream.IsStream as Stream + +import Streamly.Internal.Unicode.Utf8.Type + +import Prelude hiding + ( all + , any + , elem + , foldl + , foldl1 + , foldr + , foldr1 + , maximum + , minimum + ) + +-- $setup +-- >>> :set -XOverloadedStrings +-- >>> import qualified Streamly.Internal.Unicode.Utf8 as Utf8 + +-------------------------------------------------------------------------------- +-- Reducing Streams (folds) +-------------------------------------------------------------------------------- + +-- XXX Try removing MonadIO constraint here +-- +-- See this: https://github.com/composewell/streamly/issues/1457 +-- We can probably work on remving the MonadIO constraint after. +-- +{-# INLINE fold #-} +fold :: MonadIO m => Fold m Char b -> Utf8 -> m b +fold f = Stream.fold f . hoist liftIO . toStream + +-- | 'foldl', applied to a binary operator, a starting value +-- (typically the left-identity of the operator), and a 'Utf8', +-- +-- /Time complexity:/ O(n) +-- reduces the 'Utf8' using the binary operator, from left to right. +-- +-- /Unimplemented/ +{-# INLINE foldl #-} +foldl :: (a -> Char -> a) -> a -> Utf8 -> a +foldl = undefined + +-- XXX Try using Identity monad instead of IO +-- +-- See this: https://github.com/composewell/streamly/issues/1457 +-- We can probably work on remving the MonadIO constraint after. +-- +-- | A strict version of 'foldl'. +-- +-- /Time complexity:/ O(n) +{-# INLINE foldl' #-} +foldl' :: (a -> Char -> a) -> a -> Utf8 -> a +foldl' f z t = unsafePerformIO $ Stream.foldl' f z (toStream t) + +-- | A variant of 'foldl' that has no starting value argument. Returns +-- 'Nothing' if applied to an empty 'Utf8'. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE foldl1 #-} +foldl1 :: (Char -> Char -> Char) -> Utf8 -> Char +foldl1 = undefined + +-- | A strict version of 'foldl1'. +-- +-- /Time complexity:/ O(n) +{-# INLINE foldl1' #-} +foldl1' :: (Char -> Char -> Char) -> Utf8 -> Maybe Char +foldl1' f t = unsafePerformIO $ Stream.foldl1' f (toStream t) + +-- | 'foldr', applied to a binary operator, a starting value +-- (typically the right-identity of the operator), and a 'Utf8', +-- reduces the 'Utf8' using the binary operator, from right to left. +-- +-- /Time complexity:/ O(n) +{-# INLINE foldr #-} +foldr :: (Char -> a -> a) -> a -> Utf8 -> a +foldr f z t = unsafePerformIO $ Stream.foldr f z (toStream t) + +-- | A variant of 'foldr' that has no starting value argument. Returns +-- 'Nothing' if applied to an empty 'Utf8'. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE foldr1 #-} +foldr1 :: (Char -> Char -> Char) -> Utf8 -> Maybe Char +foldr1 = undefined + +-------------------------------------------------------------------------------- +-- Special folds +-------------------------------------------------------------------------------- + +-- | 'any' @p@ @t@ determines whether any character in the +-- 'Utf8' @t@ satisfies the predicate @p@. +-- +-- /Time complexity:/ O(n) +{-# INLINE any #-} +any :: (Char -> Bool) -> Utf8 -> Bool +any p t = unsafePerformIO $ Stream.any p (toStream t) + +-- | 'all' @p@ @t@ determines whether all characters in the +-- 'Utf8' @t@ satisfy the predicate @p@. +-- +-- /Time complexity:/ O(n) +{-# INLINE all #-} +all :: (Char -> Bool) -> Utf8 -> Bool +all p t = unsafePerformIO $ Stream.all p (toStream t) + +-- | 'maximum' returns the maximum value from a 'Utf8', or 'Nothing' if +-- empty. +-- +-- /Time complexity:/ O(n) +{-# INLINE maximum #-} +maximum :: Utf8 -> Maybe Char +maximum t = unsafePerformIO $ Stream.maximum (toStream t) + +-- | 'minimum' returns the minimum value from a 'Utf8', or 'Nothing' if +-- empty. +-- +-- /Time complexity:/ O(n) +{-# INLINE minimum #-} +minimum :: Utf8 -> Maybe Char +minimum t = unsafePerformIO $ Stream.minimum (toStream t) + +-------------------------------------------------------------------------------- +-- Indexing 'Utf8's +-------------------------------------------------------------------------------- + +-- | 'Utf8' index (subscript) operator, starting from 0. +-- +-- /Time complexity:/ O(n) +{-# INLINE index #-} +index :: Utf8 -> Int -> Maybe Char +index t n = unsafePerformIO $ (Stream.!!) (toStream t) n + +-- | The 'findIndex' function takes a predicate and a 'Utf8' +-- and returns the index of the first element in the 'Utf8' satisfying +-- the predicate. +-- +-- /Time complexity:/ O(n) +{-# INLINE findIndex #-} +findIndex :: (Char -> Bool) -> Utf8 -> Maybe Int +findIndex p t = unsafePerformIO $ Stream.findIndex p (toStream t) + +-- | The 'count' function returns the number of times the +-- query string appears in the given 'Utf8'. An empty query string is +-- invalid, and will cause an error to be raised. +-- +-- /Time complexity:/ O(n*m) +-- +-- /Unimplemented/ +{-# INLINE_NORMAL count #-} +count :: Utf8 -> Utf8 -> Int +count = undefined + +-- | The 'countChar' function returns the number of times the +-- query element appears in the given 'Utf8'. +-- +-- /Time complexity:/ O(n) +{-# INLINE countChar #-} +countChar :: Char -> Utf8 -> Int +countChar c t = + unsafePerformIO $ Stream.length $ Stream.filter (== c) (toStream t) + +-------------------------------------------------------------------------------- +-- Searching +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +-- Searching with a predicate +-------------------------------------------------------------------------------- + +-- | The 'elem' function takes a character and a 'Utf8', and +-- returns 'True' if the element is found in the given 'Utf8', or +-- 'False' otherwise. +-- +-- /Time complexity:/ O(n) +{-# INLINE elem #-} +elem :: Char -> Utf8 -> Bool +elem c = any (== c) + +-- | The 'find' function takes a predicate and a 'Utf8', and +-- returns the first element matching the predicate, or 'Nothing' if +-- there is no such element. +-- +-- /Time complexity:/ O(n) +{-# INLINE find #-} +find :: (Char -> Bool) -> Utf8 -> Maybe Char +find p t = unsafePerformIO $ Stream.find p (toStream t) + +-------------------------------------------------------------------------------- +-- Predicates +-------------------------------------------------------------------------------- + +-- XXX This function isn't exported as it takes too much time to compile. +-- Need to investigate. +-- | The 'isPrefixOf' function takes two 'Utf8's and returns +-- 'True' iff the first is a prefix of the second. +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL _isPrefixOf #-} +_isPrefixOf :: Utf8 -> Utf8 -> Bool +_isPrefixOf a b = + Array.byteLength (toArray a) <= Array.byteLength (toArray b) + && unsafePerformIO (Stream.isPrefixOf (toStream a) (toStream b)) + +-- XXX This function isn't exported as it takes too much time to compile. +-- Need to investigate. +-- | The 'isSuffixOf' function takes two 'Utf8's and returns +-- 'True' iff the first is a suffix of the second. +-- +-- /Time complexity:/ O(n) +{-# INLINE _isSuffixOf #-} +_isSuffixOf :: Utf8 -> Utf8 -> Bool +_isSuffixOf a b = unsafePerformIO (Stream.isSuffixOf (toStream a) (toStream b)) + +-- XXX This specific API uses a lot of memory to compile +-- XXX Use domain specific knowledge to implement it efficiently! +-- | The 'isInfixOf' function takes two 'Utf8's and returns +-- 'True' iff the first is contained, wholly and intact, anywhere +-- within the second. +-- +-- In (unlikely) bad cases, this function's time complexity degrades +-- towards /O(n*m)/. +-- +-- /Time complexity:/ O(n+m) +{-# INLINE_NORMAL _isInfixOf #-} +_isInfixOf :: Utf8 -> Utf8 -> Bool +_isInfixOf a b = unsafePerformIO (Stream.isInfixOf (toStream a) (toStream b)) + +-------------------------------------------------------------------------------- +-- View patterns +-------------------------------------------------------------------------------- + +-- XXX Change >> to >>> once exposed +-- | Return the suffix of the second string if its prefix +-- matches the entire first string. +-- +-- Examples: +-- +-- >> stripPrefix "foo" "foobar" +-- Just "bar" +-- +-- >> stripPrefix "" "baz" +-- Just "baz" +-- +-- >> stripPrefix "foo" "quux" +-- Nothing +-- +-- /Time complexity:/ O(n) +_stripPrefix :: Utf8 -> Utf8 -> Maybe Utf8 +_stripPrefix p t = + fmap fromStream $ unsafePerformIO $ Stream.stripPrefix (toStream p) (toStream t) + +-- XXX Change >> to >>> after implementation +-- | Find the longest non-empty common prefix of two strings +-- and return it, along with the suffixes of each string at which they +-- no longer match. +-- +-- If the strings do not have a common prefix or either one is empty, +-- this function returns 'Nothing'. +-- +-- Examples: +-- +-- >> commonPrefixes "foobar" "fooquux" +-- Just ("foo","bar","quux") +-- +-- >> commonPrefixes "veeble" "fetzer" +-- Nothing +-- +-- >> commonPrefixes "" "baz" +-- Nothing +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +commonPrefixes :: Utf8 -> Utf8 -> Maybe (Utf8, Utf8, Utf8) +commonPrefixes = undefined + +-- XXX Change >> to >>> once exposed +-- | Return the prefix of the second string if its suffix +-- matches the entire first string. +-- +-- Examples: +-- +-- >> stripSuffix "bar" "foobar" +-- Just "foo" +-- +-- >> stripSuffix "" "baz" +-- Just "baz" +-- +-- >> stripSuffix "foo" "quux" +-- Nothing +-- +-- /Time complexity:/ O(n) +_stripSuffix :: Utf8 -> Utf8 -> Maybe Utf8 +_stripSuffix p t = + fmap fromStream + $ unsafePerformIO $ Stream.stripSuffix (toStream p) (toStream t) diff --git a/src/Streamly/Internal/Unicode/Utf8/Generate.hs b/src/Streamly/Internal/Unicode/Utf8/Generate.hs new file mode 100644 index 0000000000..5b124cb8d3 --- /dev/null +++ b/src/Streamly/Internal/Unicode/Utf8/Generate.hs @@ -0,0 +1,114 @@ +-- | +-- Module : Streamly.Internal.Unicode.Utf8.Generate +-- Copyright : (c) 2021 Composewell Technologies +-- License : BSD-3-Clause +-- Maintainer : streamly@composewell.com +-- Stability : experimental +-- Portability : GHC +-- +module Streamly.Internal.Unicode.Utf8.Generate + ( + -- ** Accumulating maps + mapAccumL + , mapAccumR + + -- ** Generation and unfolding + , replicateChar + , replicate + , unfoldr + , unfoldrN + ) +where + +#include "inline.hs" + +-------------------------------------------------------------------------------- +-- Imports +-------------------------------------------------------------------------------- + +import qualified Streamly.Internal.Data.Stream.IsStream as Stream + +import Streamly.Internal.Unicode.Utf8.Type + +import Prelude hiding (replicate) + +-- $setup +-- >>> :set -XOverloadedStrings +-- >>> import qualified Streamly.Internal.Unicode.Utf8 as Utf8 + +-------------------------------------------------------------------------------- +-- Building 'Utf8's +-------------------------------------------------------------------------------- + +-- | Like a combination of 'map' and 'foldl''. Applies a +-- function to each element of a 'Utf8', passing an accumulating +-- parameter from left to right, and returns a final 'Utf8'. Performs +-- replacement on invalid scalar values. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE mapAccumL #-} +mapAccumL :: (a -> Char -> (a,Char)) -> a -> Utf8 -> (a, Utf8) +mapAccumL = undefined + +-- | The 'mapAccumR' function behaves like a combination of 'map' and +-- a strict 'foldr'; it applies a function to each element of a +-- 'Utf8', passing an accumulating parameter from right to left, and +-- returning a final value of this accumulator together with the new +-- 'Utf8'. +-- Performs replacement on invalid scalar values. +-- +-- /Unimplemented/ +{-# INLINE mapAccumR #-} +mapAccumR :: (a -> Char -> (a,Char)) -> a -> Utf8 -> (a, Utf8) +mapAccumR = undefined + +-------------------------------------------------------------------------------- +-- Generating and unfolding 'Utf8's +-------------------------------------------------------------------------------- + +-- | 'replicate' @n@ @t@ is a 'Utf8' consisting of the input +-- @t@ repeated @n@ times. +-- +-- /Time complexity:/ O(n*m) +-- +-- /Unimplemented/ +{-# INLINE replicate #-} +replicate :: Int -> Utf8 -> Utf8 +replicate = undefined + +-- | 'replicateChar' @n@ @c@ is a 'Utf8' of length @n@ with @c@ the +-- value of every element. +-- +-- /Time complexity:/ O(n) +{-# INLINE replicateChar #-} +replicateChar :: Int -> Char -> Utf8 +replicateChar n c = fromStream (Stream.replicate n c) + +-- | The 'unfoldr' +-- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a +-- 'Utf8' from a seed value. The function takes the element and +-- returns 'Nothing' if it is done producing the 'Utf8', otherwise +-- 'Just' @(a,b)@. In this case, @a@ is the next 'Char' in the +-- string, and @b@ is the seed value for further production. +-- Performs replacemsent on invalid scalar values. +-- +-- /Time complexity:/ O(n), where @n@ is the length of the result. +{-# INLINE unfoldr #-} +unfoldr :: (a -> Maybe (Char, a)) -> a -> Utf8 +unfoldr f s = fromStream (Stream.unfoldr f s) + +-- | Like 'unfoldr', 'unfoldrN' builds a 'Utf8' from a seed +-- value. However, the length of the result should be limited by the +-- first argument to 'unfoldrN'. This function is more efficient than +-- 'unfoldr' when the maximum length of the result is known and +-- correct, otherwise its performance is similar to 'unfoldr'. +-- Performs replacement on invalid scalar values. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE unfoldrN #-} +unfoldrN :: Int -> (a -> Maybe (Char, a)) -> a -> Utf8 +unfoldrN = undefined diff --git a/src/Streamly/Internal/Unicode/Utf8/Reduce.hs b/src/Streamly/Internal/Unicode/Utf8/Reduce.hs new file mode 100644 index 0000000000..0310f95629 --- /dev/null +++ b/src/Streamly/Internal/Unicode/Utf8/Reduce.hs @@ -0,0 +1,285 @@ +-- | +-- Module : Streamly.Internal.Unicode.Utf8.Reduce +-- Copyright : (c) 2021 Composewell Technologies +-- License : BSD-3-Clause +-- Maintainer : streamly@composewell.com +-- Stability : experimental +-- Portability : GHC +-- +module Streamly.Internal.Unicode.Utf8.Reduce + ( + -- * Substrings + + -- ** Breaking strings + splitAt + , breakOn + , breakOnEnd + , break + , span + , group + , groupBy + , inits + , tails + + -- ** Breaking into many substrings + -- $split + , splitOn + , split + , chunksOf + + -- * Searching + , breakOnAll + , partition + ) +where + +#include "inline.hs" + +-------------------------------------------------------------------------------- +-- Imports +-------------------------------------------------------------------------------- + +import System.IO.Unsafe (unsafePerformIO) + +import qualified Streamly.Internal.Data.Fold as Fold +import qualified Streamly.Internal.Data.Stream.IsStream as Stream + +import Streamly.Internal.Unicode.Utf8.Type + +import Prelude hiding (break, span, splitAt) + +-- $setup +-- >>> :set -XOverloadedStrings +-- >>> import qualified Streamly.Internal.Unicode.Utf8 as Utf8 + +-------------------------------------------------------------------------------- +-- Substrings +-------------------------------------------------------------------------------- + +-- | 'splitAt' @n t@ returns a pair whose first element is a +-- prefix of @t@ of length @n@, and whose second is the remainder of +-- the string. It is equivalent to @('take' n t, 'drop' n t)@. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +splitAt :: Int -> Utf8 -> (Utf8, Utf8) +splitAt = undefined + +-- | 'span', applied to a predicate @p@ and text @t@, returns +-- a pair whose first element is the longest prefix (possibly empty) +-- of @t@ of elements that satisfy @p@, and whose second is the +-- remainder of the list. +-- +-- >> Utf8.span (=='0') "000AB" +-- ("000","AB") +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE span #-} +span :: (Char -> Bool) -> Utf8 -> (Utf8, Utf8) +span = undefined + +-- | 'break' is like 'span', but the prefix returned is +-- over elements that fail the predicate @p@. +-- +-- >> Utf8.break (=='c') "180cm" +-- ("180","cm") +-- +-- /Time complexity:/ O(n) +{-# INLINE break #-} +break :: (Char -> Bool) -> Utf8 -> (Utf8, Utf8) +break p = span (not . p) + +-- | Group characters in a string according to a predicate. +-- +-- /Time complexity:/ O(n) +groupBy :: (Char -> Char -> Bool) -> Utf8 -> [Utf8] +groupBy p = unsafePerformIO . Stream.toList . Stream.groupsBy p write . toStream + +-- | Group characters in a string by equality. +-- +-- /Time complexity:/ O(n) +group :: Utf8 -> [Utf8] +group = groupBy (==) + +-- | Return all initial segments of the given 'Utf8', shortest +-- first. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +inits :: Utf8 -> [Utf8] +inits = undefined + +-- | Return all final segments of the given 'Utf8', longest +-- first. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +tails :: Utf8 -> [Utf8] +tails = undefined + +-- XXX Add time complexity after implementation +-- | Break a 'Utf8' into pieces separated by the first 'Utf8' +-- argument (which cannot be empty), consuming the delimiter. An empty +-- delimiter is invalid, and will cause an error to be raised. +-- +-- Examples: +-- +-- >> splitOn "\r\n" "a\r\nb\r\nd\r\ne" +-- ["a","b","d","e"] +-- +-- >> splitOn "aaa" "aaaXaaaXaaaXaaa" +-- ["","X","X","X",""] +-- +-- >> splitOn "x" "x" +-- ["",""] +-- +-- and +-- +-- > intercalate s . splitOn s == id +-- > splitOn (singleton c) == split (==c) +-- +-- (Note: the string @s@ to split on above cannot be empty.) +-- +-- /Unimplemented/ +{-# INLINE_NORMAL splitOn #-} +splitOn :: Utf8 + -- ^ String to split on. If this string is empty, an error + -- will occur. + -> Utf8 + -- ^ Input text. + -> [Utf8] +splitOn = undefined +{- +splitOn pat src = + unsafePerformIO + $ Stream.toList $ Stream.splitOnSeq (toArray pat) write (stream src) +-} + +-- | Splits a 'Utf8' into components delimited by separators, +-- where the predicate returns True for a separator element. The +-- resulting components do not contain the separators. Two adjacent +-- separators result in an empty component in the output. eg. +-- +-- >>> split (=='a') "aabbaca" +-- ["","","bb","c",""] +-- +-- >>> split (=='a') "" +-- [""] +-- +-- /Time complexity:/ O(n) +{-# INLINE split #-} +split :: (Char -> Bool) -> Utf8 -> [Utf8] +split p t = + unsafePerformIO $ Stream.toList $ Stream.splitOn p write (toStream t) + +-- | Splits a 'Utf8' into components of length @k@. The last +-- element may be shorter than the other chunks, depending on the +-- length of the input. Examples: +-- +-- >>> chunksOf 3 "foobarbaz" +-- ["foo","bar","baz"] +-- +-- >>> chunksOf 4 "haskell.org" +-- ["hask","ell.","org"] +-- +-- /Time complexity:/ O(n) +{-# INLINE chunksOf #-} +chunksOf :: Int -> Utf8 -> [Utf8] +chunksOf k t = + unsafePerformIO $ Stream.toList $ Stream.chunksOf k write (toStream t) + +-- XXX Add time complexity after implementation +-- | Find the first instance of @needle@ (which must be +-- non-'null') in @haystack@. The first element of the returned tuple +-- is the prefix of @haystack@ before @needle@ is matched. The second +-- is the remainder of @haystack@, starting with the match. +-- +-- Examples: +-- +-- >> breakOn "::" "a::b::c" +-- ("a","::b::c") +-- +-- >> breakOn "/" "foobar" +-- ("foobar","") +-- +-- Laws: +-- +-- > append prefix match == haystack +-- > where (prefix, match) = breakOn needle haystack +-- +-- If you need to break a string by a substring repeatedly (e.g. you +-- want to break on every instance of a substring), use 'breakOnAll' +-- instead, as it has lower startup overhead. +-- +-- /Unimplemented/ +{-# INLINE breakOn #-} +breakOn :: Utf8 -> Utf8 -> (Utf8, Utf8) +breakOn = undefined + +-- XXX Change >> to >>> after implementation +-- | Similar to 'breakOn', but searches from the end of the +-- string. +-- +-- The first element of the returned tuple is the prefix of @haystack@ +-- up to and including the last match of @needle@. The second is the +-- remainder of @haystack@, following the match. +-- +-- >> breakOnEnd "::" "a::b::c" +-- ("a::b::","c") +-- +-- /Time complexity:/ O(n+m) +-- +-- /Unimplemented/ +{-# INLINE breakOnEnd #-} +breakOnEnd :: Utf8 -> Utf8 -> (Utf8, Utf8) +breakOnEnd = undefined + +-------------------------------------------------------------------------------- +-- Searching +-------------------------------------------------------------------------------- + +-- | The 'partition' function takes a predicate and a 'Utf8', +-- and returns the pair of 'Utf8's with elements which do and do not +-- satisfy the predicate, respectively; i.e. +-- +-- > partition p t == (filter p t, filter (not . p) t) +-- +-- /Time complexity:/ O(n) +{-# INLINE partition #-} +partition :: (Char -> Bool) -> Utf8 -> (Utf8, Utf8) +partition p = unsafePerformIO . Stream.fold partitionFold . toStream + + where + + partitionFold = Fold.tee (Fold.filter p write) (Fold.filter (not . p) write) + +-- XXX Add time complexity after implementation +-- XXX Change >> to >>> after implementation +-- | Find all non-overlapping instances of @needle@ in +-- @haystack@. Each element of the returned list consists of a pair: +-- +-- * The entire string prior to the /k/th match (i.e. the prefix) +-- +-- * The /k/th match, followed by the remainder of the string +-- +-- Examples: +-- +-- >> breakOnAll "::" "" +-- [] +-- +-- >> breakOnAll "/" "a/b/c/" +-- [("a","/b/c/"),("a/b","/c/"),("a/b/c","/")] +-- +-- The @needle@ parameter may not be empty. +-- +-- /Unimplemented/ +{-# INLINE breakOnAll #-} +breakOnAll :: Utf8 -- ^ @needle@ to search for + -> Utf8 -- ^ @haystack@ in which to search + -> [(Utf8, Utf8)] +breakOnAll = undefined diff --git a/src/Streamly/Internal/Unicode/Utf8/Transform.hs b/src/Streamly/Internal/Unicode/Utf8/Transform.hs new file mode 100644 index 0000000000..dd192297e5 --- /dev/null +++ b/src/Streamly/Internal/Unicode/Utf8/Transform.hs @@ -0,0 +1,584 @@ +-- | +-- Module : Streamly.Internal.Unicode.Utf8.Transform +-- Copyright : (c) 2021 Composewell Technologies +-- License : BSD-3-Clause +-- Maintainer : streamly@composewell.com +-- Stability : experimental +-- Portability : GHC +-- +module Streamly.Internal.Unicode.Utf8.Transform + ( + -- * Transformations + map + , intercalate + , intersperse + , transpose + , reverse + , replace + + -- ** Case conversion + -- $case + , toCaseFold + , toLower + , toTitle + , toUpper + + -- ** Justification + , justifyLeft + , justifyRight + , center + + -- * Construction + + -- ** Scans + , scanl + , scanl1 + , scanl' + , scanl1' + , scanr + , scanr1 + + -- * Searching + , filter + + -- * Substrings + + -- ** Breaking strings + , take + , takeEnd + , drop + , dropEnd + , takeWhile + , takeWhileEnd + , dropWhile + , dropWhileEnd + , dropAround + , strip + , stripStart + , stripEnd + ) +where + +#include "inline.hs" + +-------------------------------------------------------------------------------- +-- Imports +-------------------------------------------------------------------------------- + +import Data.Char (isSpace) + +import qualified Data.List as List +import qualified Prelude +import qualified Streamly.Internal.Data.Stream.IsStream as Stream + +import Streamly.Internal.Unicode.Utf8.Type + +import Prelude hiding + ( drop + , dropWhile + , filter + , map + , reverse + , scanl + , scanl1 + , scanr + , scanr1 + , take + , takeWhile + ) + +-- $setup +-- >>> :set -XOverloadedStrings +-- >>> import qualified Streamly.Internal.Unicode.Utf8 as Utf8 + +-------------------------------------------------------------------------------- +-- Transformations +-------------------------------------------------------------------------------- + +-- | 'map' @f@ @t@ is the 'Utf8' obtained by applying @f@ to +-- each element of @t@. +-- +-- Example: +-- +-- >>> let message = Utf8.pack "I am not angry. Not at all." +-- >>> Utf8.map (\c -> if c == '.' then '!' else c) message +-- "I am not angry! Not at all!" +-- +-- Performs replacement on invalid scalar values. +-- +-- /Time complexity:/ O(n) +{-# INLINE map #-} +map :: (Char -> Char) -> Utf8 -> Utf8 +map f = fromStream . Stream.map f . toStream + +-- XXX Change >> to >>> after implementation +-- | The 'intercalate' function takes a 'Utf8' and a list of +-- 'Utf8's and concatenates the list after interspersing the first +-- argument between each element of the list. +-- +-- Example: +-- +-- >> Utf8.intercalate "NI!" ["We", "seek", "the", "Holy", "Grail"] +-- "WeNI!seekNI!theNI!HolyNI!Grail" +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE intercalate #-} +intercalate :: Utf8 -> [Utf8] -> Utf8 +intercalate = undefined + +-- XXX Change >> to >>> after implementation +-- | The 'intersperse' function takes a character and places it +-- between the characters of a 'Utf8'. +-- +-- Example: +-- +-- >> Utf8.intersperse '.' "SHIELD" +-- "S.H.I.E.L.D" +-- +-- Performs replacement on invalid scalar values. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE intersperse #-} +intersperse :: Char -> Utf8 -> Utf8 +intersperse = undefined + +-- XXX Change >> to >>> after implementation +-- | Reverse the characters of a string. +-- +-- Example: +-- +-- >> Utf8.reverse "desrever" +-- "reversed" +-- +-- +-- /Time complexity:/ O(n) +{-# INLINE reverse #-} +reverse :: Utf8 -> Utf8 +reverse = fromStream . Stream.reverse . toStream + +-- XXX Add time complexity after implementation +-- XXX Change >> to >>> after implementation +-- | Replace every non-overlapping occurrence of @needle@ in +-- @haystack@ with @replacement@. +-- +-- This function behaves as though it was defined as follows: +-- +-- @ +-- replace needle replacement haystack = +-- 'intercalate' replacement ('splitOn' needle haystack) +-- @ +-- +-- As this suggests, each occurrence is replaced exactly once. So if +-- @needle@ occurs in @replacement@, that occurrence will /not/ itself +-- be replaced recursively: +-- +-- >> replace "oo" "foo" "oo" +-- "foo" +-- +-- In cases where several instances of @needle@ overlap, only the +-- first one will be replaced: +-- +-- >> replace "ofo" "bar" "ofofo" +-- "barfo" +-- +-- /Unimplemented/ +{-# INLINE replace #-} +replace :: Utf8 + -- ^ @needle@ to search for. If this string is empty, an + -- error will occur. + -> Utf8 + -- ^ @replacement@ to replace @needle@ with. + -> Utf8 + -- ^ @haystack@ in which to search. + -> Utf8 +replace = undefined + +-------------------------------------------------------------------------------- +-- Case conversions (folds) +-------------------------------------------------------------------------------- + +-- | Convert a string to folded case. +-- +-- This function is mainly useful for performing caseless (also known +-- as case insensitive) string comparisons. +-- +-- A string @x@ is a caseless match for a string @y@ if and only if: +-- +-- @toCaseFold x == toCaseFold y@ +-- +-- The result string may be longer than the input string, and may +-- differ from applying 'toLower' to the input string. For instance, +-- the Armenian small ligature \"ﬓ\" (men now, U+FB13) is case +-- folded to the sequence \"մ\" (men, U+0574) followed by +-- \"ն\" (now, U+0576), while the Greek \"µ\" (micro sign, +-- U+00B5) is case folded to \"μ\" (small letter mu, U+03BC) +-- instead of itself. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE toCaseFold #-} +toCaseFold :: Utf8 -> Utf8 +toCaseFold = undefined + +-- | Convert a string to lower case, using simple case +-- conversion. +-- +-- The result string may be longer than the input string. For +-- instance, \"İ\" (Latin capital letter I with dot above, +-- U+0130) maps to the sequence \"i\" (Latin small letter i, U+0069) +-- followed by \" ̇\" (combining dot above, U+0307). +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE toLower #-} +toLower :: Utf8 -> Utf8 +toLower = undefined + +-- | Convert a string to upper case, using simple case +-- conversion. +-- +-- The result string may be longer than the input string. For +-- instance, the German \"ß\" (eszett, U+00DF) maps to the +-- two-letter sequence \"SS\". +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE toUpper #-} +toUpper :: Utf8 -> Utf8 +toUpper = undefined + +-- | Convert a 'Utf8' to title case, using simple case +-- conversion. +-- +-- The first letter of the input is converted to title case, as is +-- every subsequent letter that immediately follows a non-letter. +-- Every letter that immediately follows another letter is converted +-- to lower case. +-- +-- The result string may be longer than the input string. For example, +-- the Latin small ligature fl (U+FB02) is converted to the +-- sequence Latin capital letter F (U+0046) followed by Latin small +-- letter l (U+006C). +-- +-- /Note/: this function does not take language or culture specific +-- rules into account. For instance, in English, different style +-- guides disagree on whether the book name \"The Hill of the Red +-- Fox\" is correctly title cased—but this function will +-- capitalize /every/ word. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE toTitle #-} +toTitle :: Utf8 -> Utf8 +toTitle = undefined + +-- XXX Change >> to >>> after implementation +-- | Left-justify a string to the given length, using the +-- specified fill character on the right. +-- Performs replacement on invalid scalar values. +-- +-- Examples: +-- +-- >> justifyLeft 7 'x' "foo" +-- "fooxxxx" +-- +-- >> justifyLeft 3 'x' "foobar" +-- "foobar" +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE justifyLeft #-} +justifyLeft :: Int -> Char -> Utf8 -> Utf8 +justifyLeft = undefined + +-- | Right-justify a string to the given length, using the +-- specified fill character on the left. Performs replacement on +-- invalid scalar values. +-- +-- Examples: +-- +-- >> justifyRight 7 'x' "bar" +-- "xxxxbar" +-- +-- >> justifyRight 3 'x' "foobar" +-- "foobar" +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE justifyRight #-} +justifyRight :: Int -> Char -> Utf8 -> Utf8 +justifyRight = undefined + +-- XXX Change >> to >>> after implementation +-- | Center a string to the given length, using the specified +-- fill character on either side. Performs replacement on invalid +-- scalar values. +-- +-- Examples: +-- +-- >> center 8 'x' "HS" +-- "xxxHSxxx" +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE center #-} +center :: Int -> Char -> Utf8 -> Utf8 +center = undefined + +-- | The 'transpose' function transposes the rows and columns +-- of its 'Utf8' argument. Note that this function uses 'pack', +-- 'unpack', and the list version of transpose, and is thus not very +-- efficient. +-- +-- Examples: +-- +-- >>> transpose ["green","orange"] +-- ["go","rr","ea","en","ng","e"] +-- +-- >>> transpose ["blue","red"] +-- ["br","le","ud","e"] +-- +-- /Time complexity:/ O(n) +{-# INLINE transpose #-} +transpose :: [Utf8] -> [Utf8] +transpose = Prelude.map pack . List.transpose . Prelude.map unpack + +-------------------------------------------------------------------------------- +-- Building 'Utf8's +-------------------------------------------------------------------------------- + +-- | 'scanl' is similar to 'foldl', but returns a list of +-- successive reduced values from the left. +-- Performs replacement on invalid scalar values. +-- +-- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...] +-- +-- Note that +-- +-- > last (scanl f z xs) == foldl f z xs. +-- +-- /Time complexity:/ O(n) +{-# INLINE scanl #-} +scanl :: (Char -> Char -> Char) -> Char -> Utf8 -> Utf8 +scanl f z t = fromStream (Stream.scanl' f z (toStream t)) + +-- | 'scanl1' is a variant of 'scanl' that has no starting +-- value argument. Performs replacement on invalid scalar values. +-- +-- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...] +-- +-- /Time complexity:/ O(n) +{-# INLINE scanl1 #-} +scanl1 :: (Char -> Char -> Char) -> Utf8 -> Utf8 +scanl1 f t = fromStream (Stream.scanl1' f (toStream t)) + +-- | 'scanl'' is similar to 'foldl'', but returns a list of +-- successive reduced values from the left. +-- Performs replacement on invalid scalar values. +-- +-- > scanl' f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...] +-- +-- Note that +-- +-- > last (scanl' f z xs) == foldl f z xs. +-- +-- /Time complexity:/ O(n) +{-# INLINE scanl' #-} +scanl' :: (Char -> Char -> Char) -> Char -> Utf8 -> Utf8 +scanl' f z t = fromStream (Stream.scanl' f z (toStream t)) + +-- | 'scanl1'' is a variant of 'scanl'' that has no starting +-- value argument. Performs replacement on invalid scalar values. +-- +-- > scanl1' f [x1, x2, ...] == [x1, x1 `f` x2, ...] +-- +-- /Time complexity:/ O(n) +{-# INLINE scanl1' #-} +scanl1' :: (Char -> Char -> Char) -> Utf8 -> Utf8 +scanl1' f t = fromStream (Stream.scanl1' f (toStream t)) + +-- | 'scanr' is the right-to-left dual of 'scanl'. Performs +-- replacement on invalid scalar values. +-- +-- > scanr f v == reverse . scanl (flip f) v . reverse +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE scanr #-} +scanr :: (Char -> Char -> Char) -> Char -> Utf8 -> Utf8 +scanr = undefined + +-- | 'scanr1' is a variant of 'scanr' that has no starting +-- value argument. Performs replacement on invalid scalar values. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE scanr1 #-} +scanr1 :: (Char -> Char -> Char) -> Utf8 -> Utf8 +scanr1 = undefined + +-------------------------------------------------------------------------------- +-- Searching with a predicate +-------------------------------------------------------------------------------- + +-- | 'filter', applied to a predicate and a 'Utf8', +-- returns a 'Utf8' containing those characters that satisfy the +-- predicate. +-- +-- /Time complexity:/ O(n) +{-# INLINE filter #-} +filter :: (Char -> Bool) -> Utf8 -> Utf8 +filter p t = fromStream (Stream.filter p (toStream t)) + +-------------------------------------------------------------------------------- +-- Substrings +-------------------------------------------------------------------------------- + +-- | 'take' @n@, applied to a 'Utf8', returns the prefix of the +-- 'Utf8' of length @n@, or the 'Utf8' itself if @n@ is greater than +-- the length of the Utf8. +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL take #-} +take :: Int -> Utf8 -> Utf8 +take n t = fromStream (Stream.take n (toStream t)) + +-- | 'takeEnd' @n@ @t@ returns the suffix remaining after +-- taking @n@ characters from the end of @t@. +-- +-- Examples: +-- +-- >> takeEnd 3 "foobar" +-- "bar" +-- +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE_NORMAL takeEnd #-} +takeEnd :: Int -> Utf8 -> Utf8 +takeEnd = undefined + +-- | 'drop' @n@, applied to a 'Utf8', returns the suffix of the +-- 'Utf8' after the first @n@ characters, or the empty 'Utf8' if @n@ +-- is greater than the length of the 'Utf8'. +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL drop #-} +drop :: Int -> Utf8 -> Utf8 +drop n t = fromStream (Stream.drop n (toStream t)) + +-- | 'dropEnd' @n@ @t@ returns the prefix remaining after +-- dropping @n@ characters from the end of @t@. +-- +-- Examples: +-- +-- >> dropEnd 3 "foobar" +-- "foo" +-- +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE_NORMAL dropEnd #-} +dropEnd :: Int -> Utf8 -> Utf8 +dropEnd = undefined + +-- | 'takeWhile', applied to a predicate @p@ and a 'Utf8', +-- returns the longest prefix (possibly empty) of elements that +-- satisfy @p@. +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL takeWhile #-} +takeWhile :: (Char -> Bool) -> Utf8 -> Utf8 +takeWhile p t = fromStream (Stream.takeWhile p (toStream t)) + +-- | 'takeWhileEnd', applied to a predicate @p@ and a 'Utf8', +-- returns the longest suffix (possibly empty) of elements that +-- satisfy @p@. +-- Examples: +-- +-- >> takeWhileEnd (=='o') "foo" +-- "oo" +-- +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE_NORMAL takeWhileEnd #-} +takeWhileEnd :: (Char -> Bool) -> Utf8 -> Utf8 +takeWhileEnd = undefined + +-- | 'dropWhile' @p@ @t@ returns the suffix remaining after +-- 'takeWhile' @p@ @t@. +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL dropWhile #-} +dropWhile :: (Char -> Bool) -> Utf8 -> Utf8 +dropWhile p t = fromStream (Stream.dropWhile p (toStream t)) + +-- | 'dropWhileEnd' @p@ @t@ returns the prefix remaining after +-- dropping characters that satisfy the predicate @p@ from the end of +-- @t@. +-- +-- Examples: +-- +-- >> dropWhileEnd (=='.') "foo..." +-- "foo" +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE_NORMAL dropWhileEnd #-} +dropWhileEnd :: (Char -> Bool) -> Utf8 -> Utf8 +dropWhileEnd = undefined + +-- | 'dropAround' @p@ @t@ returns the substring remaining after +-- dropping characters that satisfy the predicate @p@ from both the +-- beginning and end of @t@. +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL dropAround #-} +dropAround :: (Char -> Bool) -> Utf8 -> Utf8 +dropAround p = dropWhile p . dropWhileEnd p + +-- | Remove leading white space from a string. Equivalent to: +-- +-- > dropWhile isSpace +-- +-- /Time complexity:/ O(n) +{-# INLINE stripStart #-} +stripStart :: Utf8 -> Utf8 +stripStart = dropWhile isSpace + +-- | Remove trailing white space from a string. Equivalent to: +-- +-- > dropWhileEnd isSpace +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL stripEnd #-} +stripEnd :: Utf8 -> Utf8 +stripEnd = dropWhileEnd isSpace + +-- | Remove leading and trailing white space from a string. +-- Equivalent to: +-- +-- > dropAround isSpace +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL strip #-} +strip :: Utf8 -> Utf8 +strip = dropAround isSpace diff --git a/src/Streamly/Internal/Unicode/Utf8/Type.hs b/src/Streamly/Internal/Unicode/Utf8/Type.hs new file mode 100644 index 0000000000..540b37304e --- /dev/null +++ b/src/Streamly/Internal/Unicode/Utf8/Type.hs @@ -0,0 +1,422 @@ +-- | +-- Module : Streamly.Internal.Unicode.Utf8.Type +-- Copyright : (c) 2021 Composewell Technologies +-- License : BSD-3-Clause +-- Maintainer : streamly@composewell.com +-- Stability : experimental +-- Portability : GHC +-- +module Streamly.Internal.Unicode.Utf8.Type + ( + -- * Type + Utf8 + , toArray + + -- * Creation and elimination + , empty + , singleton + , toStream + , fromStream + , pack + , unpack + + -- * Basic interface + , cons + , snoc + , append + , uncons + , unsnoc + , head + , last + , tail + , init + , null + + , isSingleton + , length + , compareLength + + -- * Folds + , write + + -- * Unfolds + , read + ) +where + + +#include "inline.hs" + +-------------------------------------------------------------------------------- +-- Imports +-------------------------------------------------------------------------------- + +import Control.DeepSeq (NFData) +import Control.Monad.IO.Class (MonadIO) +import Data.Bifunctor (second) +import Data.Bits (shiftR, (.&.)) +import Data.Char (ord) +import Data.String (IsString(..)) +import Data.Word (Word8) +import GHC.Base (assert) +import GHC.IO.Encoding.Failure (isSurrogate) +import Streamly.Internal.Data.Array.Foreign.Type (Array) +import Streamly.Internal.Data.Fold (Fold) +import Streamly.Internal.Data.Stream.IsStream (SerialT) +import Streamly.Internal.Data.Unfold (Unfold) +import System.IO.Unsafe (unsafePerformIO) + +import qualified Streamly.Internal.Data.Array.Foreign as Array +import qualified Streamly.Internal.Data.Array.Foreign.Type as Array +import qualified Streamly.Internal.Data.Array.Foreign.Mut.Type as MArray +import qualified Streamly.Internal.Data.Fold.Type as Fold +import qualified Streamly.Internal.Data.Stream.IsStream as Stream +import qualified Streamly.Internal.Unicode.Stream as Unicode + +import Prelude hiding (head, init, last, length, null, read, tail) + +-- $setup +-- >>> :set -XOverloadedStrings +-- >>> import qualified Streamly.Internal.Unicode.Utf8 as Utf8 + +-------------------------------------------------------------------------------- +-- Type +-------------------------------------------------------------------------------- + +-- | A space efficient, packed, unboxed Unicode container. +newtype Utf8 = + Utf8 (Array Word8) + deriving (NFData) + +empty :: Utf8 +empty = Utf8 Array.nil + +-------------------------------------------------------------------------------- +-- Helpers +-------------------------------------------------------------------------------- + +{-# INLINE toStream #-} +toStream :: Monad m => Utf8 -> SerialT m Char +toStream = Unicode.decodeUtf8 . Array.toStream . toArray + +-- XXX Try removing IO +-- XXX This would require rewriting Array.Foreign.write-ish functions. We can't +-- use the underlying Array.Foreign.Mut.write-ish +{-# INLINE fromStream #-} +fromStream :: SerialT IO Char -> Utf8 +fromStream = + Utf8 . unsafePerformIO . Stream.fold Array.write . Unicode.encodeUtf8 + +-------------------------------------------------------------------------------- +-- Conversion to/from 'Utf8' +-------------------------------------------------------------------------------- + +-- | Convert a 'Utf8' to a 'Array Word8'. +-- +-- /Time complexity:/ O(1) +{-# INLINE toArray #-} +toArray :: Utf8 -> Array Word8 +toArray (Utf8 arr) = arr + +-- | Convert a 'String' into a 'Utf8'. Performs +-- replacement on invalid scalar values. +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL pack #-} +pack :: String -> Utf8 +pack = fromStream . Stream.fromList + +-- | Convert a 'Utf8' into a 'String'. +-- +-- /Time complexity:/ O(n) +{-# INLINE_NORMAL unpack #-} +unpack :: Utf8 -> String +unpack = unsafePerformIO . Stream.toList . toStream + +-------------------------------------------------------------------------------- +-- Instances +-------------------------------------------------------------------------------- + +instance IsString Utf8 where + {-# INLINE fromString #-} + fromString = pack + +instance Show Utf8 where + {-# INLINE showsPrec #-} + showsPrec p ps r = showsPrec p (unpack ps) r + +-------------------------------------------------------------------------------- +-- Streamly style APIs +-------------------------------------------------------------------------------- + +-- XXX From the review: +-- +-- This should be implemented as an Unfold m Char Word8 composed with the input +-- of Array.write. For that we would need to implement unfoldMany for folds: +-- +-- > unfoldMany :: Unfold m a b -> Fold m b c -> Fold m a c +-- +-- If we assume the argument fold to be a non-terminating then it should be easy +-- to implement. That is do not handle the done case, just error out in the done +-- case. +-- +-- Once we have that then we can use: +-- +-- > writeGeneric = Fold.unfoldMany readCharUtf8 A.write +-- +-- For readCharUtf8 see https://github.com/composewell/streamly/pull/1055/files +{-# INLINE write #-} +write :: forall m. MonadIO m => Fold m Char Utf8 +write = Fold.Fold step initial (return . Utf8 . Array.unsafeFreeze) + + where + + -- XXX Start of with some specific size? + initial = return $ Fold.Partial MArray.nil + + -- XXX snocExp over snoc? + step arr c = + case ord c of + x + | x <= 0x7F -> do + arr1 <- arr `MArray.snoc` fromIntegral x + return $ Fold.Partial arr1 + | x <= 0x7FF -> do + arr1 <- arr `snoc2` c + return $ Fold.Partial arr1 + | x <= 0xFFFF -> + if isSurrogate c + then Fold.Partial <$> snoc3_ arr 239 191 189 + else do + arr1 <- arr `snoc3` c + return $ Fold.Partial arr1 + | otherwise -> do + arr1 <- arr `snoc4` c + return $ Fold.Partial arr1 + + {-# INLINE snoc2 #-} + snoc2 :: MArray.Array Word8 -> Char -> m (MArray.Array Word8) + snoc2 arr c = + assert (n >= 0x80 && n <= 0x07ff) + $ do + arr1 <- arr `MArray.snoc` x1 + arr1 `MArray.snoc` x2 + + where + + n = ord c + x1 = fromIntegral $ (n `shiftR` 6) + 0xC0 + x2 = fromIntegral $ (n .&. 0x3F) + 0x80 + + {-# INLINE snoc3_ #-} + snoc3_ :: + MArray.Array Word8 + -> Word8 + -> Word8 + -> Word8 + -> m (MArray.Array Word8) + snoc3_ arr x1 x2 x3 = do + arr1 <- arr `MArray.snoc` x1 + arr2 <- arr1 `MArray.snoc` x2 + arr2 `MArray.snoc` x3 + + {-# INLINE snoc3 #-} + snoc3 :: MArray.Array Word8 -> Char -> m (MArray.Array Word8) + snoc3 arr c = assert (n >= 0x80 && n <= 0x07ff) (snoc3_ arr x1 x2 x3) + + where + + n = ord c + x1 = fromIntegral $ (n `shiftR` 12) + 0xE0 + x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 + x3 = fromIntegral $ (n .&. 0x3F) + 0x80 + + {-# INLINE snoc4 #-} + snoc4 :: MArray.Array Word8 -> Char -> m (MArray.Array Word8) + snoc4 arr c = + assert (n >= 0x80 && n <= 0x07ff) + $ do + arr1 <- arr `MArray.snoc` x1 + arr2 <- arr1 `MArray.snoc` x2 + arr3 <- arr2 `MArray.snoc` x3 + arr3 `MArray.snoc` x4 + + where + + n = ord c + x1 = fromIntegral $ (n `shiftR` 18) + 0xF0 + x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80 + x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 + x4 = fromIntegral $ (n .&. 0x3F) + 0x80 + +-- XXX Similar to the comment on write, this should be implemented in terms of +-- "foldMany" along with "writeCharUtf8" +-- +-- foldMany :: Fold m b c -> Unfold m a b -> Unfold m a c +-- writeCharUtf8 :: Fold m Word8 Char +-- +-- read = foldMany Unicode.writeCharUtf8 Array.read +-- +-- | Unfold a 'Utf8' into a stream of 'Char'. +-- +-- /Unimplemented/ +{-# INLINE read #-} +read :: Unfold m Utf8 Char +read = undefined + +-------------------------------------------------------------------------------- +-- Basic functions +-------------------------------------------------------------------------------- + +-- | Produce a singleton 'Utf8'. +-- +singleton :: Char -> Utf8 +singleton = fromStream . Stream.fromPure + +-- XXX From the review: +-- +-- StreamD cons would be better here. And we should have a caveat that this +-- function should be avoided to build a big array using this, so you should not +-- be using foldr cons empty that would suck with StreamD cons. But an operation +-- like x cons xs would work much better with StreamD cons compared to regular +-- cons. +-- +-- You can also memcpy if that turns out to be faster than stream. +-- +-- | Adds a character to the front of a 'Utf8'. This function is more +-- costly than its 'List' counterpart because it requires copying a new array. +-- Performs replacement on invalid scalar values. +-- +-- /Time complexity:/ O(n) +{-# INLINE cons #-} +cons :: Char -> Utf8 -> Utf8 +cons c = fromStream . Stream.cons c . toStream + +-- | Adds a character to the end of a 'Utf8'. Performs replacement +-- on invalid scalar values. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE snoc #-} +snoc :: Utf8 -> Char -> Utf8 +snoc = undefined + +-- | Appends one 'Utf8' to the other by copying both of them into a new +-- 'Utf8'. +-- +-- /Time complexity:/ O(n) +{-# NOINLINE append #-} +append :: Utf8 -> Utf8 -> Utf8 +append (Utf8 a) (Utf8 b) = Utf8 $ unsafePerformIO $ Array.splice a b + +-- | Returns the first character of a 'Utf8', or 'Nothing' if empty. +-- +-- /Time complexity:/ O(1) +{-# INLINE head #-} +head :: Utf8 -> Maybe Char +head = unsafePerformIO . Stream.head . toStream + +-- XXX From the review: +-- +-- We can use a length fold and a single char decoding fold in parallel on the +-- stream. Then we can use a array slice to get the tail of the array using the +-- length returned by the length fold. +-- +-- Alternatively, we could get the head char, find its encoded length and use +-- that to slice the array. +-- +-- We can use "writeCharUtf8" - once implemented - here. See the notes on +-- "writeCharUtf8'" in "Unicode.Stream". +-- +-- | Returns the first character and rest of a 'Utf8', or 'Nothing' if +-- empty. +-- +-- /Time complexity:/ O(1) +{-# INLINE_NORMAL uncons #-} +uncons :: Utf8 -> Maybe (Char, Utf8) +uncons = fmap (second fromStream) . unsafePerformIO . Stream.uncons . toStream + +-- | Returns the last character of a 'Utf8', or 'Nothing' if empty. +-- +-- /Time complexity:/ O(1) +-- +-- /Unimplemented/ +{-# INLINE_NORMAL last #-} +last :: Utf8 -> Char +last = undefined + +-- | Returns all characters after the head of a 'Utf8', or 'Nothing' if +-- empty. +-- +-- /Time complexity:/ O(1) +{-# INLINE_NORMAL tail #-} +tail :: Utf8 -> Maybe Utf8 +tail = fmap snd . uncons + +-- XXX From the review +-- +-- If we can write a routine to decode utf8 in reverse then we can just decode +-- the last char from the end of the array and then slice it. +-- +-- Otherwise, use last on the stream, get the encoded length of the last char +-- and use that to slice it. +-- | Returns all but the last character of a 'Utf8', or 'Nothing' if +-- empty. +-- +-- /Time complexity:/ O(1) +{-# INLINE_NORMAL init #-} +init :: Utf8 -> Maybe Utf8 +init = fmap fromStream . unsafePerformIO . Stream.init . toStream + +-- | Returns all but the last character and the last character of a +-- 'Utf8', or 'Nothing' if empty. +-- +-- /Time complexity:/ O(1) +-- +-- /Unimplemented/ +{-# INLINE unsnoc #-} +unsnoc :: Utf8 -> Maybe (Utf8, Char) +unsnoc = undefined + +-- | Tests whether a 'Utf8' is empty or not. +-- +-- /Time complexity:/ O(1) +{-# INLINE null #-} +null :: Utf8 -> Bool +null = Array.null . toArray + +-- | Tests whether a 'Utf8' contains exactly one character. +-- +-- /Time complexity:/ O(1) +-- +-- /Unimplemented/ +{-# INLINE isSingleton #-} +isSingleton :: Utf8 -> Bool +isSingleton = undefined + +-- XXX From the review +-- +-- We could possibly determine the length faster by using a custom routine that +-- counts the starting chars from the utf8 encoded bytes without decoding the +-- chars. +-- +-- | Returns the number of characters in a 'Utf8'. +-- +-- /Time complexity:/ O(n) +{-# INLINE length #-} +length :: Utf8 -> Int +length = unsafePerformIO . Stream.length . toStream + +-- | Compare the count of characters in a 'Utf8' to a number. +-- +-- This function gives the same answer as comparing against the result of +-- 'length', but can short circuit if the count of characters is greater than +-- the number, and hence be more efficient. +-- +-- /Time complexity:/ O(n) +-- +-- /Unimplemented/ +{-# INLINE_NORMAL compareLength #-} +compareLength :: Utf8 -> Int -> Ordering +compareLength = undefined diff --git a/streamly.cabal b/streamly.cabal index 2d6dadeb14..2222c7aca1 100644 --- a/streamly.cabal +++ b/streamly.cabal @@ -507,6 +507,11 @@ library -- streamly-unicode , Streamly.Internal.Unicode.Stream + , Streamly.Internal.Unicode.Utf8.Type + , Streamly.Internal.Unicode.Utf8.Transform + , Streamly.Internal.Unicode.Utf8.Eliminate + , Streamly.Internal.Unicode.Utf8.Generate + , Streamly.Internal.Unicode.Utf8.Reduce , Streamly.Internal.Unicode.Utf8 , Streamly.Internal.Unicode.Char , Streamly.Internal.Unicode.Char.Parser