Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend the behaviour of Utf8 to replicate that of strict Text #1386

Draft
wants to merge 26 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2cb75c3
Export nil in immutable foreign array modules
adithyaov Dec 22, 2021
56a045a
Extend the behaviour of Utf8 to replicate that of strict Text
adithyaov Dec 23, 2021
d8a5a76
Split Unicode.Utf8 into Unicode.Type and Unicode.Transform
adithyaov Dec 28, 2021
bd4c051
Break out Eliminate, Generate, and Reduce from Unicode.Utf8
adithyaov Dec 29, 2021
95fe359
Remove specific exports and export entire modules instead
adithyaov Dec 29, 2021
d606c34
Comment out isInfixOf as it is memory intensive
adithyaov Dec 29, 2021
25b84fc
Add an IsString and a Show instance for Utf8
adithyaov Dec 29, 2021
a419c5c
Implement the basic write fold for Utf8
adithyaov Dec 30, 2021
a5492cd
Disable unimplemented docspec examples
adithyaov Dec 30, 2021
2cb9aa1
Comment out a few APIs to help with the compilation on GHCJS
adithyaov Dec 31, 2021
119633a
Rewrite time complexity in a different way
adithyaov Jan 6, 2022
4c467cc
Rearrange function order in "defined before use" in Utf8.Type
adithyaov Jan 7, 2022
e71127a
Rename stream and unstream to toStream and fromStream
adithyaov Jan 7, 2022
426a52d
Make minor changes and encode the review comments on Utf8.Type
adithyaov Jan 7, 2022
74e1280
Add a rewiew comment about adding general array combinators
adithyaov Jan 18, 2022
464a444
Cut short the functions hiding from Prelude in Utf8.*
adithyaov Jan 18, 2022
9c0cde5
Add "Unimplemented" tag to unimplemented functions in Utf8.*
adithyaov Jan 18, 2022
eb3d9f1
Remove wrong/unecessary implementation
adithyaov Jan 18, 2022
7ecb533
Improve Utf8.partition
adithyaov Jan 18, 2022
d8a0897
Use Bifunctor.second & remove redundant commented code 4rm Utf8.Type
adithyaov Jan 18, 2022
9bce2f5
Improve Utf8.singleton
adithyaov Jan 18, 2022
98e4400
Remove ambiguious time complexity specified
adithyaov Jan 18, 2022
6af34fa
Add some comments regarding removing MonadIO in Utf8
adithyaov Jan 18, 2022
07c248a
Give credit to the text package in Utf8
adithyaov Jan 18, 2022
a651995
Perform replacement of invalid code points instead of erroring
adithyaov Feb 1, 2022
09c7288
General cleanup and fixes
adithyaov Feb 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Streamly/Internal/Data/Array/Foreign.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ module Streamly.Internal.Data.Array.Foreign
Array

-- * Construction
, A.nil

-- Pure, From Static Memory (Unsafe)
-- We can use fromPtrM#, fromCStringM# and fromAddrM# to create arrays from
Expand Down
1 change: 1 addition & 0 deletions src/Streamly/Internal/Data/Array/Foreign/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ module Streamly.Internal.Data.Array.Foreign.Type
, unsafeThaw

-- * Construction
, nil
, splice

, fromPtr
Expand Down
164 changes: 131 additions & 33 deletions src/Streamly/Internal/Unicode/Utf8.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,62 +6,160 @@
-- Stability : experimental
-- Portability : GHC
--
-- This module mimics the API of the text package. Some documentation snippets
-- may have been taken from the text package.
module Streamly.Internal.Unicode.Utf8
(
-- * Type
Utf8

-- * Creation and elimination
, pack
, unpack
, toArray
module Streamly.Internal.Unicode.Utf8.Type
, module Streamly.Internal.Unicode.Utf8.Transform
, module Streamly.Internal.Unicode.Utf8.Eliminate
, module Streamly.Internal.Unicode.Utf8.Generate
, module Streamly.Internal.Unicode.Utf8.Reduce

-- * Folds

-- ** Special folds
, concat
, concatMap

-- * Substrings

-- ** Breaking into lines and words
, lines
--, lines'
, words
, unlines
, unwords

-- * Zipping
, zip
, zipWith

-- -* Ordered
-- , sort

-- -- * Low level operations
-- , copy
-- , unpackCString#

)
where

#include "inline.hs"

--------------------------------------------------------------------------------
-- Imports
--------------------------------------------------------------------------------

import Control.DeepSeq (NFData)
import Data.Word (Word8)
import Streamly.Internal.Data.Array.Foreign.Type (Array)
import Data.Char (isSpace)
import System.IO.Unsafe (unsafePerformIO)

import qualified Streamly.Internal.Data.Array.Foreign as Array
import qualified Data.List as List
import qualified Streamly.Internal.Data.Stream.IsStream as Stream
import qualified Streamly.Internal.Unicode.Stream as Unicode

import Streamly.Internal.Unicode.Utf8.Type
import Streamly.Internal.Unicode.Utf8.Transform
import Streamly.Internal.Unicode.Utf8.Eliminate
import Streamly.Internal.Unicode.Utf8.Generate
import Streamly.Internal.Unicode.Utf8.Reduce

import Prelude hiding
( concat
, concatMap
, foldr
, lines
, null
, unlines
, unwords
, words
, zip
, zipWith
)

-- $setup
-- >>> :set -XOverloadedStrings
-- >>> import qualified Streamly.Internal.Unicode.Utf8 as Utf8

--------------------------------------------------------------------------------
-- Type
-- Special folds
--------------------------------------------------------------------------------

-- | A space efficient, packed, unboxed Unicode container.
--
-- Represented as a newtype over an 'Array' of 'Word8'.
--
-- NOTE(review): the bytes are presumably expected to be valid UTF-8, but
-- nothing in this declaration enforces that -- confirm against the functions
-- that construct a 'Utf8'.
newtype Utf8 =
Utf8 (Array Word8)
deriving (NFData)
-- XXX We should write these APIs generalized on Array a and then just use those
-- for the Utf8 type. The generalized APIs would be more useful, they can go in
-- the Array module itself and can be used generally for arrays, you won't need
-- to transform arrays into stream and then back for such common operations.
-- | Concatenate a list of 'Utf8's.
--
-- Empty inputs are dropped first, so an empty list (or a list holding only
-- empty values) yields 'empty', and a single non-empty value is returned
-- unchanged. The remaining values are combined left to right with 'append'.
--
-- The fold is seeded with the first non-empty element and uses the strict
-- 'List.foldl'', so no partial function is involved and no chain of suspended
-- 'append' calls is built up.
--
-- NOTE(review): 'append' is still called once per remaining element. If
-- 'append' copies both of its operands, the total work is quadratic in the
-- worst case rather than the O(n) stated below -- TODO confirm, and consider
-- a single-pass, stream based implementation.
--
-- /Time complexity:/ O(n)
{-# INLINE concat #-}
concat :: [Utf8] -> Utf8
concat ts =
    case Prelude.filter (not . null) ts of
        [] -> empty
        t : rest -> List.foldl' append t rest
adithyaov marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should use something like this:

concat m = fromStreamD $ D.unfoldMany A.read (toStreamD m)


-- | Apply a 'Utf8'-producing function to every 'Char' of a 'Utf8' and join
-- all the results, in order, into a single 'Utf8'.
--
-- The per-character results are first collected into a list with 'foldr'
-- and then handed to 'concat'.
--
-- /Time complexity:/ O(n)
{-# INLINE concatMap #-}
concatMap :: (Char -> Utf8) -> Utf8 -> Utf8
concatMap f = concat . foldr prepend []

    where

    -- Put the chunk generated for this character in front of the chunks
    -- generated for the characters that follow it.
    prepend c rest = f c : rest
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer to use stream based implementation instead of using list.


--------------------------------------------------------------------------------
-- Functions
-- Zipping
--------------------------------------------------------------------------------

-- | Unwrap a 'Utf8', exposing the 'Array' of 'Word8' it is built from.
{-# INLINE toArray #-}
toArray :: Utf8 -> Array Word8
toArray utf8 =
    case utf8 of
        Utf8 bytes -> bytes
-- | 'zip' takes two 'Utf8's and returns a list of
-- corresponding pairs of characters. If one input 'Utf8' is short,
-- excess elements of the longer 'Utf8' are discarded. This is
-- equivalent to a pair of 'unpack' operations.
--
-- The pairs are produced by zipping the two character streams obtained with
-- 'toStream' and collecting the result into a list.
--
-- NOTE(review): the stream is run with 'unsafePerformIO'; presumably it has
-- no observable side effects -- TODO confirm, or run it in a pure monad
-- instead.
--
-- /Time complexity:/ O(n)
{-# INLINE zip #-}
zip :: Utf8 -> Utf8 -> [(Char,Char)]
zip a b =
unsafePerformIO
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use identity monad instead of unsafePerformIO.

$ Stream.toList $ Stream.zipWith (,) (toStream a) (toStream b)

-- | A generalisation of 'zip': rather than pairing corresponding characters,
-- combine them with the supplied function and build a 'Utf8' from the
-- results.
-- Performs replacement on invalid scalar values.
--
-- /Time complexity:/ O(n)
{-# INLINE zipWith #-}
zipWith :: (Char -> Char -> Char) -> Utf8 -> Utf8 -> Utf8
zipWith f a b = fromStream (Stream.zipWith f lhs rhs)

    where

    -- Character streams of the two inputs.
    lhs = toStream a
    rhs = toStream b

-- | Convert a 'String' into a 'Utf8' by UTF-8 encoding its characters and
-- collecting the resulting bytes into an array.
--
-- The array is built from the whole encoded byte stream. It must not be
-- capped at @length s@ elements: that counts characters, whereas a non-ASCII
-- character encodes to more than one byte, so a character-count cap would
-- cut the encoded output short.
{-# INLINEABLE pack #-}
pack :: String -> Utf8
pack s =
    Utf8
        $ unsafePerformIO
        $ Array.fromStream $ Unicode.encodeUtf8' $ Stream.fromList s
-- | Split a 'Utf8' into a list of words, cutting at every 'Char' that
-- 'isSpace' classifies as white space.
--
-- /Time complexity:/ O(n)
{-# INLINE words #-}
words :: Utf8 -> [Utf8]
words = split isWordBreak

    where

    -- A word ends at any white space character.
    isWordBreak :: Char -> Bool
    isWordBreak = isSpace

where
-- | Split a 'Utf8' into a list of 'Utf8's, cutting at every newline 'Char'.
-- The resulting strings do not contain newlines.
--
-- /Time complexity:/ O(n)
{-# INLINE lines #-}
lines :: Utf8 -> [Utf8]
lines = split isNewline

    where

    -- Only the line feed character separates lines.
    isNewline :: Char -> Bool
    isNewline c = c == '\n'

len = length s
-- | Append a terminating newline to every 'Utf8' in the list and join the
-- results into a single 'Utf8'.
--
-- /Time complexity:/ O(n)
{-# INLINE unlines #-}
unlines :: [Utf8] -> Utf8
unlines = concat . List.map terminate

    where

    -- Add the trailing newline to one line.
    terminate line = line `snoc` '\n'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

using snoc may not be most efficient, we should implement it in a streaming way, see unlines in Unicode.Stream module.


-- | Convert a 'Utf8' into a 'String' by streaming the bytes of its
-- underlying array through 'Unicode.decodeUtf8'' and collecting the decoded
-- characters into a list.
{-# INLINEABLE unpack #-}
unpack :: Utf8 -> String
unpack u = unsafePerformIO (Stream.toList chars)

    where

    -- Raw bytes of the underlying array, as a stream.
    bytes = Array.toStream (toArray u)

    -- The same stream after UTF-8 decoding.
    chars = Unicode.decodeUtf8' bytes
-- | Join a list of 'Utf8' words into one 'Utf8', placing a single space
-- character between consecutive words.
--
-- /Time complexity:/ O(n)
{-# INLINE unwords #-}
unwords :: [Utf8] -> Utf8
unwords = intercalate separator

    where

    -- The one-character 'Utf8' inserted between words.
    separator = singleton ' '
Loading