Permalink
Browse files

refactor underway to support hyph-utf8 data files

  • Loading branch information...
1 parent 0b86ead commit ef1c3b987f286eeca4de998f728d68c075f4e1d1 @ekmett committed Feb 11, 2012
Showing with 298,375 additions and 275 deletions.
  1. +1 −1 LICENSE
  2. +38 −132 Text/Hyphenation.hs
  3. +58 −0 Text/Hyphenation/Exception.hs
  4. +365 −0 Text/Hyphenation/Language.hs
  5. +81 −0 Text/Hyphenation/Pattern.hs
  6. +39 −0 data/hyph-af.chr.txt
  7. +1 −0 data/hyph-af.hyp.txt
  8. +26 −0 data/hyph-af.lic.txt
  9. +10,349 −0 data/hyph-af.pat.txt
  10. +72 −0 data/hyph-as.chr.txt
  11. 0 data/hyph-as.hyp.txt
  12. +25 −0 data/hyph-as.lic.txt
  13. +72 −0 data/hyph-as.pat.txt
  14. +30 −0 data/hyph-bg.chr.txt
  15. 0 data/hyph-bg.hyp.txt
  16. +45 −0 data/hyph-bg.lic.txt
  17. +1,660 −0 data/hyph-bg.pat.txt
  18. +72 −0 data/hyph-bn.chr.txt
  19. 0 data/hyph-bn.hyp.txt
  20. +25 −0 data/hyph-bn.lic.txt
  21. +72 −0 data/hyph-bn.pat.txt
  22. +34 −0 data/hyph-ca.chr.txt
  23. +26 −0 data/hyph-ca.hyp.txt
  24. +127 −0 data/hyph-ca.lic.txt
  25. +869 −0 data/hyph-ca.pat.txt
  26. +34 −0 data/hyph-cop.chr.txt
  27. 0 data/hyph-cop.hyp.txt
  28. +157 −0 data/hyph-cop.lic.txt
  29. +627 −0 data/hyph-cop.pat.txt
  30. +41 −0 data/hyph-cs.chr.txt
  31. +5 −0 data/hyph-cs.hyp.txt
  32. +87 −0 data/hyph-cs.lic.txt
  33. +3,636 −0 data/hyph-cs.pat.txt
  34. +27 −0 data/hyph-cy.chr.txt
  35. 0 data/hyph-cy.hyp.txt
  36. +29 −0 data/hyph-cy.lic.txt
  37. +6,728 −0 data/hyph-cy.pat.txt
  38. +29 −0 data/hyph-da.chr.txt
  39. 0 data/hyph-da.hyp.txt
  40. +79 −0 data/hyph-da.lic.txt
  41. +1,144 −0 data/hyph-da.pat.txt
  42. +42 −0 data/hyph-de-1901.chr.txt
  43. 0 data/hyph-de-1901.hyp.txt
  44. +34 −0 data/hyph-de-1901.lic.txt
  45. +14,429 −0 data/hyph-de-1901.pat.txt
  46. +42 −0 data/hyph-de-1996.chr.txt
  47. 0 data/hyph-de-1996.hyp.txt
  48. +34 −0 data/hyph-de-1996.lic.txt
  49. +14,263 −0 data/hyph-de-1996.pat.txt
  50. +42 −0 data/hyph-de-ch-1901.chr.txt
  51. 0 data/hyph-de-ch-1901.hyp.txt
  52. +35 −0 data/hyph-de-ch-1901.lic.txt
  53. +14,070 −0 data/hyph-de-ch-1901.pat.txt
  54. +49 −0 data/hyph-el-monoton.chr.txt
  55. 0 data/hyph-el-monoton.hyp.txt
  56. +39 −0 data/hyph-el-monoton.lic.txt
  57. +573 −0 data/hyph-el-monoton.pat.txt
  58. +157 −0 data/hyph-el-polyton.chr.txt
  59. 0 data/hyph-el-polyton.hyp.txt
  60. +40 −0 data/hyph-el-polyton.lic.txt
  61. +1,208 −0 data/hyph-el-polyton.pat.txt
  62. +26 −0 data/hyph-en-gb.chr.txt
  63. +8 −0 data/hyph-en-gb.hyp.txt
  64. +62 −0 data/hyph-en-gb.lic.txt
  65. +8,527 −0 data/hyph-en-gb.pat.txt
  66. +26 −0 data/hyph-en-us.chr.txt
  67. +14 −0 data/hyph-en-us.hyp.txt
  68. +42 −0 data/hyph-en-us.lic.txt
  69. +0 −19 data/{en.hyp → hyph-en-us.pat.txt}
  70. +28 −0 data/hyph-eo.chr.txt
  71. 0 data/hyph-eo.hyp.txt
  72. +35 −0 data/hyph-eo.lic.txt
  73. +2,203 −0 data/hyph-eo.pat.txt
  74. +32 −0 data/hyph-es.chr.txt
  75. 0 data/hyph-es.hyp.txt
  76. +28 −0 data/hyph-es.lic.txt
  77. +3,371 −0 data/hyph-es.pat.txt
  78. +27 −0 data/hyph-et.chr.txt
  79. 0 data/hyph-et.hyp.txt
  80. +82 −0 data/hyph-et.lic.txt
  81. +3,691 −0 data/hyph-et.pat.txt
  82. +26 −0 data/hyph-eu.chr.txt
  83. 0 data/hyph-eu.hyp.txt
  84. +47 −0 data/hyph-eu.lic.txt
  85. +192 −0 data/hyph-eu.pat.txt
  86. +25 −0 data/hyph-fi.chr.txt
  87. 0 data/hyph-fi.hyp.txt
  88. +80 −0 data/hyph-fi.lic.txt
  89. +286 −0 data/hyph-fi.pat.txt
  90. +39 −0 data/hyph-fr.chr.txt
  91. 0 data/hyph-fr.hyp.txt
  92. +108 −0 data/hyph-fr.lic.txt
  93. +144 −107 data/{fr.hyp → hyph-fr.pat.txt}
  94. +28 −0 data/hyph-ga.chr.txt
  95. +46 −0 data/hyph-ga.hyp.txt
  96. +102 −0 data/hyph-ga.lic.txt
  97. +6,033 −0 data/hyph-ga.pat.txt
  98. +31 −0 data/hyph-gl.chr.txt
  99. 0 data/hyph-gl.hyp.txt
  100. +23 −0 data/hyph-gl.lic.txt
  101. +2,321 −0 data/hyph-gl.pat.txt
  102. +157 −0 data/hyph-grc.chr.txt
  103. 0 data/hyph-grc.hyp.txt
  104. +44 −0 data/hyph-grc.lic.txt
  105. +4,296 −0 data/hyph-grc.pat.txt
  106. +66 −0 data/hyph-gu.chr.txt
  107. 0 data/hyph-gu.hyp.txt
  108. +25 −0 data/hyph-gu.lic.txt
  109. +66 −0 data/hyph-gu.pat.txt
  110. +70 −0 data/hyph-hi.chr.txt
  111. 0 data/hyph-hi.hyp.txt
  112. +25 −0 data/hyph-hi.lic.txt
  113. +70 −0 data/hyph-hi.pat.txt
  114. +27 −0 data/hyph-hr.chr.txt
  115. 0 data/hyph-hr.hyp.txt
  116. +22 −0 data/hyph-hr.lic.txt
  117. +1,475 −0 data/hyph-hr.pat.txt
  118. +33 −0 data/hyph-hsb.chr.txt
  119. +34 −0 data/hyph-hsb.hyp.txt
  120. +86 −0 data/hyph-hsb.lic.txt
  121. +1,516 −0 data/hyph-hsb.pat.txt
  122. +36 −0 data/hyph-hu.chr.txt
  123. 0 data/hyph-hu.hyp.txt
  124. +48 −0 data/hyph-hu.lic.txt
  125. +62,851 −0 data/hyph-hu.pat.txt
  126. +37 −0 data/hyph-hy.chr.txt
  127. 0 data/hyph-hy.hyp.txt
  128. +24 −0 data/hyph-hy.lic.txt
  129. +1,428 −0 data/hyph-hy.pat.txt
  130. +26 −0 data/hyph-ia.chr.txt
  131. +10 −0 data/hyph-ia.hyp.txt
  132. +95 −0 data/hyph-ia.lic.txt
  133. +638 −0 data/hyph-ia.pat.txt
  134. +25 −0 data/hyph-id.chr.txt
  135. +55 −0 data/hyph-id.hyp.txt
  136. +114 −0 data/hyph-id.lic.txt
  137. +143 −0 data/hyph-id.pat.txt
  138. +36 −0 data/hyph-is.chr.txt
  139. 0 data/hyph-is.hyp.txt
  140. +81 −0 data/hyph-is.lic.txt
  141. +2 −8 data/{is.hyp → hyph-is.pat.txt}
  142. +28 −0 data/hyph-it.chr.txt
  143. 0 data/hyph-it.hyp.txt
  144. +46 −0 data/hyph-it.lic.txt
  145. +377 −0 data/hyph-it.pat.txt
  146. +31 −0 data/hyph-kmr.chr.txt
  147. 0 data/hyph-kmr.hyp.txt
  148. +34 −0 data/hyph-kmr.lic.txt
  149. +307 −0 data/hyph-kmr.pat.txt
  150. +73 −0 data/hyph-kn.chr.txt
  151. 0 data/hyph-kn.hyp.txt
  152. +25 −0 data/hyph-kn.lic.txt
  153. +73 −0 data/hyph-kn.pat.txt
  154. +26 −0 data/hyph-la.chr.txt
  155. 0 data/hyph-la.hyp.txt
  156. +108 −0 data/hyph-la.lic.txt
  157. +335 −0 data/hyph-la.pat.txt
  158. +49 −0 data/hyph-lo.chr.txt
  159. 0 data/hyph-lo.hyp.txt
  160. +20 −0 data/hyph-lo.lic.txt
  161. +994 −0 data/hyph-lo.pat.txt
  162. +33 −0 data/hyph-lt.chr.txt
  163. 0 data/hyph-lt.hyp.txt
  164. +49 −0 data/hyph-lt.lic.txt
  165. +1,546 −0 data/hyph-lt.pat.txt
  166. +33 −0 data/hyph-lv.chr.txt
  167. 0 data/hyph-lv.hyp.txt
  168. +17 −0 data/hyph-lv.lic.txt
  169. +11,583 −0 data/hyph-lv.pat.txt
  170. +76 −0 data/hyph-ml.chr.txt
  171. 0 data/hyph-ml.hyp.txt
  172. +25 −0 data/hyph-ml.lic.txt
  173. +88 −0 data/hyph-ml.pat.txt
  174. +35 −0 data/hyph-mn-cyrl.chr.txt
  175. 0 data/hyph-mn-cyrl.hyp.txt
  176. +88 −0 data/hyph-mn-cyrl.lic.txt
  177. +988 −0 data/hyph-mn-cyrl.pat.txt
  178. +70 −0 data/hyph-mr.chr.txt
  179. 0 data/hyph-mr.hyp.txt
  180. +25 −0 data/hyph-mr.lic.txt
  181. +70 −0 data/hyph-mr.pat.txt
  182. +423 −0 data/hyph-mul-ethi.chr.txt
  183. 0 data/hyph-mul-ethi.hyp.txt
  184. +21 −0 data/hyph-mul-ethi.lic.txt
  185. +423 −0 data/hyph-mul-ethi.pat.txt
  186. +35 −0 data/hyph-nb.chr.txt
  187. +2 −0 data/hyph-nb.hyp.txt
  188. +37 −0 data/hyph-nb.lic.txt
  189. +27,148 −0 data/hyph-nb.pat.txt
  190. +38 −0 data/hyph-nl.chr.txt
  191. +40 −0 data/hyph-nl.hyp.txt
  192. +100 −0 data/hyph-nl.lic.txt
  193. +12,724 −0 data/hyph-nl.pat.txt
  194. +35 −0 data/hyph-nn.chr.txt
  195. +2 −0 data/hyph-nn.hyp.txt
  196. +37 −0 data/hyph-nn.lic.txt
  197. +27,148 −0 data/hyph-nn.pat.txt
  198. +65 −0 data/hyph-or.chr.txt
  199. 0 data/hyph-or.hyp.txt
  200. +25 −0 data/hyph-or.lic.txt
  201. +65 −0 data/hyph-or.pat.txt
  202. +60 −0 data/hyph-pa.chr.txt
  203. 0 data/hyph-pa.hyp.txt
  204. +25 −0 data/hyph-pa.lic.txt
  205. +60 −0 data/hyph-pa.pat.txt
  206. +34 −0 data/hyph-pl.chr.txt
  207. +20 −0 data/hyph-pl.hyp.txt
  208. +91 −0 data/hyph-pl.lic.txt
  209. +4,053 −0 data/hyph-pl.pat.txt
  210. +37 −0 data/hyph-pt.chr.txt
  211. +2 −0 data/hyph-pt.hyp.txt
  212. +88 −0 data/hyph-pt.lic.txt
  213. +307 −0 data/hyph-pt.pat.txt
  214. +27 −0 data/hyph-ro.chr.txt
  215. 0 data/hyph-ro.hyp.txt
  216. +112 −0 data/hyph-ro.lic.txt
  217. +647 −0 data/hyph-ro.pat.txt
  218. +34 −0 data/hyph-ru.chr.txt
  219. +184 −0 data/hyph-ru.hyp.txt
  220. +79 −0 data/hyph-ru.lic.txt
  221. +7,021 −0 data/hyph-ru.pat.txt
  222. +483 −0 data/hyph-sa.chr.txt
  223. 0 data/hyph-sa.hyp.txt
  224. +20 −0 data/hyph-sa.lic.txt
  225. +778 −0 data/hyph-sa.pat.txt
  226. +30 −0 data/hyph-sh-cyrl.chr.txt
  227. +116 −0 data/hyph-sh-cyrl.hyp.txt
  228. +61 −0 data/hyph-sh-cyrl.lic.txt
  229. +2,652 −0 data/hyph-sh-cyrl.pat.txt
  230. +27 −0 data/hyph-sh-latn.chr.txt
  231. +116 −0 data/hyph-sh-latn.hyp.txt
  232. +61 −0 data/hyph-sh-latn.lic.txt
  233. +2,669 −0 data/hyph-sh-latn.pat.txt
  234. +41 −0 data/hyph-sk.chr.txt
  235. +5 −0 data/hyph-sk.hyp.txt
  236. +85 −0 data/hyph-sk.lic.txt
  237. +2,467 −0 data/hyph-sk.pat.txt
  238. +29 −0 data/hyph-sl.chr.txt
  239. 0 data/hyph-sl.hyp.txt
  240. +96 −0 data/hyph-sl.lic.txt
  241. +1,068 −0 data/hyph-sl.pat.txt
  242. +30 −0 data/hyph-sr-cyrl.chr.txt
  243. +130 −0 data/hyph-sr-cyrl.hyp.txt
  244. +82 −0 data/hyph-sr-cyrl.lic.txt
  245. +2,425 −0 data/hyph-sr-cyrl.pat.txt
  246. +30 −0 data/hyph-sv.chr.txt
  247. 0 data/hyph-sv.hyp.txt
  248. +114 −0 data/hyph-sv.lic.txt
  249. +4,693 −0 data/hyph-sv.pat.txt
  250. +51 −0 data/hyph-ta.chr.txt
  251. 0 data/hyph-ta.hyp.txt
  252. +25 −0 data/hyph-ta.lic.txt
  253. +71 −0 data/hyph-ta.pat.txt
  254. +72 −0 data/hyph-te.chr.txt
  255. 0 data/hyph-te.hyp.txt
  256. +25 −0 data/hyph-te.lic.txt
  257. +72 −0 data/hyph-te.pat.txt
  258. +31 −0 data/hyph-tk.chr.txt
  259. 0 data/hyph-tk.hyp.txt
  260. +20 −0 data/hyph-tk.lic.txt
  261. +2,372 −0 data/hyph-tk.pat.txt
  262. +32 −0 data/hyph-tr.chr.txt
  263. 0 data/hyph-tr.hyp.txt
  264. +38 −0 data/hyph-tr.lic.txt
  265. +597 −0 data/hyph-tr.pat.txt
  266. +36 −0 data/hyph-uk.chr.txt
  267. 0 data/hyph-uk.hyp.txt
  268. +72 −0 data/hyph-uk.lic.txt
  269. +4,565 −0 data/hyph-uk.pat.txt
  270. +28 −0 data/hyph-zh-latn-pinyin.chr.txt
  271. 0 data/hyph-zh-latn-pinyin.hyp.txt
  272. +90 −0 data/hyph-zh-latn-pinyin.lic.txt
  273. +194 −0 data/hyph-zh-latn-pinyin.pat.txt
  274. +14 −8 hyphenation.cabal
View
@@ -1,4 +1,4 @@
-Copyright 2011 Edward Kmett
+Copyright 2012 Edward Kmett
All rights reserved.
View
@@ -15,138 +15,44 @@
-- and simplified to remove the need for a manual exception list.
----------------------------------------------------------------------------
module Text.Hyphenation
- (
+ ( Hyphenator(..)
+ , hyphenationScore
-- * Hyphenate with a given set of patterns
- hyphenate
- -- * Pattern file support
- , readHyphenationPatternFile
- -- ** Loading installed patterns
- , hyphenateLanguage
- -- ** Known patterns
- , hyphenateEnglish
- , hyphenateFrench
- , hyphenateIcelandic
+ , hyphenate
+ , defaultLeftMin
+ , defaultRightMin
) where
-import Control.Monad (forM_)
-import qualified Data.Vector.Unboxed as V
-import qualified Data.Vector.Unboxed.Mutable as MV
-import qualified Data.IntMap as IM
-import Data.Char (isSpace, toLower)
-import Paths_hyphenation
-import System.IO.Unsafe
-
-data Trie = Trie [Int] (IM.IntMap Trie)
-
-insert :: String -> Trie -> Trie
-insert s0 = go (chars s0) where
- go [] (Trie _ m) = Trie (points s0) m
- go (x:xs) (Trie n m) = Trie n (IM.insertWith (\_ -> go xs) (fromEnum x) (mk xs) m)
-
- mk [] = Trie (points s0) IM.empty
- mk (x:xs) = Trie [] (IM.singleton (fromEnum x) (mk xs))
-
-points :: String -> [Int]
-points (x:yzs@(_:zs))
- | x >= '0' && x <= '9' = (fromEnum x - fromEnum '0') : points zs
- | otherwise = 0 : points yzs
-points [x] | x >= '0' && x <= '9' = [fromEnum x - fromEnum '0']
- | otherwise = [0,0]
-points [] = [0]
-
-chars :: String -> String
-chars = filter (\x -> (x < '0' || x > '9'))
-
--- | Builds a hyphenator given a character normalization function
--- and a list of patterns.
---
--- Designed to be used partially applied to all but the last argument
--- The resulting function can be used to break a word up into fragments
--- where it would be legal to hyphenate the text.
---
--- The Knuth-Liang hyphenation algorithm isn't designed to find all
--- such points, but it does find most of them, and in particular tries
--- avoids ones where the hyphenation varies depending on the use of the
--- word as, for instance either a noun or a verb.
---
--- > do en <- hyphenate toLower <$> readHyphenationPatternFile "en.hyp"
--- > return $ en "hyphenation"
---
--- > ["hy","phen","ation"]
-hyphenate :: (Char -> Char) -> [String]
- -> String -> [String]
-hyphenate nf patterns = check where
- tree = foldr insert (Trie [] IM.empty) patterns
- check word
- | n <= 4 = [word]
- | otherwise = process [] word $ V.toList $ V.create $ do
- pts <- MV.replicate (n + 3) 0
- forM_ [0..n-1] $ walk pts tree
- MV.write pts 1 0
- MV.write pts 2 0
- MV.write pts n 0
- MV.write pts (n + 1) 0
- return $ MV.slice 2 n pts
- where
- process :: String -> String -> [Int] -> [String]
- process acc (w:ws) (p:ps)
- | odd p = reverse (w:acc) : process [] ws ps
- | otherwise = process (w:acc) ws ps
- process acc [] [] = [reverse acc]
- process _ _ _ = error "hyphenate: the impossible happened"
- ls = map nf word
- work = V.fromList ('.' : ls ++ ".")
- n = length word
- walk allpts t i = step (V.toList (V.drop i work)) t
- where
- step (x:xs) (Trie _ m) = case IM.lookup (fromEnum x) m of
- Just t'@(Trie ps' _) -> do
- put i ps'
- step xs t'
- Nothing -> return ()
- step [] _ = return ()
- put j _ | j `seq` False = undefined
- put _ [] = return ()
- put j (x:xs) = do
- y <- MV.read allpts j
- MV.write allpts j $ max x y
- put (j + 1) xs
-
-content :: String -> Bool
-content (x:xs) = x /= '#' && not (isSpace x && content xs)
-content _ = True
-
--- | Load a file containing whitespace delimited patterns stripping out
--- comments lines that start with @#@
-readHyphenationPatternFile :: String -> IO [String]
-readHyphenationPatternFile fn = do
- body <- readFile fn
- return $ filter content (lines body) >>= words
-
--- | Read a built-in language file from the data directory where cabal installed this package.
---
--- (e.g. @hyphenateLanguage \"en\"@ opens @\"\/Users\/ekmett\/.cabal\/lib\/hyphenation-0.1\/ghc-7.4.1\/en.hyp\"@
--- when run on the author's local machine)
-hyphenateLanguage :: String -> IO (String -> [String])
-hyphenateLanguage language = do
- src <- getDataFileName (language ++ ".hyp")
- patterns <- readHyphenationPatternFile src
- return $ hyphenate toLower patterns
-
--- |
--- > ghci> hyphenateEnglish "supercalifragilisticexpialadocious"
--- > ["su","per","cal","ifrag","ilis","tic","ex","pi","al","ado","cious"]
-hyphenateEnglish :: String -> [String]
-hyphenateEnglish = unsafePerformIO (hyphenateLanguage "en")
-
--- |
--- > ghci> hyphenateFrench "anticonstitutionnellement"
--- > ["an","ti","cons","ti","tu","tion","nel","le","ment"]
-hyphenateFrench :: String -> [String]
-hyphenateFrench = unsafePerformIO (hyphenateLanguage "fr")
-
--- |
--- > ghci> hyphenateIcelandic "vaðlaheiðavegavinnuverkfærageymsluskúr"
--- > ["va\240la","hei\240a","vega","vinnu","verk","f\230ra","geymslu","sk\250r"]
-hyphenateIcelandic :: String -> [String]
-hyphenateIcelandic = unsafePerformIO (hyphenateLanguage "is")
+import Text.Hyphenation.Pattern
+import Text.Hyphenation.Exception
+
+-- | Stock value 2, presumably meant for the 'hyphenatorLeftMin' field of a
+-- 'Hyphenator' (nothing in the visible part of this module refers to it yet).
+defaultLeftMin :: Int
+defaultLeftMin = 2
+
+-- | Stock value 3, presumably meant for the 'hyphenatorRightMin' field of a
+-- 'Hyphenator' (nothing in the visible part of this module refers to it yet).
+defaultRightMin :: Int
+defaultRightMin = 3
+
+data Hyphenator = Hyphenator -- bundles everything hyphenationScore needs to score one word
+ { hyphenatorChars :: Char -> Char -- ^ applied to every character of a word before the word is looked up
+ , hyphenatorPatterns :: Patterns -- ^ consulted through lookupPattern when the word has no exception entry
+ , hyphenatorExceptions :: Exceptions -- ^ consulted first, through lookupException
+ , hyphenatorLeftMin :: {-# UNPACK #-} !Int -- ^ number of leading score entries that hyphenationScore forces to 0
+ , hyphenatorRightMin :: {-# UNPACK #-} !Int -- ^ number of trailing score entries that hyphenationScore forces to 0
+ }
+
+-- | Compute the list of hyphenation scores for a word.
+--
+-- The word is normalized character by character with the hyphenator's
+-- normalization function. A word of at most @leftMin + rightMin@ characters
+-- gets an all-zero list of @n + 1@ entries. Any longer word takes its raw
+-- score from the exception table when the normalized word is listed there and
+-- from the patterns otherwise; the raw score is then clipped so that the
+-- first @leftMin@ and the last @rightMin@ entries are 0.
+--
+-- NOTE(review): with a sufficiently long raw score the clipped list has
+-- @leftMin + (n - leftMin - rightMin) + rightMin = n@ entries, one fewer than
+-- the short-word branch returns. The original author already left the
+-- question "drop the final replicate?" here; confirm the intended length
+-- against 'lookupPattern' before changing it.
+hyphenationScore :: Hyphenator -> String -> [Int]
+hyphenationScore (Hyphenator normalize patterns exceptions leftMin rightMin) word
+  | leftMin + rightMin >= len = zeros (len + 1)
+  | otherwise = clip (maybe fromPatterns id (lookupException normalized exceptions))
+  where
+    len = length word
+    normalized = map normalize word
+    -- only demanded when the exception table has no entry for the word
+    fromPatterns = lookupPattern normalized patterns
+    zeros k = replicate k 0
+    -- zero the protected margins and keep the middle of the raw score
+    clip raw =
+      concat
+        [ zeros leftMin
+        , take (len - leftMin - rightMin) (drop leftMin raw)
+        , zeros rightMin
+        ]
+
+-- | Split a word into fragments at the points picked by 'hyphenationScore'.
+--
+-- The first score entry is discarded and the remaining entries are paired, in
+-- order, with the characters of the word; a fragment ends after every
+-- character whose entry is odd. Concatenating the fragments gives back the
+-- input word.
+--
+-- The leading entry is discarded with @drop 1@ rather than @tail@, so an
+-- empty score list yields the whole word as a single fragment instead of an
+-- exception.
+hyphenate :: Hyphenator -> String -> [String]
+hyphenate h s0 = go [] s0 (drop 1 (hyphenationScore h s0))
+  where
+    -- acc is the fragment under construction, held in reverse
+    go acc (w:ws) (p:ps)
+      | odd p     = reverse (w:acc) : go [] ws ps
+      | otherwise = go (w:acc) ws ps
+    -- word or scores exhausted: flush the pending fragment together with any
+    -- characters that have no score entry left
+    go acc ws _ = [reverse acc ++ ws]
@@ -0,0 +1,58 @@
+-----------------------------------------------------------------------------
+-- |
+-- Module : Text.Hyphenation.Exception
+-- Copyright : (C) 2012 Edward Kmett,
+-- License : BSD-style (see the file LICENSE)
+--
+-- Maintainer : Edward Kmett <ekmett@gmail.com>
+-- Stability : provisional
+-- Portability : portable
+--
+----------------------------------------------------------------------------
+module Text.Hyphenation.Exception
+ (
+ -- * Exception table support
+ Exceptions
+ , addException
+ , lookupException
+ , scoreException
+ , parseExceptions
+ ) where
+
+import qualified Data.HashMap.Strict as HM
+import Data.Monoid
+import Prelude hiding (lookup)
+
+-- manually supplied hyphenations
+newtype Exceptions = Exceptions (HM.HashMap String [Int]) -- key: word as stored by addException (every '-' removed); value: score list as built by scoreException
+ deriving Show
+
+-- | Pointwise minimum of two score lists; the result is as long as the
+-- shorter of the two inputs.
+zipMin :: [Int] -> [Int] -> [Int]
+zipMin = zipWith min
+
+-- | An exception entry lists the exact hyphenation points of a word.
+-- Combining two tables keeps the words of both; for a word present in both,
+-- the two score lists are merged position by position with 'min' (see
+-- 'zipMin'), so a point survives only when both entries allow it.
+instance Monoid Exceptions where
+  mempty = Exceptions HM.empty
+  mappend (Exceptions left) (Exceptions right) =
+    Exceptions (HM.unionWith zipMin left right)
+
+-- | Record one exception, given as a word with a @-@ at every permitted
+-- hyphenation point. The entry is filed under the word with every @-@
+-- removed. If that word is already in the table, the new score is combined
+-- with the stored one through 'zipMin', which leaves only the points that
+-- both versions permit.
+--
+-- The key is stored exactly as written. A caller that normalizes words
+-- before looking them up (as @hyphenationScore@ in "Text.Hyphenation" does)
+-- needs the exception words to be in normalized form already.
+addException :: String -> Exceptions -> Exceptions
+addException s (Exceptions table) =
+  Exceptions (HM.insertWith zipMin word score table)
+  where
+    word  = filter (/= '-') s
+    score = scoreException s
+
+-- | Fetch the stored score list for a word, or 'Nothing' when the table has
+-- no entry under exactly that spelling. The word is used as given: no
+-- hyphens are stripped and no normalization is applied here.
+lookupException :: String -> Exceptions -> Maybe [Int]
+lookupException word (Exceptions table) = word `HM.lookup` table
+
+-- | Turn a hyphenated word into its score list.
+--
+-- Each ordinary character contributes a 0 and the end of the input
+-- contributes a final 0. A @-@ contributes a 1 and also consumes the
+-- character that follows it, so for a word whose hyphens are all single and
+-- interior the result has one entry more than the word has letters:
+--
+-- > scoreException "hy-phen" == [0,0,1,0,0,0,0]
+--
+-- A trailing @-@ ends the list right after its 1.
+scoreException :: String -> [Int]
+scoreException [] = [0]
+scoreException ('-' : rest) =
+  1 : case rest of
+        []          -> []
+        (_ : after) -> scoreException after
+scoreException (_ : rest) = 0 : scoreException rest
+
+-- | Build an exception table from text that holds one hyphenated word per
+-- line. Every line is passed to 'addException', starting from the empty
+-- table. Blank lines are not skipped: each one files an entry under the
+-- empty word.
+parseExceptions :: String -> Exceptions
+parseExceptions input = foldr addException mempty (lines input)
Oops, something went wrong.

0 comments on commit ef1c3b9

Please sign in to comment.