Make isRangeValid take Lang as input

Summary: There are different implementations of isRangeValid that work well for different languages, thus it makes sense to facilitate having different implementations based on the language. Reviewed By: patapizza Differential Revision: D28362777 fbshipit-source-id: 5f2991d54af3095c8e95cf534e2dd3b4a34dee3a
facebook · May 17, 2021 · 69d9512 · 69d9512
1 parent 7762af8
commit 69d9512
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 56 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## 0.2.X.X
 
 ### Core
+* Make `isRangeValid` behave differently based on lang
 
 ### Rulesets
 * CA (Catalan)

diff --git a/Duckling/Engine.hs b/Duckling/Engine.hs
@@ -27,6 +27,7 @@ import qualified Data.List as L
 import qualified Text.Regex.PCRE as PCRE
 
 import Duckling.Dimensions.Types
+import Duckling.Locale
 import Duckling.Regex.Types
 import Duckling.Resolve
 import Duckling.Types hiding (regex)
@@ -40,11 +41,11 @@ import qualified Duckling.Types.Stash as Stash
 -- Engine
 
 parseAndResolve :: [Rule] -> Text -> Context -> Options -> [ResolvedToken]
-parseAndResolve rules input context options =
+parseAndResolve rules input context@Context{locale = Locale lang _} options =
   mapMaybe
     (resolveNode context options)
     $ force $ Stash.toPosOrderedList
-      $ runDuckling $ parseString rules (Document.fromText input)
+      $ runDuckling $ parseString lang rules (Document.fromText input)
 
 type Duckling a = Identity a
 
@@ -66,45 +67,45 @@ resolveNode context options n@Node{token = (Token dim dd), nodeRange = r}
     , isLatent = latent
     }
 
-parseString :: [Rule] -> Document -> Duckling Stash
-parseString rules sentence = do
+parseString :: Lang -> [Rule] -> Document -> Duckling Stash
+parseString lang rules sentence = do
   (new, partialMatches) <-
     -- One the first pass we try all the rules
-    parseString1 rules sentence Stash.empty Stash.empty []
+    parseString1 lang rules sentence Stash.empty Stash.empty []
   if Stash.null new
     then return Stash.empty
     else
     -- For subsequent passes, we only try rules starting with a predicate.
-    saturateParseString headPredicateRules sentence new new partialMatches
+    saturateParseString lang headPredicateRules sentence new new partialMatches
   where
   headPredicateRules =
     [ rule | rule@Rule{pattern = (Predicate _ : _)} <- rules ]
 
 -- | Produces all tokens recursively.
 saturateParseString
-  :: [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
-saturateParseString rules sentence stash new matches = do
-  (new', matches') <- parseString1 rules sentence stash new matches
+  :: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
+saturateParseString lang rules sentence stash new matches = do
+  (new', matches') <- parseString1 lang rules sentence stash new matches
   let stash' = Stash.union stash new'
   if Stash.null new'
     then return stash
-    else saturateParseString rules sentence stash' new' matches'
+    else saturateParseString lang rules sentence stash' new' matches'
 
 -- | Finds new matches resulting from newly added tokens.
 -- Produces new tokens from full matches.
 parseString1
-  :: [Rule] -> Document -> Stash -> Stash -> [Match]
+  :: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match]
   -> Duckling (Stash, [Match])
-parseString1 rules sentence stash new matches = do
+parseString1 lang rules sentence stash new matches = do
   -- Recursively match patterns.
   -- Find which `matches` can advance because of `new`.
-  newPartial <- concatMapM (matchFirst sentence new) matches
+  newPartial <- concatMapM (matchFirst sentence lang new) matches
 
   -- Find new matches resulting from newly added tokens (`new`)
-  newMatches <- concatMapM (matchFirstAnywhere sentence new) rules
+  newMatches <- concatMapM (matchFirstAnywhere sentence lang new) rules
 
   (full, partial) <- L.partition (\(Rule {pattern}, _, _) -> null pattern)
-    <$> matchAll sentence stash (newPartial ++ newMatches)
+    <$> matchAll sentence lang stash (newPartial ++ newMatches)
 
   -- Produce full matches as new tokens
   return ( Stash.fromList $ mapMaybe produce full
@@ -113,13 +114,13 @@ parseString1 rules sentence stash new matches = do
 
 -- | Recursively augments `matches`.
 -- Discards partial matches stuck by a regex.
-matchAll :: Document -> Stash -> [Match] -> Duckling [Match]
-matchAll sentence stash matches = concatMapM mkNextMatches matches
+matchAll :: Document -> Lang -> Stash -> [Match] -> Duckling [Match]
+matchAll sentence lang stash matches = concatMapM mkNextMatches matches
   where
     mkNextMatches :: Match -> Duckling [Match]
     mkNextMatches match@(Rule {pattern = []}, _, _) = return [ match ]
     mkNextMatches match@(Rule {pattern = p:_}, _, _) = do
-      nextMatches <- matchAll sentence stash =<< matchFirst sentence stash match
+      nextMatches <- matchAll sentence lang stash =<< matchFirst sentence lang stash match
       return $ case p of
         Regex _ -> nextMatches
         Predicate _ -> match:nextMatches
@@ -140,37 +141,37 @@ produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
 
 -- | Returns all matches matching the first pattern item of `match`,
 -- resuming from a Match position
-matchFirst :: Document -> Stash -> Match -> Duckling [Match]
-matchFirst _ _ (Rule {pattern = []}, _, _) = return []
-matchFirst sentence stash (rule@Rule{pattern = p : ps}, position, route) =
-  map (mkMatch route newRule) <$> lookupItem sentence p stash position
+matchFirst :: Document -> Lang -> Stash -> Match -> Duckling [Match]
+matchFirst _ _ _ (Rule {pattern = []}, _, _) = return []
+matchFirst sentence lang stash (rule@Rule{pattern = p : ps}, position, route) =
+  map (mkMatch route newRule) <$> lookupItem sentence lang p stash position
   where
   newRule = rule { pattern = ps }
 
 -- | Returns all matches matching the first pattern item of `match`,
 -- starting anywhere
-matchFirstAnywhere :: Document -> Stash -> Rule -> Duckling [Match]
-matchFirstAnywhere _sentence _stash Rule {pattern = []} = return []
-matchFirstAnywhere sentence stash rule@Rule{pattern = p : ps} =
-  map (mkMatch [] newRule) <$> lookupItemAnywhere sentence p stash
+matchFirstAnywhere :: Document -> Lang -> Stash -> Rule -> Duckling [Match]
+matchFirstAnywhere _sentence _lang _stash Rule {pattern = []} = return []
+matchFirstAnywhere sentence lang stash rule@Rule{pattern = p : ps} =
+  map (mkMatch [] newRule) <$> lookupItemAnywhere sentence lang p stash
   where
   newRule = rule { pattern = ps }
 
 -- | Handle one PatternItem at a given position
-lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
-lookupItem doc (Regex re) _ position =
+lookupItem :: Document -> Lang -> PatternItem -> Stash -> Int -> Duckling [Node]
+lookupItem doc lang (Regex re) _ position =
   filter (isPositionValid position doc) <$>
-  lookupRegex doc re position
-lookupItem doc (Predicate p) stash position =
+  lookupRegex doc lang re position
+lookupItem doc _lang (Predicate p) stash position =
   return $
   filter (p . token) $
   takeWhile (isPositionValid position doc) $
   Stash.toPosOrderedListFrom stash position
 
 -- | Handle one PatternItem anywhere in the text
-lookupItemAnywhere :: Document -> PatternItem -> Stash -> Duckling [Node]
-lookupItemAnywhere doc (Regex re) _ = lookupRegexAnywhere doc re
-lookupItemAnywhere _doc (Predicate p) stash =
+lookupItemAnywhere :: Document -> Lang -> PatternItem -> Stash -> Duckling [Node]
+lookupItemAnywhere doc lang (Regex re) _ = lookupRegexAnywhere doc lang re
+lookupItemAnywhere _doc _lang (Predicate p) stash =
   return $ filter (p . token) $ Stash.toPosOrderedList stash
 
 isPositionValid :: Int -> Document -> Node -> Bool
@@ -184,25 +185,26 @@ mkMatch route newRule node@Node{nodeRange = Range _ pos'} =
   where newRoute = node:route
 
 -- | Handle a regex match at a given position
-lookupRegex :: Document -> PCRE.Regex -> Int -> Duckling [Node]
-lookupRegex doc _regex position | position >= Document.length doc = return []
-lookupRegex doc regex position =
-  lookupRegexCommon doc regex position Regex.matchOnce
+lookupRegex :: Document -> Lang -> PCRE.Regex -> Int -> Duckling [Node]
+lookupRegex doc _lang _regex position | position >= Document.length doc = return []
+lookupRegex doc lang regex position =
+  lookupRegexCommon doc lang regex position Regex.matchOnce
 
 -- | Handle a regex match anywhere in the text
-lookupRegexAnywhere :: Document -> PCRE.Regex -> Duckling [Node]
-lookupRegexAnywhere doc regex = lookupRegexCommon doc regex 0 Regex.matchAll
+lookupRegexAnywhere :: Document -> Lang -> PCRE.Regex -> Duckling [Node]
+lookupRegexAnywhere doc lang regex = lookupRegexCommon doc lang regex 0 Regex.matchAll
 
 {-# INLINE lookupRegexCommon #-}
 -- INLINE bloats the code a bit, but the code is better
 lookupRegexCommon
   :: Foldable t
   => Document
+  -> Lang
   -> PCRE.Regex
   -> Int
   -> (PCRE.Regex -> ByteString -> t PCRE.MatchArray)
   -> Duckling [Node]
-lookupRegexCommon doc regex position matchFun = return nodes
+lookupRegexCommon doc lang regex position matchFun = return nodes
   where
   -- See Note [Regular expressions and Text] from Document.hs to understand
   -- what's going on here
@@ -215,7 +217,7 @@ lookupRegexCommon doc regex position matchFun = return nodes
   f [] = Nothing
   f ((0,0):_) = Nothing
   f ((bsStart, bsLen):groups) =
-    if Document.isRangeValid doc start end
+    if Document.isRangeValid lang doc start end
       then Just node
       else Nothing
     where

diff --git a/Duckling/Types/Document.hs b/Duckling/Types/Document.hs
@@ -6,6 +6,7 @@
 
 {-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE RecordWildCards #-}
 
 module Duckling.Types.Document
@@ -32,6 +33,7 @@ import qualified Data.Text.Encoding as Text
 import qualified Data.Text as Text
 import qualified Data.Text.Internal.Unsafe.Char as UText
 
+import Duckling.Locale (Lang(..))
 
 data Document = Document
   { rawInput :: !Text
@@ -114,20 +116,24 @@ fromText rawInput = Document{..}
 
 -- As regexes are matched without whitespace delimitator, we need to check
 -- the reasonability of the match to actually be a word.
-isRangeValid :: Document -> Int -> Int -> Bool
-isRangeValid doc start end =
-  (start == 0 ||
-      isDifferent (doc ! (start - 1)) (doc ! start)) &&
-  (end == length doc ||
-      isDifferent (doc ! (end - 1)) (doc ! end))
+isRangeValid :: Lang -> Document -> Int -> Int -> Bool
+isRangeValid = \case
+  _ -> defaultIsRangeValid
   where
-    charClass :: Char -> Char
-    charClass c
-      | Char.isLower c || Char.isUpper c = 'c'
-      | Char.isDigit c = 'd'
-      | otherwise = c
-    isDifferent :: Char -> Char -> Bool
-    isDifferent a b = charClass a /= charClass b
+    defaultIsRangeValid :: Document -> Int -> Int -> Bool
+    defaultIsRangeValid doc start end =
+      (start == 0 ||
+        isDifferent (doc ! (start - 1)) (doc ! start)) &&
+      (end == length doc ||
+        isDifferent (doc ! (end - 1)) (doc ! end))
+      where
+        charClass :: Char -> Char
+        charClass c
+          | Char.isLower c || Char.isUpper c = 'c'
+          | Char.isDigit c = 'd'
+          | otherwise = c
+        isDifferent :: Char -> Char -> Bool
+        isDifferent a b = charClass a /= charClass b
 
 -- True iff a is followed by whitespaces and b.
 isAdjacent :: Document -> Int -> Int -> Bool

diff --git a/tests/Duckling/Engine/Tests.hs b/tests/Duckling/Engine/Tests.hs
@@ -17,6 +17,7 @@ import Test.Tasty
 import Test.Tasty.HUnit
 
 import Duckling.Engine
+import Duckling.Locale (Lang(..))
 import Duckling.Types
 import Duckling.Regex.Types
 
@@ -30,15 +31,15 @@ emptyRegexTest :: TestTree
 emptyRegexTest = testCase "Empty Regex Test" $
   case regex "()" of
     Regex regex -> assertEqual "empty result" [] $
-      runDuckling $ lookupRegexAnywhere "hey" regex
+      runDuckling $ lookupRegexAnywhere "hey" EN regex
     _ -> assertFailure "expected a regex"
 
 unicodeAndRegexTest :: TestTree
 unicodeAndRegexTest = testCase "Unicode and Regex Test" $
   case regex "\\$([0-9]*)" of
     Regex regex -> do --
       assertEqual "" expected $
-        runDuckling $ lookupRegexAnywhere "\128526 $35" regex
+        runDuckling $ lookupRegexAnywhere "\128526 $35" EN regex
     _ -> assertFailure "expected a regex"
   where
   expected =