Skip to content

Commit

Permalink
Make UTF-8 decoding errors in .cabal files non-fatal
Browse files Browse the repository at this point in the history
Previously we checked for invalid UTF-8 in the first phase of the
parser, which splitting the file up into nested sections and fields.
This meant the whole parser falls over if there is invalid UTF-8
anywhere in the file. Sadly there are already packages on hackage
with invalid UTF-8 so we would fail when parsing the hackage index.
The solution is to move the check into the parsing of the individual
fields and making it a warning not an error. We most typically get
invalid UTF-8 in free text fields like author name, copyright,
description etc so this should work out ok usually.
We now get pretty decent error messages, like:
  Warning: hsx.cabal:5: Invalid UTF-8 text in the 'author' field.
The warning type is now structured so that hackage will be able to
distinguish general non-fatal warnings from UTF-8 decoding problems
which really should be fatal errors for package uploads.
  • Loading branch information
dcoutts committed Mar 27, 2008
1 parent 507ed73 commit c4f0e7c
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 22 deletions.
10 changes: 8 additions & 2 deletions Distribution/PackageDescription/Parse.hs
Expand Up @@ -298,8 +298,14 @@ readAndParseFile readFile' parser verbosity fpath = do
ParseFailed e -> do
let (line, message) = locatedErrorMsg e
dieWithLocation fpath line message
ParseOk ws x -> do
mapM_ (warn verbosity) (reverse ws)
ParseOk warnings x -> do
mapM_ (warn verbosity)
[ case w of
PWarning msg -> msg
UTFWarning line fname ->
fpath ++ ":" ++ show line
++ ": Invalid UTF-8 text in the '" ++ fname ++ "' field."
| w <- reverse warnings ]
return x

readHookedBuildInfo :: Verbosity -> FilePath -> IO HookedBuildInfo
Expand Down
39 changes: 19 additions & 20 deletions Distribution/ParseUtils.hs
Expand Up @@ -44,7 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -}

-- #hide
module Distribution.ParseUtils (
LineNo, PError(..), PWarning, locatedErrorMsg, syntaxError, warning,
LineNo, PError(..), PWarning(..), locatedErrorMsg, syntaxError, warning,
runP, runE, ParseResult(..), catchParseError, parseFail,
Field(..), fName, lineNo,
FieldDescr(..), ppField, ppFields, readFields,
Expand Down Expand Up @@ -83,11 +83,12 @@ type LineNo = Int
data PError = AmbigousParse String LineNo
| NoParse String LineNo
| TabsError LineNo
| UTFError LineNo
| FromString String (Maybe LineNo)
deriving Show

type PWarning = String
data PWarning = PWarning String
| UTFWarning LineNo String
deriving Show

data ParseResult a = ParseFailed PError | ParseOk [PWarning] a
deriving Show
Expand All @@ -111,9 +112,11 @@ parseFail = ParseFailed
runP :: LineNo -> String -> ReadP a a -> String -> ParseResult a
runP line fieldname p s =
case [ x | (x,"") <- results ] of
[a] -> ParseOk [] a
[a] -> ParseOk (utf8Warnings line fieldname s) a
--TODO: what is this double parse thing all about?
-- Can't we just do the all isSpace test the first time?
[] -> case [ x | (x,ys) <- results, all isSpace ys ] of
[a] -> ParseOk [] a
[a] -> ParseOk (utf8Warnings line fieldname s) a
[] -> ParseFailed (NoParse fieldname line)
_ -> ParseFailed (AmbigousParse fieldname line)
_ -> ParseFailed (AmbigousParse fieldname line)
Expand All @@ -122,14 +125,19 @@ runP line fieldname p s =
runE :: LineNo -> String -> ReadE a -> String -> ParseResult a
runE line fieldname p s =
case runReadE p s of
Right a -> ParseOk [] a
Right a -> ParseOk (utf8Warnings line fieldname s) a
Left e -> syntaxError line ("Parse of field '"++fieldname++"' failed ("++e++"): " )

utf8Warnings :: LineNo -> String -> String -> [PWarning]
utf8Warnings line fieldname s =
take 1 [ UTFWarning n fieldname
| (n,l) <- zip [line..] (lines s)
, '\xfffd' `elem` l ]

locatedErrorMsg :: PError -> (Maybe LineNo, String)
locatedErrorMsg (AmbigousParse f n) = (Just n, "Ambiguous parse in field '"++f++"'")
locatedErrorMsg (NoParse f n) = (Just n, "Parse of field '"++f++"' failed: ")
locatedErrorMsg (TabsError n) = (Just n, "Tab used as indentation.")
locatedErrorMsg (UTFError n) = (Just n, "Invalid UTF-8 text.")
locatedErrorMsg (FromString s n) = (n, s)

syntaxError :: LineNo -> String -> ParseResult a
Expand All @@ -138,11 +146,8 @@ syntaxError n s = ParseFailed $ FromString s (Just n)
tabsError :: LineNo -> ParseResult a
tabsError ln = ParseFailed $ TabsError ln

utf8Error :: LineNo -> ParseResult a
utf8Error ln = ParseFailed $ UTFError ln

warning :: String -> ParseResult ()
warning s = ParseOk [s] ()
warning s = ParseOk [PWarning s] ()

-- | Field descriptor. The parameter @a@ parameterizes over where the field's
-- value is stored in.
Expand Down Expand Up @@ -267,15 +272,9 @@ fName (Section _ n _ _) = n
fName _ = error "fname: not a field or section"

readFields :: String -> ParseResult [Field]
readFields input =
case [ n | (n,l) <- zip [1..] ls
, '\xfffd' `elem` l ] of

(n:_) -> utf8Error n

[] -> ifelse
=<< mapM (mkField 0)
=<< mkTree tokens
readFields input = ifelse
=<< mapM (mkField 0)
=<< mkTree tokens

where ls = (lines . normaliseLineEndings) input
tokens = (concatMap tokeniseLine . trimLines) ls
Expand Down

0 comments on commit c4f0e7c

Please sign in to comment.