From 1d6a7c6e7e6aadc527346aa822f19d8587f714f3 Mon Sep 17 00:00:00 2001 From: tormol Date: Fri, 30 Sep 2016 13:47:47 +0200 Subject: [PATCH] fix(UsageParser): Handle non-ascii names / options. This should also speeds up the parser (except maybe for short options). Multi-codepoint characters still can't be used as short options, but people shouldn't non-ASCII options nyway. Closes #664 --- src/usage_parser.rs | 116 +++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 49 deletions(-) diff --git a/src/usage_parser.rs b/src/usage_parser.rs index b8344265e90..e33ea0087c0 100644 --- a/src/usage_parser.rs +++ b/src/usage_parser.rs @@ -52,14 +52,12 @@ impl<'a> UsageParser<'a> { loop { debugln!("iter; pos={};", self.pos); self.stop_at(token); - if self.pos < self.usage.len() { - if let Some(c) = self.usage.chars().nth(self.pos) { - match c { - '-' => self.short_or_long(&mut arg), - '.' => self.multiple(&mut arg), - '\'' => self.help(&mut arg), - _ => self.name(&mut arg), - } + if let Some(&c) = self.usage.as_bytes().get(self.pos) { + match c { + b'-' => self.short_or_long(&mut arg), + b'.' => self.multiple(&mut arg), + b'\'' => self.help(&mut arg), + _ => self.name(&mut arg), } } else { break; @@ -82,7 +80,7 @@ impl<'a> UsageParser<'a> { fn name(&mut self, arg: &mut Arg<'a, 'a>) { debugln!("fn=name;"); - if self.usage.chars().nth(self.pos).expect(INTERNAL_ERROR_MSG) == '<' && + if *self.usage.as_bytes().get(self.pos).expect(INTERNAL_ERROR_MSG) == b'<' && !self.explicit_name_set { arg.setb(ArgSettings::Required); } @@ -113,23 +111,17 @@ impl<'a> UsageParser<'a> { } fn stop_at(&mut self, f: F) - where F: Fn(u32) -> bool + where F: Fn(u8) -> bool { debugln!("fn=stop_at;"); self.start = self.pos; - for c in self.usage[self.start..].chars() { - if f(c as u32) { - self.pos += 1; - continue; - } - break; - } + self.pos += self.usage[self.start..].bytes().take_while(|&b| f(b) ).count(); } fn short_or_long(&mut self, arg: &mut Arg<'a, 'a>) { debugln!("fn=short_or_long;"); self.pos += 1; - if self.usage.chars().nth(self.pos).expect(INTERNAL_ERROR_MSG) == '-' { + if *self.usage.as_bytes().get(self.pos).expect(INTERNAL_ERROR_MSG) == b'-' { self.pos += 1; self.long(arg); return; @@ -141,7 +133,7 @@ impl<'a> UsageParser<'a> { debugln!("fn=long;"); self.stop_at(long_end); let name = &self.usage[self.start..self.pos]; - if arg.name.is_empty() || (self.prev == UsageToken::Short && arg.name.len() == 1) { + if !self.explicit_name_set { debugln!("setting name: {}", name); arg.name = name; } @@ -152,41 +144,41 @@ impl<'a> UsageParser<'a> { fn short(&mut self, arg: &mut Arg<'a, 'a>) { debugln!("fn=short;"); - let name = &self.usage[self.pos..self.pos + 1]; - debugln!("setting short: {}", name); - arg.short = Some(name.chars().nth(0).expect(INTERNAL_ERROR_MSG)); + let start = &self.usage[self.pos..]; + let short = start.chars().nth(0).expect(INTERNAL_ERROR_MSG); + debugln!("setting short: {}", short); + arg.short = Some(short); if arg.name.is_empty() { + // --long takes precedence but doesn't set self.explicit_name_set + let name = &start[..short.len_utf8()]; debugln!("setting name: {}", name); arg.name = name; } self.prev = UsageToken::Short; } + // "something..." fn multiple(&mut self, arg: &mut Arg) { debugln!("fn=multiple;"); let mut dot_counter = 1; let start = self.pos; - for c in self.usage[start..].chars() { - match c { - '.' => { - dot_counter += 1; - self.pos += 1; - if dot_counter == 3 { - debugln!("setting multiple"); - arg.setb(ArgSettings::Multiple); - if arg.settings.is_set(ArgSettings::TakesValue) { - arg.setb(ArgSettings::UseValueDelimiter); - arg.unsetb(ArgSettings::ValueDelimiterNotSet); - if arg.val_delim.is_none() { - arg.val_delim = Some(','); - } - } - self.prev = UsageToken::Multiple; - self.pos += 1; - break; + let mut bytes = self.usage[start..].bytes(); + while bytes.next() == Some(b'.') { + dot_counter += 1; + self.pos += 1; + if dot_counter == 3 { + debugln!("setting multiple"); + arg.setb(ArgSettings::Multiple); + if arg.settings.is_set(ArgSettings::TakesValue) { + arg.setb(ArgSettings::UseValueDelimiter); + arg.unsetb(ArgSettings::ValueDelimiterNotSet); + if arg.val_delim.is_none() { + arg.val_delim = Some(','); } } - _ => break, + self.prev = UsageToken::Multiple; + self.pos += 1; + break; } } } @@ -204,26 +196,26 @@ impl<'a> UsageParser<'a> { } #[inline] -fn name_end(b: u32) -> bool { +fn name_end(b: u8) -> bool { // 93(]), 62(>) - b > b']' as u32 || b < b'>' as u32 || (b > b'>' as u32 && b < b']' as u32) + b > b']' || b < b'>' || (b > b'>' && b < b']') } #[inline] -fn token(b: u32) -> bool { +fn token(b: u8) -> bool { // 39('), 45(-), 46(.), 60(<), 91([) - b < 39 || b > 91 || (b > 46 && b < 91 && b != b'<' as u32) || (b > 39 && b < 45) + b < 39 || b > 91 || (b > 46 && b < 91 && b != b'<') || (b > 39 && b < 45) } #[inline] -fn long_end(b: u32) -> bool { - // 39('), 46(.), 60(<), 61(=), 91([) - (b < 39 && (b > 13 && b != b' ' as u32)) || b > 91 || (b > 61 && b < 91) || +fn long_end(b: u8) -> bool { + // 39('), 46(.), 60(<), 61(=), 91([), 32( ) + (b < 39 && (b > 13 && b != b' ')) || b > 91 || (b > 61 && b < 91) || (b > 39 && b < 60 && b != 46) } #[inline] -fn help_start(b: u32) -> bool { +fn help_start(b: u8) -> bool { // 39(') b < 39 || b > 39 } @@ -1220,4 +1212,30 @@ mod test { assert!(c.val_names.is_none()); assert!(c.num_vals.is_none()); } + + #[test] + fn nonascii() { + let a = Arg::from_usage(" 'üñíčöĐ€'"); + assert_eq!(a.name, "ASCII"); + assert_eq!(a.help, Some("üñíčöĐ€")); + let a = Arg::from_usage("<üñíčöĐ€> 'ASCII'"); + assert_eq!(a.name, "üñíčöĐ€"); + assert_eq!(a.help, Some("ASCII")); + let a = Arg::from_usage("<üñíčöĐ€> 'üñíčöĐ€'"); + assert_eq!(a.name, "üñíčöĐ€"); + assert_eq!(a.help, Some("üñíčöĐ€")); + let a = Arg::from_usage("-ø 'ø'"); + assert_eq!(a.name, "ø"); + assert_eq!(a.short, Some('ø')); + assert_eq!(a.help, Some("ø")); + let a = Arg::from_usage("--üñíčöĐ€ 'Nōṫ ASCII'"); + assert_eq!(a.name, "üñíčöĐ€"); + assert_eq!(a.long, Some("üñíčöĐ€")); + assert_eq!(a.help, Some("Nōṫ ASCII")); + let a = Arg::from_usage("[ñämê] --ôpt=[üñíčöĐ€] 'hælp'"); + assert_eq!(a.name, "ñämê"); + assert_eq!(a.long, Some("ôpt")); + assert_eq!(a.val_names.unwrap().values().collect::>(), [&"üñíčöĐ€"]); + assert_eq!(a.help, Some("hælp")); + } }