Handle wide characters properly

As it turns out, there is even more to Unicode than we were aware of. Specifically, Unicode has a notion of wide characters and those can span multiple columns in a terminal. If such a character is displayed, we once again end up with a wrongly displayed selection cursor. This change fixes the problem by taking into account the width of characters when calculating selection indexes.
d-e-s-o · May 21, 2023 · e55441c · e55441c
1 parent 7711026
commit e55441c
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 7 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -108,6 +108,9 @@ features = ["fs", "io-util", "macros", "rt"]
 [dependencies.unicode-segmentation]
 version = "1.10"
 
+[dependencies.unicode-width]
+version = "0.1.10"
+
 [dependencies.uuid]
 version = "1.2"
 default-features = false

diff --git a/src/line.rs b/src/line.rs
@@ -2,28 +2,46 @@
 // SPDX-License-Identifier: GPL-3.0-or-later
 
 use std::cmp::min;
+use std::ops::ControlFlow;
 use std::ops::RangeFrom;
 
 use unicode_segmentation::UnicodeSegmentation as _;
+use unicode_width::UnicodeWidthStr as _;
 
 
 /// Find the byte index that maps to the given character position.
 fn byte_index(string: &str, position: usize) -> usize {
   let extended = true;
-  string
-    .grapheme_indices(extended)
-    .map(|(byte_idx, _grapheme)| byte_idx)
-    .nth(position)
-    .unwrap_or(string.len())
+  let result =
+    string
+      .grapheme_indices(extended)
+      .try_fold(0usize, |total_width, (byte_idx, grapheme)| {
+        if total_width >= position {
+          ControlFlow::Break(byte_idx)
+        } else {
+          ControlFlow::Continue(total_width + grapheme.width())
+        }
+      });
+
+  match result {
+    ControlFlow::Break(byte_idx) => byte_idx,
+    ControlFlow::Continue(_) => string.len(),
+  }
 }
 
 /// Find the character index that maps to the given byte position.
 fn char_index(string: &str, byte_position: usize) -> usize {
   let extended = true;
   string
     .grapheme_indices(extended)
-    .take_while(|(idx, grapheme)| byte_position >= idx + grapheme.len())
-    .count()
+    .map_while(|(idx, grapheme)| {
+      if byte_position >= idx + grapheme.len() {
+        Some(grapheme.width())
+      } else {
+        None
+      }
+    })
+    .sum()
 }
 
 
@@ -34,6 +52,11 @@ fn char_index(string: &str, byte_position: usize) -> usize {
 /// it's a grapheme cluster in Unicode speak. All indexes, unless
 /// explicitly denoted otherwise, are relative to these characters and
 /// not to bytes.
+///
+/// Please note that at the moment, selections take into account
+/// character width. That is arguably more of a property pertaining the
+/// specific output in use, and so we are effectively specific to
+/// terminal based use cases at the moment.
 #[derive(Clone, Debug, Default, Eq, PartialEq)]
 pub struct Line {
   /// The string representing the line.
@@ -184,6 +207,15 @@ mod tests {
     assert_eq!(byte_index(s, 5), 10);
     assert_eq!(byte_index(s, 6), 16);
     assert_eq!(byte_index(s, 7), 16);
+
+    let s = "a｜b";
+    assert_eq!(byte_index(s, 0), 0);
+    assert_eq!(byte_index(s, 1), 1);
+    assert_eq!(byte_index(s, 2), 4);
+    assert_eq!(byte_index(s, 3), 4);
+    assert_eq!(byte_index(s, 4), 5);
+    assert_eq!(byte_index(s, 5), 5);
+    assert_eq!(byte_index(s, 6), 5);
   }
 
   /// Check that our "character" indexing works as it should.
@@ -210,6 +242,15 @@ mod tests {
     assert_eq!(char_index(s, 1), 0);
     assert_eq!(char_index(s, 6), 1);
     assert_eq!(char_index(s, 7), 2);
+
+    let s = "a｜b";
+    assert_eq!(char_index(s, 0), 0);
+    assert_eq!(char_index(s, 1), 1);
+    assert_eq!(char_index(s, 2), 1);
+    assert_eq!(char_index(s, 3), 1);
+    assert_eq!(char_index(s, 4), 3);
+    assert_eq!(char_index(s, 5), 4);
+    assert_eq!(char_index(s, 6), 4);
   }
 
   /// Check that `Line::substr` behaves as it should.