Skip to content

Commit

Permalink
Handle wide characters properly
Browse files Browse the repository at this point in the history
As it turns out, there is even more to Unicode than we were aware of.
Specifically, Unicode has a notion of wide characters and those can span
multiple columns in a terminal. If such a character is displayed, we
once again end up with a wrongly displayed selection cursor.
This change fixes the problem by taking into account the width of
characters when calculating selection indexes.
  • Loading branch information
d-e-s-o committed May 21, 2023
1 parent 7711026 commit e55441c
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 7 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ features = ["fs", "io-util", "macros", "rt"]
[dependencies.unicode-segmentation]
version = "1.10"

[dependencies.unicode-width]
version = "0.1.10"

[dependencies.uuid]
version = "1.2"
default-features = false
Expand Down
55 changes: 48 additions & 7 deletions src/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,46 @@
// SPDX-License-Identifier: GPL-3.0-or-later

use std::cmp::min;
use std::ops::ControlFlow;
use std::ops::RangeFrom;

use unicode_segmentation::UnicodeSegmentation as _;
use unicode_width::UnicodeWidthStr as _;


/// Find the byte index that maps to the given character position.
fn byte_index(string: &str, position: usize) -> usize {
let extended = true;
string
.grapheme_indices(extended)
.map(|(byte_idx, _grapheme)| byte_idx)
.nth(position)
.unwrap_or(string.len())
let result =
string
.grapheme_indices(extended)
.try_fold(0usize, |total_width, (byte_idx, grapheme)| {
if total_width >= position {
ControlFlow::Break(byte_idx)
} else {
ControlFlow::Continue(total_width + grapheme.width())
}
});

match result {
ControlFlow::Break(byte_idx) => byte_idx,
ControlFlow::Continue(_) => string.len(),
}
}

/// Find the character index that maps to the given byte position.
fn char_index(string: &str, byte_position: usize) -> usize {
let extended = true;
string
.grapheme_indices(extended)
.take_while(|(idx, grapheme)| byte_position >= idx + grapheme.len())
.count()
.map_while(|(idx, grapheme)| {
if byte_position >= idx + grapheme.len() {
Some(grapheme.width())
} else {
None
}
})
.sum()
}


Expand All @@ -34,6 +52,11 @@ fn char_index(string: &str, byte_position: usize) -> usize {
/// it's a grapheme cluster in Unicode speak. All indexes, unless
/// explicitly denoted otherwise, are relative to these characters and
/// not to bytes.
///
/// Please note that at the moment, selections take into account
/// character width. That is arguably more of a property pertaining the
/// specific output in use, and so we are effectively specific to
/// terminal based use cases at the moment.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Line {
/// The string representing the line.
Expand Down Expand Up @@ -184,6 +207,15 @@ mod tests {
assert_eq!(byte_index(s, 5), 10);
assert_eq!(byte_index(s, 6), 16);
assert_eq!(byte_index(s, 7), 16);

let s = "a|b";
assert_eq!(byte_index(s, 0), 0);
assert_eq!(byte_index(s, 1), 1);
assert_eq!(byte_index(s, 2), 4);
assert_eq!(byte_index(s, 3), 4);
assert_eq!(byte_index(s, 4), 5);
assert_eq!(byte_index(s, 5), 5);
assert_eq!(byte_index(s, 6), 5);
}

/// Check that our "character" indexing works as it should.
Expand All @@ -210,6 +242,15 @@ mod tests {
assert_eq!(char_index(s, 1), 0);
assert_eq!(char_index(s, 6), 1);
assert_eq!(char_index(s, 7), 2);

let s = "a|b";
assert_eq!(char_index(s, 0), 0);
assert_eq!(char_index(s, 1), 1);
assert_eq!(char_index(s, 2), 1);
assert_eq!(char_index(s, 3), 1);
assert_eq!(char_index(s, 4), 3);
assert_eq!(char_index(s, 5), 4);
assert_eq!(char_index(s, 6), 4);
}

/// Check that `Line::substr` behaves as it should.
Expand Down

0 comments on commit e55441c

Please sign in to comment.