diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/PresentationFramework.csproj b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/PresentationFramework.csproj index 2336c88b662..4398e142e22 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/PresentationFramework.csproj +++ b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/PresentationFramework.csproj @@ -1085,6 +1085,7 @@ + diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/NLGSpellerInterop.cs b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/NLGSpellerInterop.cs index 2b4ac76b2f6..87b26d83ec3 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/NLGSpellerInterop.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/NLGSpellerInterop.cs @@ -643,6 +643,12 @@ private void EnumerateSubSegments() #region SpellerInteropBase.ISpellerSegment + /// + public string SourceString { get; } + + /// + public string Text => SourceString?.Substring(TextRange.Start, TextRange.Length); + /// /// Returns a read-only list of sub-segments of this segment /// diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/SpellerInteropBase.cs b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/SpellerInteropBase.cs index 56f926de50c..7016a856d4f 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/SpellerInteropBase.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/SpellerInteropBase.cs @@ -35,6 +35,11 @@ internal interface ITextRange /// internal interface ISpellerSegment { + /// + /// Source String for which provides a position + /// + string SourceString { get; } + /// /// Identifies sub-words, if any. 
/// @@ -45,6 +50,11 @@ internal interface ISpellerSegment /// ITextRange TextRange { get; } + /// + /// Text represented by + /// + string Text { get; } + /// /// Queries the spell-checker to obtain suggestions for this segment /// diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInterop.cs b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInterop.cs index 0f1ff2d0e34..460aa317e4d 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInterop.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInterop.cs @@ -852,9 +852,9 @@ private SpellChecker CurrentSpellChecker #endregion Private Fields - #region Private Types + #region Internal Types - private struct TextRange: SpellerInteropBase.ITextRange + internal readonly struct TextRange: SpellerInteropBase.ITextRange { public TextRange(MS.Internal.WindowsRuntime.Windows.Data.Text.TextSegment textSegment) { @@ -862,6 +862,17 @@ public TextRange(MS.Internal.WindowsRuntime.Windows.Data.Text.TextSegment textSe _start = (int)textSegment.StartPosition; } + public TextRange(int start, int length) + { + _start = start; + _length = length; + } + + public TextRange(ITextRange textRange) : + this(textRange.Start, textRange.Length) + { + } + public static explicit operator TextRange(MS.Internal.WindowsRuntime.Windows.Data.Text.TextSegment textSegment) { return new TextRange(textSegment); @@ -886,16 +897,18 @@ public int Length } [DebuggerDisplay("SubSegments.Count = {SubSegments.Count} TextRange = {TextRange.Start},{TextRange.Length}")] - private class SpellerSegment: ISpellerSegment + internal class SpellerSegment: ISpellerSegment { #region Constructor - public SpellerSegment(WordSegment segment, SpellChecker spellChecker, WinRTSpellerInterop owner) + public SpellerSegment(string sourceString, ITextRange textRange, SpellChecker spellChecker, 
WinRTSpellerInterop owner) { - _segment = segment; _spellChecker = spellChecker; _suggestions = null; - _owner = owner; + Owner = owner; + + SourceString = sourceString; + TextRange = textRange; } static SpellerSegment() @@ -920,9 +933,9 @@ private void EnumerateSuggestions() List spellingErrors = null; - using (new SpellerCOMActionTraceLogger(_owner, SpellerCOMActionTraceLogger.Actions.ComprehensiveCheck)) + using (new SpellerCOMActionTraceLogger(Owner, SpellerCOMActionTraceLogger.Actions.ComprehensiveCheck)) { - spellingErrors = _spellChecker.ComprehensiveCheck(_segment.Text); + spellingErrors = Text != null ? _spellChecker.ComprehensiveCheck(Text) : null; } if (spellingErrors == null) @@ -947,6 +960,16 @@ private void EnumerateSuggestions() #region SpellerInteropBase.ISpellerSegment + /// + /// + /// + public string SourceString { get; } + + /// + /// + /// + public string Text => SourceString?.Substring(TextRange.Start, TextRange.Length); + /// /// Returns a read-only list of sub-segments of this segment /// WinRT word-segmenter doesn't really support sub-segments, @@ -960,13 +983,7 @@ public IReadOnlyList SubSegments } } - public ITextRange TextRange - { - get - { - return new TextRange(_segment.SourceTextSegment); - } - } + public ITextRange TextRange { get; } public IReadOnlyList Suggestions { @@ -994,6 +1011,13 @@ public bool IsClean } } + /// + /// This field is used only to support TraceLogging telemetry + /// logged using . It + /// has no other functional use. 
+ /// + internal WinRTSpellerInterop Owner { get; } + public void EnumSubSegments(EnumTextSegmentsCallback segmentCallback, object data) { bool result = true; @@ -1008,7 +1032,6 @@ public void EnumSubSegments(EnumTextSegmentsCallback segmentCallback, object dat #region Private Fields - private WordSegment _segment; SpellChecker _spellChecker; private IReadOnlyList _suggestions; @@ -1016,16 +1039,13 @@ public void EnumSubSegments(EnumTextSegmentsCallback segmentCallback, object dat private static readonly IReadOnlyList _empty; - /// - /// This field is used only to support TraceLogging telemetry - /// logged using . It - /// has no other functional use. - /// - private WinRTSpellerInterop _owner; - #endregion Private Fields } + #endregion Internal Types + + #region Private Types + [DebuggerDisplay("Sentence = {_sentence}")] private class SpellerSentence: ISpellerSentence { @@ -1046,14 +1066,7 @@ public IReadOnlyList Segments { if (_segments == null) { - List segments = new List(); - - foreach (var wordSegment in _wordBreaker.GetTokens(_sentence)) - { - segments.Add(new SpellerSegment(wordSegment, _spellChecker, _owner)); - } - - _segments = segments.AsReadOnly(); + _segments = _wordBreaker.ComprehensiveGetTokens(_sentence, _spellChecker, _owner); } return _segments; diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInteropExtensions.cs b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInteropExtensions.cs new file mode 100644 index 00000000000..37ea34c0d86 --- /dev/null +++ b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInteropExtensions.cs @@ -0,0 +1,191 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
using MS.Internal.WindowsRuntime.Windows.Data.Text;
using System.Collections.Generic;

using System.Windows.Documents.MsSpellCheckLib;
using System.Windows.Documents.Tracing;
using static System.Windows.Documents.WinRTSpellerInterop;

namespace System.Windows.Documents
{
    /// <summary>
    /// Extension helpers that post-process the tokens produced by <see cref="WordsSegmenter"/>
    /// so that they are better suited for spell-checking.
    /// </summary>
    internal static class WinRTSpellerInteropExtensions
    {
        /// <summary>
        /// Tokenizes <paramref name="text"/> using <c>WordsSegmenter.GetTokens</c>, and then fixes up
        /// the tokens to account for any missed text "in-between" those tokens.
        /// </summary>
        /// <param name="segmenter">Word-breaker instance</param>
        /// <param name="text">The text being tokenized</param>
        /// <param name="spellChecker">The spell-checker instance used to augment the tokenizing process</param>
        /// <param name="owner">Calling <see cref="WinRTSpellerInterop"/> instance</param>
        /// <returns>
        /// A read-only list with one segment per token reported by the word-breaker, in the same order.
        /// A segment's range is extended into the text that follows it when doing so makes it
        /// spellcheck-clean. An empty list is returned when the word-breaker is null or yields no tokens.
        /// </returns>
        /// <remarks>
        /// Windows.Data.Text.WordsSegmenter tends to drop punctuation characters like period ('.')
        /// when tokenizing text. Though this behavior is compatible with a vast majority of text-processing
        /// scenarios (like word-counting), it is not ideal for spell-checking.
        ///
        /// In this method, the following augmented heuristic is applied to update the token-list generated by
        /// <see cref="WordsSegmenter"/>.
        ///
        ///  - Identify if any text 'missingFragment' has been dropped by the <see cref="WordsSegmenter"/>
        ///  - If the token immediately preceding 'missingFragment', previousToken, has a spelling error, then attempt to
        ///    create new candidate tokens in the following order:
        ///
        ///         previousToken + missingFragment[0..0]
        ///         previousToken + missingFragment[0..1]
        ///         previousToken + missingFragment[0..2]
        ///         ...
        ///         ...
        ///         previousToken + missingFragment[0..LEN-1], where LEN = LEN(missingFragment)
        ///
        ///  - Select the first candidate token that is free of spelling errors, and replace 'previousToken' with it.
        ///  - For performance reasons, we choose a constant MAXLEN = 4 such that when LEN > MAXLEN, only MAXLEN
        ///    tokens are considered.
        ///      - MAXLEN = 4 is a somewhat arbitrary choice, though it seems more than sufficient to address common
        ///        problems this heuristic is intended to help with.
        ///
        ///      - Typical word-breaking problems that have been observed empirically involve only one missed character,
        ///        for which MAXLEN=1 would be sufficient. MAXLEN=4 is chosen as a sufficiently-large tradeoff between
        ///        correctness and performance.
        ///
        ///      - Also see https://github.com/dotnet/wpf/pull/2753#issuecomment-602120768 for a discussion related to this.
        /// </remarks>
        public static IReadOnlyList<SpellerSegment> ComprehensiveGetTokens(
            this WordsSegmenter segmenter,
            string text,
            SpellChecker spellChecker,
            WinRTSpellerInterop owner)
        {
            IReadOnlyList<WordSegment> tokens = segmenter?.GetTokens(text) ?? Array.Empty<WordSegment>();
            if (tokens.Count == 0)
            {
                return Array.Empty<SpellerSegment>();
            }

            // Exactly one segment is emitted per token, so the final size is known up-front.
            var allTokens = new List<SpellerSegment>(tokens.Count);
            int predictedNextTokenStartPosition = 0;

            foreach (WordSegment token in tokens)
            {
                int tokenStartPosition = (int)token.SourceTextSegment.StartPosition;
                int tokenLength = (int)token.SourceTextSegment.Length;

                if (tokenStartPosition > predictedNextTokenStartPosition)
                {
                    // There is a "gap" between the last recorded token and the current token.
                    // Give the previous token a chance to absorb the leading part of that gap.
                    TryExtendLastToken(
                        allTokens,
                        text,
                        predictedNextTokenStartPosition,
                        tokenStartPosition - predictedNextTokenStartPosition,
                        spellChecker,
                        owner);
                }

                allTokens.Add(
                    new SpellerSegment(
                        text,
                        new WinRTSpellerInterop.TextRange(tokenStartPosition, tokenLength),
                        spellChecker,
                        owner));
                predictedNextTokenStartPosition = tokenStartPosition + tokenLength;
            }

            if (predictedNextTokenStartPosition < text.Length)
            {
                // There is text after the final token that the word-breaker did not report.
                // Whether the final token actually has a spelling error is decided inside
                // GetSpellCheckCleanSubstitutionToken, so it is not checked a second time here.
                TryExtendLastToken(
                    allTokens,
                    text,
                    predictedNextTokenStartPosition,
                    text.Length - predictedNextTokenStartPosition,
                    spellChecker,
                    owner);
            }

            return allTokens.AsReadOnly();
        }

        /// <summary>
        /// Replaces the last entry of <paramref name="allTokens"/> with a longer, spellcheck-clean
        /// segment when the untokenized fragment that follows it allows one to be formed.
        /// </summary>
        /// <param name="allTokens">Segments collected so far; the last entry may be replaced in place</param>
        /// <param name="text">The text being tokenized</param>
        /// <param name="fragmentStart">Start position, within <paramref name="text"/>, of the untokenized fragment</param>
        /// <param name="fragmentLength">Length of the untokenized fragment</param>
        /// <param name="spellChecker">Spell-checker used to validate candidate tokens</param>
        /// <param name="owner">Calling <see cref="WinRTSpellerInterop"/> instance</param>
        private static void TryExtendLastToken(
            List<SpellerSegment> allTokens,
            string text,
            int fragmentStart,
            int fragmentLength,
            SpellChecker spellChecker,
            WinRTSpellerInterop owner)
        {
            // Without a spell-checker no candidate can be validated, and without a
            // preceding token there is nothing to extend.
            if (spellChecker == null || allTokens.Count == 0)
            {
                return;
            }

            var missingFragment =
                new SpellerSegment(
                    text,
                    new WinRTSpellerInterop.TextRange(fragmentStart, fragmentLength),
                    spellChecker,
                    owner);

            int lastIndex = allTokens.Count - 1;
            WinRTSpellerInterop.TextRange? substToken =
                GetSpellCheckCleanSubstitutionToken(spellChecker, text, allTokens[lastIndex], missingFragment);

            if (substToken != null)
            {
                allTokens[lastIndex] = new SpellerSegment(text, substToken.Value, spellChecker, owner);
            }
        }

        /// <summary>
        /// Checks through combinations of <paramref name="lastToken"/> + prefixes(<paramref name="missingFragment"/>) and
        /// returns the first spellcheck-clean result.
        /// </summary>
        /// <param name="spellChecker">Spell-checker</param>
        /// <param name="documentText">Overall document text within which the text-ranges are computed</param>
        /// <param name="lastToken">Previous token immediately preceding <paramref name="missingFragment"/></param>
        /// <param name="missingFragment">The missing-fragment identified immediately after <paramref name="lastToken"/></param>
        /// <returns>
        /// The range of the first spellcheck-clean candidate, or null when <paramref name="lastToken"/>
        /// is already clean, when any input is null/blank, or when no candidate is clean.
        /// </returns>
        /// <remarks>
        /// See note about MAXLEN in <see cref="ComprehensiveGetTokens"/>
        /// which explains the rationale behind the value of the constant AlternateFormsMaximumCount.
        /// </remarks>
        private static WinRTSpellerInterop.TextRange? GetSpellCheckCleanSubstitutionToken(
            SpellChecker spellChecker,
            string documentText,
            SpellerSegment lastToken,
            SpellerSegment missingFragment)
        {
            const int AlternateFormsMaximumCount = 4;

            // SpellerSegment.Text allocates a new substring on every access, so read it once.
            string lastTokenText = lastToken?.Text;

            if (spellChecker == null ||
                string.IsNullOrWhiteSpace(missingFragment?.Text) ||
                string.IsNullOrWhiteSpace(lastTokenText) ||
                string.IsNullOrWhiteSpace(documentText))
            {
                return null;
            }

            // A token that is already clean needs no fix-up. A null result from the
            // spell-checker is not treated as "clean", so the candidates are still tried.
            if (spellChecker.ComprehensiveCheck(lastTokenText)?.Count == 0)
            {
                return null;
            }

            int tokenStart = lastToken.TextRange.Start;
            int tokenLength = lastToken.TextRange.Length;

            // Never look further than MAXLEN characters, the size of the fragment, or the
            // end of the document - whichever comes first - so that Substring cannot overrun.
            int altFormsCount = Math.Min(
                Math.Min(missingFragment.TextRange.Length, AlternateFormsMaximumCount),
                documentText.Length - (tokenStart + tokenLength));

            // One of the prefixes of the missingFragment - when concatenated with 'lastToken' - could be a viable
            // replacement for 'lastToken'
            int lastCheckedLength = -1;
            for (int i = 1; i <= altFormsCount; i++)
            {
                string altForm = documentText.Substring(tokenStart, tokenLength + i).TrimEnd();

                // Trailing whitespace is trimmed, so consecutive candidates can collapse to the
                // same string; do not ask the spell-checker about the same text twice.
                if (altForm.Length == lastCheckedLength)
                {
                    continue;
                }

                lastCheckedLength = altForm.Length;

                if (spellChecker.ComprehensiveCheck(altForm)?.Count == 0)
                {
                    // Use this altForm in place of lastToken
                    return new WinRTSpellerInterop.TextRange(tokenStart, altForm.Length);
                }
            }

            return null;
        }
    }
}