dotnet · vatsan-madhavan · Mar 12, 2020 · Apr 10, 2020 · Apr 13, 2020 · Apr 13, 2020
diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/PresentationFramework.csproj b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/PresentationFramework.csproj
@@ -1086,6 +1086,7 @@
     <Compile Include="System\Windows\Documents\ValidationHelper.cs" />
     <Compile Include="System\Windows\Documents\WinEventHandler.cs" />
     <Compile Include="System\Windows\Documents\WinRTSpellerInterop.cs" />
+    <Compile Include="System\Windows\Documents\WinRTSpellerInteropExtensions.cs" />
     <Compile Include="System\Windows\Documents\WpfPayload.cs" />
     <Compile Include="System\Windows\Documents\XamlAttribute.cs" />
     <Compile Include="System\Windows\Documents\XamlRtfConverter.cs" />

diff --git a/...rosoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/NLGSpellerInterop.cs b/...rosoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/NLGSpellerInterop.cs
@@ -643,6 +643,12 @@ private void EnumerateSubSegments()
 
             #region SpellerInteropBase.ISpellerSegment
 
+            /// <inheritdoc/>
+            public string SourceString { get; }
+
+            /// <inheritdoc/>
+            public string Text => SourceString?.Substring(TextRange.Start, TextRange.Length);
+
             /// <summary>
             /// Returns a read-only list of sub-segments of this segment
             /// </summary>

diff --git a/...osoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/SpellerInteropBase.cs b/...osoft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/SpellerInteropBase.cs
@@ -35,6 +35,11 @@ internal interface ITextRange
         /// </summary>
         internal interface ISpellerSegment
         {
+            /// <summary>
+            /// Source String for which <see cref="TextRange"/> provides a position
+            /// </summary>
+            string SourceString { get; }
+
             /// <summary>
             /// Identifies sub-words, if any. 
             /// </summary>
@@ -45,6 +50,11 @@ internal interface ISpellerSegment
             /// </summary>
             ITextRange TextRange { get; }
 
+            /// <summary>
+            /// Text represented by <see cref="TextRange"/>
+            /// </summary>
+            string Text { get; }
+
             /// <summary>
             /// Queries the spell-checker to obtain suggestions for this segment
             /// </summary>

diff --git a/...soft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInterop.cs b/...soft.DotNet.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInterop.cs
@@ -852,14 +852,24 @@ private SpellChecker CurrentSpellChecker
 
         #endregion Private Fields
 
-        #region Private Types
+        #region Internal Types
 
-        private struct TextRange: SpellerInteropBase.ITextRange
+        internal readonly struct TextRange: SpellerInteropBase.ITextRange
         {
             public TextRange(MS.Internal.WindowsRuntime.Windows.Data.Text.TextSegment textSegment)
+                :this((int) textSegment.StartPosition, (int) textSegment.Length)
+            {
+            }
+
+            public TextRange(int start, int length)
+            {
+                _start = start;
+                _length = length;
+            }
+
+            public TextRange(ITextRange textRange) : 
+                this(textRange.Start, textRange.Length)
             {
-                _length = (int)textSegment.Length;
-                _start = (int)textSegment.StartPosition;
             }
 
             public static explicit operator TextRange(MS.Internal.WindowsRuntime.Windows.Data.Text.TextSegment textSegment)
@@ -886,16 +896,18 @@ public int Length
         }
 
         [DebuggerDisplay("SubSegments.Count = {SubSegments.Count} TextRange = {TextRange.Start},{TextRange.Length}")]
-        private class SpellerSegment: ISpellerSegment
+        internal class SpellerSegment: ISpellerSegment
         {
             #region Constructor
 
-            public SpellerSegment(WordSegment segment, SpellChecker spellChecker, WinRTSpellerInterop owner)
+            public SpellerSegment(string sourceString, ITextRange textRange, SpellChecker spellChecker, WinRTSpellerInterop owner)
             {
-                _segment = segment;
                 _spellChecker = spellChecker;
                 _suggestions = null;
-                _owner = owner;
+
+                Owner = owner;
+                SourceString = sourceString;
+                TextRange = textRange;
             }
 
             static SpellerSegment()
@@ -920,9 +932,9 @@ private void EnumerateSuggestions()
 
                 List<SpellChecker.SpellingError> spellingErrors = null;
 
-                using (new SpellerCOMActionTraceLogger(_owner, SpellerCOMActionTraceLogger.Actions.ComprehensiveCheck))
+                using (new SpellerCOMActionTraceLogger(Owner, SpellerCOMActionTraceLogger.Actions.ComprehensiveCheck))
                 {
-                    spellingErrors = _spellChecker.ComprehensiveCheck(_segment.Text);
+                    spellingErrors = Text != null ? _spellChecker.ComprehensiveCheck(Text) : null;
                 }
 
                 if (spellingErrors == null)
@@ -947,6 +959,16 @@ private void EnumerateSuggestions()
 
             #region SpellerInteropBase.ISpellerSegment
 
+            /// <summary>
+            /// <inheritdoc/>
+            /// </summary>
+            public string SourceString { get; }
+
+            /// <summary>
+            /// <inheritdoc/>
+            /// </summary>
+            public string Text => SourceString?.Substring(TextRange.Start, TextRange.Length);
+
             /// <summary>
             /// Returns a read-only list of sub-segments of this segment
             /// WinRT word-segmenter doesn't really support sub-segments,
@@ -960,13 +982,7 @@ public IReadOnlyList<ISpellerSegment> SubSegments
                 }
             }
 
-            public ITextRange TextRange
-            {
-                get
-                {
-                    return new TextRange(_segment.SourceTextSegment);
-                }
-            }
+            public ITextRange TextRange { get; }
 
             public IReadOnlyList<string> Suggestions
             {
@@ -994,6 +1010,13 @@ public bool IsClean
                 }
             }
 
+            /// <remarks>
+            /// This property is used only to support TraceLogging telemetry
+            /// logged using <see cref="SpellerCOMActionTraceLogger"/>. It
+            /// has no other functional use.
+            /// </remarks>
+            internal WinRTSpellerInterop Owner { get; }
+
             public void EnumSubSegments(EnumTextSegmentsCallback segmentCallback, object data)
             {
                 bool result = true;
@@ -1008,24 +1031,20 @@ public void EnumSubSegments(EnumTextSegmentsCallback segmentCallback, object dat
 
             #region Private Fields
 
-            private WordSegment _segment;
 
             SpellChecker _spellChecker;
             private IReadOnlyList<string> _suggestions;
             private bool? _isClean = null;
 
             private static readonly IReadOnlyList<ISpellerSegment> _empty;
 
-            /// <remarks>
-            /// This field is used only to support TraceLogging telemetry
-            /// logged using <see cref="SpellerCOMActionTraceLogger"/>. It
-            /// has no other functional use.
-            /// </remarks>
-            private WinRTSpellerInterop _owner;
-
             #endregion Private Fields
         }
 
+        #endregion Internal Types
+
+        #region Private Types
+
         [DebuggerDisplay("Sentence = {_sentence}")]
         private class SpellerSentence: ISpellerSentence
         {
@@ -1046,14 +1065,7 @@ public IReadOnlyList<ISpellerSegment> Segments
                 {
                     if (_segments == null)
                     {
-                        List<SpellerSegment> segments = new List<SpellerSegment>();
-
-                        foreach (var wordSegment in _wordBreaker.GetTokens(_sentence))
-                        {
-                            segments.Add(new SpellerSegment(wordSegment, _spellChecker, _owner));
-                        }
-
-                        _segments = segments.AsReadOnly();
+                        _segments = _wordBreaker.ComprehensiveGetTokens(_sentence, _spellChecker, _owner);
                     }
 
                     return _segments;

diff --git a/...t.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInteropExtensions.cs b/...t.Wpf/src/PresentationFramework/System/Windows/Documents/WinRTSpellerInteropExtensions.cs
@@ -0,0 +1,191 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using MS.Internal.WindowsRuntime.Windows.Data.Text;
+using System.Collections.Generic;
+
+using System.Windows.Documents.MsSpellCheckLib;
+using System.Windows.Documents.Tracing;
+using static System.Windows.Documents.WinRTSpellerInterop;
+
+namespace System.Windows.Documents
+{
+    internal static class WinRTSpellerInteropExtensions
+    {
+        /// <summary>
+        /// Tokenizes <paramref name="text"/> using <paramref name="segmenter"/>, and then fixes up
+        /// the tokens to account for any text "in-between" (or after) those tokens that the segmenter skipped.
+        /// </summary>
+        /// <param name="segmenter">Word-breaker instance</param>
+        /// <param name="text">The text being tokenized</param>
+        /// <param name="spellChecker">The spell-checker instance used to augment the tokenizing process; when null, no token is modified</param>
+        /// <param name="owner">Calling <see cref="WinRTSpellerInterop"/> instance</param>
+        /// <returns>Read-only list of segments for the segmenter's tokens, with the fix-ups described in the remarks applied; empty if <paramref name="segmenter"/> is null or yields no tokens</returns>
+        /// <remarks>
+        /// Windows.Data.Text.WordsSegmenter tends to drop punctuation characters like period ('.')
+        /// when tokenizing text. Though this behavior is compatible with a vast majority of text-processing
+        /// scenarios (like word-counting), it is not ideal for spell-checking.
+        ///
+        /// In this method, the following <paramref name="spellChecker"/> augmented heuristic is applied to update the token-list generated by
+        /// <paramref name="segmenter"/>.
+        ///
+        ///  - Identify if any text 'missingFragment' has been dropped by the <paramref name="segmenter"/>
+        ///  - If the token immediately preceding 'missingFragment', previousToken, has a spelling error, then attempt to
+        ///     create new candidate tokens in the following order:
+        ///
+        ///             previousToken + missingFragment[0..0]
+        ///             previousToken + missingFragment[0..1]
+        ///             previousToken + missingFragment[0..2]
+        ///             ...
+        ///             ...
+        ///             previousToken + missingFragment[0..LEN-1], where LEN = LEN(missingFragment)
+        ///
+        ///  - Select the first candidate token that is free of spelling errors, and replace 'previousToken' with it.
+        ///  - For performance reasons, we choose a constant MAXLEN = 4 such that when LEN > MAXLEN, only the first MAXLEN
+        ///     candidate tokens are considered.
+        ///     - MAXLEN = 4 is a somewhat arbitrary choice, though it seems more than sufficient to address common
+        ///       problems this heuristic is intended to help with.
+        ///
+        ///     - Typical word-breaking problems that have been observed empirically involve only one missed character,
+        ///       for which MAXLEN=1 would be sufficient. MAXLEN=4 is chosen as a sufficiently-large tradeoff between
+        ///       correctness and performance.
+        ///
+        ///     - Also see https://github.com/dotnet/wpf/pull/2753#issuecomment-602120768 for a discussion related to this.
+        /// </remarks>
+        public static IReadOnlyList<SpellerSegment> ComprehensiveGetTokens(
+            this WordsSegmenter segmenter,
+            string text,
+            SpellChecker spellChecker,
+            WinRTSpellerInterop owner)
+        {
+            IReadOnlyList<WordSegment> tokens = segmenter?.GetTokens(text) ?? Array.Empty<WordSegment>();
+            if (tokens.Count == 0)
+            {
+                return Array.Empty<SpellerSegment>();
+            }
+
+            var allTokens = new List<SpellerSegment>();
+            int predictedNextTokenStartPosition = 0;
+
+            for (int i = 0; i < tokens.Count; i++)
+            {
+                int nextTokenStartPosition = (int)tokens[i].SourceTextSegment.StartPosition;
+                int nextTokenLength = (int)tokens[i].SourceTextSegment.Length;
+
+                if (spellChecker != null)
+                {
+                    if (nextTokenStartPosition > predictedNextTokenStartPosition)
+                    {
+                        // There is a "gap" between the last recorded token and the current token.
+                        // The skipped text is never added as a token of its own; it is only used to try to extend the
+                        // previous token (the helper ignores it if it is blank/empty or if no extension is spellcheck-clean).
+                        var missingFragment =
+                            new SpellerSegment(
+                                text,
+                                new WinRTSpellerInterop.TextRange(
+                                    predictedNextTokenStartPosition,
+                                    nextTokenStartPosition - predictedNextTokenStartPosition),
+                                spellChecker,
+                                owner);
+                        if (allTokens.Count > 0)
+                        {
+                            var substToken = GetSpellCheckCleanSubstitutionToken(spellChecker, text, allTokens[allTokens.Count - 1], missingFragment);
+                            if (substToken != null)
+                            {
+                                allTokens[allTokens.Count - 1] = new SpellerSegment(text, substToken.Value, spellChecker, owner);
+                            }
+                        }
+                    }
+                }
+
+
+                allTokens.Add(
+                    new SpellerSegment(
+                        text,
+                        new WinRTSpellerInterop.TextRange(
+                            nextTokenStartPosition,
+                            nextTokenLength),
+                        spellChecker,
+                        owner));
+                predictedNextTokenStartPosition = nextTokenStartPosition + nextTokenLength;
+            }
+
+            if (tokens.Count > 0 &&
+                spellChecker?.ComprehensiveCheck(tokens[tokens.Count - 1].Text)?.Count != 0 &&
+                predictedNextTokenStartPosition < text.Length)
+            {
+                // Text remains after the last token: try to extend the last token with it, as done for gaps in the loop.
+                var missingFragment =
+                    new SpellerSegment(
+                        text,
+                        new WinRTSpellerInterop.TextRange(
+                            predictedNextTokenStartPosition,
+                            text.Length - predictedNextTokenStartPosition),
+                        spellChecker,
+                        owner);
+
+                if (allTokens.Count > 0)
+                {
+                    var substToken = GetSpellCheckCleanSubstitutionToken(spellChecker, text, allTokens[allTokens.Count - 1], missingFragment);
+                    if (substToken != null)
+                    {
+                        allTokens[allTokens.Count - 1] = new SpellerSegment(text, substToken.Value, spellChecker, owner);
+                    }
+                }
+            }
+
+            return allTokens.AsReadOnly();
+        }
+
+        /// <summary>
+        /// Checks candidates of the form <paramref name="lastToken"/> + prefix(<paramref name="missingFragment"/>), shortest first, and
+        /// returns the text-range of the first spellcheck-clean candidate.
+        /// </summary>
+        /// <param name="spellChecker">Spell-checker</param>
+        /// <param name="documentText">Overall document text within which the text-ranges are computed</param>
+        /// <param name="lastToken">Previous token immediately preceding <paramref name="missingFragment"/></param>
+        /// <param name="missingFragment">The missing-fragment identified immediately after <paramref name="lastToken"/></param>
+        /// <returns>Text-range (within <paramref name="documentText"/>) of the first spellcheck-clean candidate, with trailing whitespace excluded; null if there is none</returns>
+        /// <remarks>
+        /// See note about MAXLEN in <see cref="ComprehensiveGetTokens(WordsSegmenter, string, SpellChecker, WinRTSpellerInterop)"/>
+        /// which explains the rationale behind the value of the constant AlternateFormsMaximumCount.
+        /// </remarks>
+        private static WinRTSpellerInterop.TextRange? GetSpellCheckCleanSubstitutionToken( 
+            SpellChecker spellChecker, 
+            string documentText,
+            SpellerSegment lastToken,
+            SpellerSegment missingFragment)
+        {
+            const int AlternateFormsMaximumCount = 4;
+
+            if (string.IsNullOrWhiteSpace(missingFragment?.Text) ||
+                string.IsNullOrWhiteSpace(lastToken?.Text) ||
+                string.IsNullOrWhiteSpace(documentText))
+            {
+                return null;
+            }
+
+            int altFormsCount = Math.Min(missingFragment.TextRange.Length, AlternateFormsMaximumCount);
+            var spellingErrors = spellChecker?.ComprehensiveCheck(lastToken.Text);
+            if (spellingErrors?.Count != 0)
+            {
+                // 'lastToken' is not known to be clean: one of the leading substrings of missingFragment - when appended
+                // to 'lastToken' - could be a viable replacement for 'lastToken'
+                for (int i = 1; i <= altFormsCount; i++)
+                {
+                    var altForm = documentText.Substring(lastToken.TextRange.Start, lastToken.TextRange.Length + i).TrimEnd();
+                    if (spellChecker?.ComprehensiveCheck(altForm)?.Count == 0)
+                    {
+                        // Use this altForm in place of lastToken
+                        return new WinRTSpellerInterop.TextRange(
+                            lastToken.TextRange.Start,
+                            altForm.Length);
+                    }
+                }
+            }
+
+            return null;
+        }
+    }
+}