Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,7 @@
<Compile Include="System\Windows\Documents\ValidationHelper.cs" />
<Compile Include="System\Windows\Documents\WinEventHandler.cs" />
<Compile Include="System\Windows\Documents\WinRTSpellerInterop.cs" />
<Compile Include="System\Windows\Documents\WinRTSpellerInteropExtensions.cs" />
<Compile Include="System\Windows\Documents\WpfPayload.cs" />
<Compile Include="System\Windows\Documents\XamlAttribute.cs" />
<Compile Include="System\Windows\Documents\XamlRtfConverter.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,12 @@ private void EnumerateSubSegments()

#region SpellerInteropBase.ISpellerSegment

/// <inheritdoc/>
public string SourceString { get; }

/// <inheritdoc/>
public string Text => SourceString?.Substring(TextRange.Start, TextRange.Length);

/// <summary>
/// Returns a read-only list of sub-segments of this segment
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ internal interface ITextRange
/// </summary>
internal interface ISpellerSegment
{
/// <summary>
/// Source String for which <see cref="TextRange"/> provides a position
/// </summary>
string SourceString { get; }

/// <summary>
/// Identifies sub-words, if any.
/// </summary>
Expand All @@ -45,6 +50,11 @@ internal interface ISpellerSegment
/// </summary>
ITextRange TextRange { get; }

/// <summary>
/// Text represented by <see cref="TextRange"/>
/// </summary>
string Text { get; }

/// <summary>
/// Queries the spell-checker to obtain suggestions for this segment
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -852,14 +852,24 @@ private SpellChecker CurrentSpellChecker

#endregion Private Fields

#region Private Types
#region Internal Types

private struct TextRange: SpellerInteropBase.ITextRange
internal readonly struct TextRange: SpellerInteropBase.ITextRange
{
public TextRange(MS.Internal.WindowsRuntime.Windows.Data.Text.TextSegment textSegment)
:this((int) textSegment.StartPosition, (int) textSegment.Length)
{
}

public TextRange(int start, int length)
{
_start = start;
_length = length;
}

public TextRange(ITextRange textRange) :
this(textRange.Start, textRange.Length)
{
_length = (int)textSegment.Length;
_start = (int)textSegment.StartPosition;
}

public static explicit operator TextRange(MS.Internal.WindowsRuntime.Windows.Data.Text.TextSegment textSegment)
Expand All @@ -886,16 +896,18 @@ public int Length
}

[DebuggerDisplay("SubSegments.Count = {SubSegments.Count} TextRange = {TextRange.Start},{TextRange.Length}")]
private class SpellerSegment: ISpellerSegment
internal class SpellerSegment: ISpellerSegment
{
#region Constructor

public SpellerSegment(WordSegment segment, SpellChecker spellChecker, WinRTSpellerInterop owner)
public SpellerSegment(string sourceString, ITextRange textRange, SpellChecker spellChecker, WinRTSpellerInterop owner)
{
_segment = segment;
_spellChecker = spellChecker;
_suggestions = null;
_owner = owner;

Owner = owner;
SourceString = sourceString;
TextRange = textRange;
}

static SpellerSegment()
Expand All @@ -920,9 +932,9 @@ private void EnumerateSuggestions()

List<SpellChecker.SpellingError> spellingErrors = null;

using (new SpellerCOMActionTraceLogger(_owner, SpellerCOMActionTraceLogger.Actions.ComprehensiveCheck))
using (new SpellerCOMActionTraceLogger(Owner, SpellerCOMActionTraceLogger.Actions.ComprehensiveCheck))
{
spellingErrors = _spellChecker.ComprehensiveCheck(_segment.Text);
spellingErrors = Text != null ? _spellChecker.ComprehensiveCheck(Text) : null;
}

if (spellingErrors == null)
Expand All @@ -947,6 +959,16 @@ private void EnumerateSuggestions()

#region SpellerInteropBase.ISpellerSegment

/// <summary>
/// <inheritdoc/>
/// </summary>
public string SourceString { get; }

/// <summary>
/// <inheritdoc/>
/// </summary>
public string Text => SourceString?.Substring(TextRange.Start, TextRange.Length);

/// <summary>
/// Returns a read-only list of sub-segments of this segment
/// WinRT word-segmenter doesn't really support sub-segments,
Expand All @@ -960,13 +982,7 @@ public IReadOnlyList<ISpellerSegment> SubSegments
}
}

public ITextRange TextRange
{
get
{
return new TextRange(_segment.SourceTextSegment);
}
}
public ITextRange TextRange { get; }

public IReadOnlyList<string> Suggestions
{
Expand Down Expand Up @@ -994,6 +1010,13 @@ public bool IsClean
}
}

/// <remarks>
/// This property is used only to support TraceLogging telemetry
/// logged using <see cref="SpellerCOMActionTraceLogger"/>. It
/// has no other functional use.
/// </remarks>
internal WinRTSpellerInterop Owner { get; }

public void EnumSubSegments(EnumTextSegmentsCallback segmentCallback, object data)
{
bool result = true;
Expand All @@ -1008,24 +1031,20 @@ public void EnumSubSegments(EnumTextSegmentsCallback segmentCallback, object dat

#region Private Fields

private WordSegment _segment;

SpellChecker _spellChecker;
private IReadOnlyList<string> _suggestions;
private bool? _isClean = null;

private static readonly IReadOnlyList<ISpellerSegment> _empty;

/// <remarks>
/// This field is used only to support TraceLogging telemetry
/// logged using <see cref="SpellerCOMActionTraceLogger"/>. It
/// has no other functional use.
/// </remarks>
private WinRTSpellerInterop _owner;

#endregion Private Fields
}

#endregion Internal Types

#region Private Types

[DebuggerDisplay("Sentence = {_sentence}")]
private class SpellerSentence: ISpellerSentence
{
Expand All @@ -1046,14 +1065,7 @@ public IReadOnlyList<ISpellerSegment> Segments
{
if (_segments == null)
{
List<SpellerSegment> segments = new List<SpellerSegment>();

foreach (var wordSegment in _wordBreaker.GetTokens(_sentence))
{
segments.Add(new SpellerSegment(wordSegment, _spellChecker, _owner));
}

_segments = segments.AsReadOnly();
_segments = _wordBreaker.ComprehensiveGetTokens(_sentence, _spellChecker, _owner);
}

return _segments;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using MS.Internal.WindowsRuntime.Windows.Data.Text;
using System.Collections.Generic;

using System.Windows.Documents.MsSpellCheckLib;
using System.Windows.Documents.Tracing;
using static System.Windows.Documents.WinRTSpellerInterop;

namespace System.Windows.Documents
{
internal static class WinRTSpellerInteropExtensions
{
    /// <summary>
    /// Tokenizes <paramref name="text"/> using <paramref name="segmenter"/>, and then fixes up
    /// the tokens to account for any text "in-between" those tokens that the segmenter dropped.
    /// </summary>
    /// <param name="segmenter">Word-breaker instance</param>
    /// <param name="text">The text being tokenized</param>
    /// <param name="spellChecker">The spell-checker instance used to augment the tokenizing process</param>
    /// <param name="owner">Calling <see cref="WinRTSpellerInterop"/> instance</param>
    /// <returns>
    /// A read-only list with one <see cref="SpellerSegment"/> per token produced by
    /// <paramref name="segmenter"/>, where a segment's range may have been extended into the text that
    /// follows it. The list is empty when <paramref name="segmenter"/> is null,
    /// <paramref name="text"/> is null or empty, or no tokens are produced.
    /// </returns>
    /// <remarks>
    /// Windows.Data.Text.WordsSegmenter tends to drop punctuation characters like period ('.')
    /// when tokenizing text. Though this behavior is compatible with a vast majority of text-processing
    /// scenarios (like word-counting), it is not ideal for spell-checking.
    ///
    /// In this method, the following <paramref name="spellChecker"/> augmented heuristic is applied to update the token-list generated by
    /// <paramref name="segmenter"/>.
    ///
    ///  - Identify if any text 'missingFragment' has been dropped by the <paramref name="segmenter"/>
    ///  - If the token immediately preceding 'missingFragment', previousToken, has a spelling error, then attempt to
    ///    create new candidate tokens in the following order:
    ///
    ///         previousToken + missingFragment[0..0]
    ///         previousToken + missingFragment[0..1]
    ///         previousToken + missingFragment[0..2]
    ///         ...
    ///         ...
    ///         previousToken + missingFragment[0..LEN-1], where LEN = LEN(missingFragment)
    ///
    ///  - Select the first candidate token that is free of spelling errors, and replace 'previousToken' with it.
    ///  - For performance reasons, we choose a constant MAXLEN = 4 such that when LEN > MAXLEN, only MAXLEN
    ///    tokens are considered.
    ///     - MAXLEN = 4 is a somewhat arbitrary choice, though it seems more than sufficient to address common
    ///       problems this heuristic is intended to help with.
    ///
    ///     - Typical word-breaking problems that have been observed empirically involve only one missed character,
    ///       for which MAXLEN=1 would be sufficient. MAXLEN=4 is chosen as a sufficiently-large tradeoff between
    ///       correctness and performance.
    ///
    ///     - Also see https://github.com/dotnet/wpf/pull/2753#issuecomment-602120768 for a discussion related to this.
    ///
    /// The heuristic is skipped entirely when <paramref name="spellChecker"/> is null; the tokens are then
    /// returned exactly as the <paramref name="segmenter"/> reported them.
    /// </remarks>
    public static IReadOnlyList<SpellerSegment> ComprehensiveGetTokens(
        this WordsSegmenter segmenter,
        string text,
        SpellChecker spellChecker,
        WinRTSpellerInterop owner)
    {
        // Nothing to tokenize: avoid calling into the segmenter and dereferencing a null string below.
        if (segmenter == null || string.IsNullOrEmpty(text))
        {
            return Array.Empty<SpellerSegment>();
        }

        IReadOnlyList<WordSegment> tokens = segmenter.GetTokens(text) ?? Array.Empty<WordSegment>();
        if (tokens.Count == 0)
        {
            return Array.Empty<SpellerSegment>();
        }

        var allTokens = new List<SpellerSegment>(tokens.Count);
        int predictedNextTokenStartPosition = 0;

        for (int i = 0; i < tokens.Count; i++)
        {
            int nextTokenStartPosition = (int)tokens[i].SourceTextSegment.StartPosition;
            int nextTokenLength = (int)tokens[i].SourceTextSegment.Length;

            // A "gap" between the end of the last recorded token and the start of the current one
            // means the segmenter dropped some text; try to fold it into the previous token.
            TryExtendLastToken(
                allTokens,
                text,
                predictedNextTokenStartPosition,
                nextTokenStartPosition,
                spellChecker,
                owner);

            allTokens.Add(
                new SpellerSegment(
                    text,
                    new WinRTSpellerInterop.TextRange(
                        nextTokenStartPosition,
                        nextTokenLength),
                    spellChecker,
                    owner));
            predictedNextTokenStartPosition = nextTokenStartPosition + nextTokenLength;
        }

        // Text may also have been dropped after the final token, at the end of the string.
        // GetSpellCheckCleanSubstitutionToken spell-checks the preceding token itself, so no
        // separate check of the last token is made here.
        TryExtendLastToken(
            allTokens,
            text,
            predictedNextTokenStartPosition,
            text.Length,
            spellChecker,
            owner);

        return allTokens.AsReadOnly();
    }

    /// <summary>
    /// Examines the dropped text in [<paramref name="gapStart"/>, <paramref name="gapEnd"/>) and, when
    /// <see cref="GetSpellCheckCleanSubstitutionToken"/> finds a spellcheck-clean extension, replaces the
    /// last entry of <paramref name="allTokens"/> with a segment covering the extended range.
    /// </summary>
    /// <param name="allTokens">Segments recorded so far; only the last entry may be replaced</param>
    /// <param name="text">The text being tokenized</param>
    /// <param name="gapStart">Position immediately after the last recorded token</param>
    /// <param name="gapEnd">Position at which the gap ends (exclusive)</param>
    /// <param name="spellChecker">Spell-checker; when null this method does nothing</param>
    /// <param name="owner">Calling <see cref="WinRTSpellerInterop"/> instance</param>
    /// <remarks>
    /// The dropped text is never added as a segment of its own; it is used only to extend the
    /// preceding token.
    /// </remarks>
    private static void TryExtendLastToken(
        List<SpellerSegment> allTokens,
        string text,
        int gapStart,
        int gapEnd,
        SpellChecker spellChecker,
        WinRTSpellerInterop owner)
    {
        // No spell-checker, no preceding token to extend, or no gap at all.
        if (spellChecker == null || allTokens.Count == 0 || gapEnd <= gapStart)
        {
            return;
        }

        int lastIndex = allTokens.Count - 1;
        var missingFragment =
            new SpellerSegment(
                text,
                new WinRTSpellerInterop.TextRange(gapStart, gapEnd - gapStart),
                spellChecker,
                owner);

        var substToken = GetSpellCheckCleanSubstitutionToken(spellChecker, text, allTokens[lastIndex], missingFragment);
        if (substToken != null)
        {
            allTokens[lastIndex] = new SpellerSegment(text, substToken.Value, spellChecker, owner);
        }
    }

    /// <summary>
    /// Checks through combinations of <paramref name="lastToken"/> + substrings(<paramref name="missingFragment"/>) and
    /// returns the first spellcheck-clean result.
    /// </summary>
    /// <param name="spellChecker">Spell-checker</param>
    /// <param name="documentText">Overall document text within which the text-ranges are computed</param>
    /// <param name="lastToken">Previous token immediately preceding <paramref name="missingFragment"/></param>
    /// <param name="missingFragment">The missing-fragment identified immediately after <paramref name="lastToken"/></param>
    /// <returns>
    /// The range, starting at the start of <paramref name="lastToken"/>, of the first candidate for which
    /// the spell-checker reports zero errors; null when <paramref name="spellChecker"/> is null, any of
    /// the texts is null or white-space, <paramref name="lastToken"/> is already free of spelling errors,
    /// or no candidate is clean.
    /// </returns>
    /// <remarks>
    /// See note about MAXLEN in <see cref="ComprehensiveGetTokens(WordsSegmenter, string, SpellChecker, WinRTSpellerInterop)"/>
    /// which explains the rationale behind the value of the constant AlternateFormsMaximumCount.
    /// </remarks>
    private static WinRTSpellerInterop.TextRange? GetSpellCheckCleanSubstitutionToken(
        SpellChecker spellChecker,
        string documentText,
        SpellerSegment lastToken,
        SpellerSegment missingFragment)
    {
        const int AlternateFormsMaximumCount = 4;

        // Without a spell-checker no candidate can ever be confirmed clean.
        if (spellChecker == null || string.IsNullOrWhiteSpace(documentText))
        {
            return null;
        }

        // Text is computed with Substring on every access, so read it once.
        string lastTokenText = lastToken?.Text;
        if (string.IsNullOrWhiteSpace(lastTokenText) ||
            string.IsNullOrWhiteSpace(missingFragment?.Text))
        {
            return null;
        }

        // Only a token that is not known to be clean is a candidate for extension. A null result
        // from the spell-checker is treated the same as "has errors".
        var spellingErrors = spellChecker.ComprehensiveCheck(lastTokenText);
        if (spellingErrors != null && spellingErrors.Count == 0)
        {
            return null;
        }

        int start = lastToken.TextRange.Start;
        int length = lastToken.TextRange.Length;

        // Never read past the end of the document, whatever the fragment's range claims.
        int available = documentText.Length - (start + length);
        int altFormsCount = Math.Min(
            Math.Min(missingFragment.TextRange.Length, AlternateFormsMaximumCount),
            available);

        // One of the substring-permutations of the missingFragment - when concatenated with 'lastToken' - could be a viable
        // replacement for 'lastToken'
        string previousCandidate = lastTokenText;
        for (int i = 1; i <= altFormsCount; i++)
        {
            string altForm = documentText.Substring(start, length + i).TrimEnd();

            // TrimEnd can collapse successive candidates into the same string (for example when the
            // fragment starts with white-space); that string has already been checked, so skip it.
            if (string.Equals(altForm, previousCandidate, StringComparison.Ordinal))
            {
                continue;
            }

            previousCandidate = altForm;

            if (spellChecker.ComprehensiveCheck(altForm)?.Count == 0)
            {
                // Use this altForm in place of lastToken
                return new WinRTSpellerInterop.TextRange(start, altForm.Length);
            }
        }

        return null;
    }
}
}