-
Notifications
You must be signed in to change notification settings - Fork 96
/
Tokenizer`1.cs
114 lines (102 loc) · 4.72 KB
/
Tokenizer`1.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// Copyright 2016-2018 Datalust, Superpower Contributors, Sprache Contributors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
using System;
using System.Collections.Generic;
using Superpower.Display;
using Superpower.Model;
namespace Superpower
{
/// <summary>
/// Base class for tokenizers, types whose instances convert strings into lists of tokens.
/// </summary>
/// <typeparam name="TKind">The kind of tokens produced.</typeparam>
public abstract class Tokenizer<TKind>
{
/// <summary>
/// Tokenize <paramref name="source"/>.
/// </summary>
/// <param name="source">The source to tokenize.</param>
/// <returns>The list of tokens or an error.</returns>
/// <exception cref="ParseException">Tokenization failed.</exception>
public TokenList<TKind> Tokenize(string source)
{
var result = TryTokenize(source);
if (result.HasValue)
return result.Value;
throw new ParseException(result.ToString());
}
/// <summary>
/// Tokenize <paramref name="source"/>.
/// </summary>
/// <param name="source">The source to tokenize.</param>
/// <returns>A result with the list of tokens or an error.</returns>
/// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
/// <exception cref="ParseException">The tokenizer could not correctly perform tokenization.</exception>
public Result<TokenList<TKind>> TryTokenize(string source)
{
if (source == null) throw new ArgumentNullException(nameof(source));
var state = new TokenizationState<TKind>();
var sourceSpan = new TextSpan(source);
var remainder = sourceSpan;
var results = new List<Token<TKind>>();
foreach (var result in Tokenize(sourceSpan, state))
{
if (!result.HasValue)
return Result.CastEmpty<TKind, TokenList<TKind>>(result);
if (result.Remainder == remainder) // Broken parser, not a failed parsing.
throw new ParseException($"Zero-width tokens are not supported; token {Presentation.FormatExpectation(result.Value)} at position {result.Location.Position}.");
remainder = result.Remainder;
var token = new Token<TKind>(result.Value, result.Location.Until(result.Remainder));
state.Previous = token;
results.Add(token);
}
var value = new TokenList<TKind>(results.ToArray());
return Result.Value(value, sourceSpan, remainder);
}
/// <summary>
/// Subclasses should override to perform tokenization.
/// </summary>
/// <param name="span">The input span to tokenize.</param>
/// <returns>A list of parsed tokens.</returns>
protected virtual IEnumerable<Result<TKind>> Tokenize(TextSpan span)
{
throw new NotImplementedException("Either `Tokenize(TextSpan)` or `Tokenize(TextSpan, TokenizationState)` must be implemented.");
}
/// <summary>
/// Subclasses should override to perform tokenization when the
/// last-produced-token needs to be tracked.
/// </summary>
/// <param name="span">The input span to tokenize.</param>
/// <param name="state">The tokenization state maintained during the operation.</param>
/// <returns>A list of parsed tokens.</returns>
protected virtual IEnumerable<Result<TKind>> Tokenize(TextSpan span, TokenizationState<TKind> state)
{
return Tokenize(span);
}
/// <summary>
/// Advance until the first non-whitespace character is encountered.
/// </summary>
/// <param name="span">The span to advance from.</param>
/// <returns>A result with the first non-whitespace character.</returns>
protected static Result<char> SkipWhiteSpace(TextSpan span)
{
var next = span.ConsumeChar();
while (next.HasValue && char.IsWhiteSpace(next.Value))
{
next = next.Remainder.ConsumeChar();
}
return next;
}
}
}