Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overhaul how regex generated code is structured #66432

Merged
merged 4 commits into from Mar 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view

Large diffs are not rendered by default.

Expand Up @@ -178,10 +178,10 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
}

// Parse the input pattern
RegexTree tree;
RegexTree regexTree;
try
{
tree = RegexParser.Parse(pattern, regexOptions | RegexOptions.Compiled, culture); // make sure Compiled is included to get all optimizations applied to it
regexTree = RegexParser.Parse(pattern, regexOptions | RegexOptions.Compiled, culture); // make sure Compiled is included to get all optimizations applied to it
}
catch (Exception e)
{
Expand All @@ -192,37 +192,36 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
string? ns = regexMethodSymbol.ContainingType?.ContainingNamespace?.ToDisplayString(
SymbolDisplayFormat.FullyQualifiedFormat.WithGlobalNamespaceStyle(SymbolDisplayGlobalNamespaceStyle.Omitted));

var regexType = new RegexType(
typeDec is RecordDeclarationSyntax rds ? $"{typeDec.Keyword.ValueText} {rds.ClassOrStructKeyword}" : typeDec.Keyword.ValueText,
ns ?? string.Empty,
$"{typeDec.Identifier}{typeDec.TypeParameterList}");

var regexMethod = new RegexMethod(
regexType,
methodSyntax,
regexMethodSymbol.Name,
methodSyntax.Modifiers.ToString(),
pattern,
regexOptions,
matchTimeout ?? Timeout.Infinite,
tree);

var regexType = new RegexType(
regexMethod,
typeDec is RecordDeclarationSyntax rds ? $"{typeDec.Keyword.ValueText} {rds.ClassOrStructKeyword}" : typeDec.Keyword.ValueText,
ns ?? string.Empty,
$"{typeDec.Identifier}{typeDec.TypeParameterList}");
regexTree);

RegexType current = regexType;
var parent = typeDec.Parent as TypeDeclarationSyntax;

while (parent is not null && IsAllowedKind(parent.Kind()))
{
current.ParentClass = new RegexType(
null,
current.Parent = new RegexType(
parent is RecordDeclarationSyntax rds2 ? $"{parent.Keyword.ValueText} {rds2.ClassOrStructKeyword}" : parent.Keyword.ValueText,
ns ?? string.Empty,
$"{parent.Identifier}{parent.TypeParameterList}");

current = current.ParentClass;
current = current.Parent;
parent = parent.Parent as TypeDeclarationSyntax;
}

return regexType;
return regexMethod;

static bool IsAllowedKind(SyntaxKind kind) =>
kind == SyntaxKind.ClassDeclaration ||
Expand All @@ -233,12 +232,16 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
}

/// <summary>A regex method.</summary>
internal sealed record RegexMethod(MethodDeclarationSyntax MethodSyntax, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int MatchTimeout, RegexTree Tree);
internal sealed record RegexMethod(RegexType DeclaringType, MethodDeclarationSyntax MethodSyntax, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int MatchTimeout, RegexTree Tree)
{
public int GeneratedId { get; set; }
public string GeneratedName => $"{MethodName}_{GeneratedId}";
}

/// <summary>A type holding a regex method.</summary>
internal sealed record RegexType(RegexMethod? Method, string Keyword, string Namespace, string Name)
internal sealed record RegexType(string Keyword, string Namespace, string Name)
{
public RegexType? ParentClass { get; set; }
public RegexType? Parent { get; set; }
}
}
}
240 changes: 208 additions & 32 deletions src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs
Expand Up @@ -2,11 +2,13 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.CodeDom.Compiler;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Diagnostics.Tracing;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
Expand All @@ -23,65 +25,239 @@ namespace System.Text.RegularExpressions.Generator
[Generator(LanguageNames.CSharp)]
public partial class RegexGenerator : IIncrementalGenerator
{
public void Initialize(IncrementalGeneratorInitializationContext context)
/// <summary>Name of the type emitted to contain helpers used by the generated code.</summary>
private const string HelpersTypeName = "Utilities";
/// <summary>Namespace containing all the generated code.</summary>
private const string GeneratedNamespace = "System.Text.RegularExpressions.Generated";
/// <summary>Code for a [GeneratedCode] attribute to put on the top-level generated members.</summary>
private static readonly string s_generatedCodeAttribute = $"GeneratedCodeAttribute(\"{typeof(RegexGenerator).Assembly.GetName().Name}\", \"{typeof(RegexGenerator).Assembly.GetName().Version}\")";
/// <summary>Header comments and usings to include at the top of every generated file.</summary>
private static readonly string[] s_headers = new string[]
{
// To avoid invalidating the generator's output when anything from the compilation
// changes, we will extract from it the only thing we care about: whether unsafe
// code is allowed.
IncrementalValueProvider<bool> allowUnsafeProvider =
context.CompilationProvider
.Select((x, _) => x.Options is CSharpCompilationOptions { AllowUnsafe: true });
"// <auto-generated/>",
"#nullable enable",
"#pragma warning disable CS0162 // Unreachable code",
"#pragma warning disable CS0164 // Unreferenced label",
"#pragma warning disable CS0219 // Variable assigned but never used",
};

// Contains one entry per regex method, either the generated code for that regex method,
// a diagnostic to fail with, or null if no action should be taken for that regex.
public void Initialize(IncrementalGeneratorInitializationContext context)
{
// Produces one entry per generated regex. This may be:
// - Diagnostic in the case of a failure that should end the compilation
// - (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers) in the case of valid regex
// - (RegexMethod regexMethod, string reason, Diagnostic diagnostic) in the case of a limited-support regex
IncrementalValueProvider<ImmutableArray<object?>> codeOrDiagnostics =
context.SyntaxProvider

// Find all MethodDeclarationSyntax nodes attributed with RegexGenerator and gather the required information
// Find all MethodDeclarationSyntax nodes attributed with RegexGenerator and gather the required information.
.CreateSyntaxProvider(IsSyntaxTargetForGeneration, GetSemanticTargetForGeneration)
.Where(static m => m is not null)

// Pair each with whether unsafe code is allowed
.Combine(allowUnsafeProvider)

// Get the resulting code string or error Diagnostic for
// each MethodDeclarationSyntax/allow-unsafe-blocks pair
// Generate the RunnerFactory for each regex, if possible. This is where the bulk of the implementation occurs.
.Select((state, _) =>
{
Debug.Assert(state.Left is not null);
return state.Left is RegexType regexType ? EmitRegexType(regexType, state.Right) : state.Left;
if (state is not RegexMethod regexMethod)
{
Debug.Assert(state is Diagnostic);
return state;
}

// If we're unable to generate a full implementation for this regex, report a diagnostic.
// We'll still output a limited implementation that just caches a new Regex(...).
if (!regexMethod.Tree.Root.SupportsCompilation(out string? reason))
{
return (regexMethod, reason, Diagnostic.Create(DiagnosticDescriptors.LimitedSourceGeneration, regexMethod.MethodSyntax.GetLocation()));
}

// Generate the core logic for the regex.
Dictionary<string, string[]> requiredHelpers = new();
var sw = new StringWriter();
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
var writer = new IndentedTextWriter(sw);
writer.Indent += 3;
writer.WriteLine();
EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers);
writer.Indent -= 3;
return (regexMethod, sw.ToString(), requiredHelpers);
})
.Collect();

// To avoid invalidating every regex's output when anything from the compilation changes,
// we extract from it the only things we care about: whether unsafe code is allowed,
// and a name based on the assembly's name, and only that information is then fed into
// RegisterSourceOutput along with all of the cached generated data from each regex.
IncrementalValueProvider<(bool AllowUnsafe, string? AssemblyName)> compilationDataProvider =
context.CompilationProvider
.Select((x, _) => (x.Options is CSharpCompilationOptions { AllowUnsafe: true }, x.AssemblyName));

// When there something to output, take all the generated strings and concatenate them to output,
// and raise all of the created diagnostics.
context.RegisterSourceOutput(codeOrDiagnostics, static (context, results) =>
context.RegisterSourceOutput(codeOrDiagnostics.Combine(compilationDataProvider), static (context, compilationDataAndResults) =>
{
var code = new List<string>(s_headers.Length + results.Length);

// Add file header and required usings
code.AddRange(s_headers);
ImmutableArray<object?> results = compilationDataAndResults.Left;

// Report any top-level diagnostics.
bool allFailures = true;
foreach (object? result in results)
{
switch (result)
if (result is Diagnostic d)
{
case Diagnostic d:
context.ReportDiagnostic(d);
break;
context.ReportDiagnostic(d);
}
else
{
allFailures = false;
}
}
if (allFailures)
{
return;
}

// At this point we'll be emitting code. Create a writer to hold it all.
var sw = new StringWriter();
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
IndentedTextWriter writer = new(sw);

// Add file headers and required usings.
foreach (string header in s_headers)
{
writer.WriteLine(header);
}
writer.WriteLine();

case ValueTuple<string, ImmutableArray<Diagnostic>> t:
code.Add(t.Item1);
foreach (Diagnostic d in t.Item2)
// For every generated type, we give it an incrementally increasing ID, in order to create
// unique type names even in situations where method names were the same, while also keeping
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
// the type names short. Note that this is why we only generate the RunnerFactory implementations
// earlier in the pipeline... we want to avoid generating code that relies on the class names
// until we're able to iterate through them linearly keeping track of a deterministic ID
// used to name them. The boilerplate code generation that happens here is minimal when compared to
// the work required to generate the actual matching code for the regex.
int id = 0;
string generatedClassName = $"__{ComputeStringHash(compilationDataAndResults.Right.AssemblyName ?? ""):x}";

// If we have any (RegexMethod regexMethod, string generatedName, string reason, Diagnostic diagnostic), these are regexes for which we have
// limited support and need to simply output boilerplate. We need to emit their diagnostics.
// If we have any (RegexMethod regexMethod, string generatedName, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers),
// those are generated implementations to be emitted. We need to gather up their required helpers.
Dictionary<string, string[]> requiredHelpers = new();
foreach (object? result in results)
{
RegexMethod? regexMethod = null;
if (result is ValueTuple<RegexMethod, string, Diagnostic> limitedSupportResult)
{
context.ReportDiagnostic(limitedSupportResult.Item3);
regexMethod = limitedSupportResult.Item1;
}
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>> regexImpl)
{
foreach (KeyValuePair<string, string[]> helper in regexImpl.Item3)
{
if (!requiredHelpers.ContainsKey(helper.Key))
{
context.ReportDiagnostic(d);
requiredHelpers.Add(helper.Key, helper.Value);
}
break;
}

regexMethod = regexImpl.Item1;
}

if (regexMethod is not null)
{
regexMethod.GeneratedId = id++;
EmitRegexPartialMethod(regexMethod, writer, generatedClassName);
writer.WriteLine();
}
}

// At this point we've emitted all the partial method definitions, but we still need to emit the actual regex-derived implementations.
// These are all emitted inside of our generated class.
// TODO https://github.com/dotnet/csharplang/issues/5529:
// When C# provides a mechanism for shielding generated code from the rest of the project, it should be employed
// here for the generated class. At that point, the generated class wrapper can be removed, and all of the types
// generated inside of it (one for each regex as well as the helpers type) should be shielded.

writer.WriteLine($"namespace {GeneratedNamespace}");
writer.WriteLine($"{{");

// We emit usings here now that we're inside of a namespace block and are no longer emitting code into
// a user's partial type. We can now rely on binding rules mapping to these usings and don't need to
// use global-qualified names for the rest of the implementation.
writer.WriteLine($" using System;");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need a new pragma warnings in the headers though? I believe it is common for people to add stylecop rules that require using statements to be outside of a namespace.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Such rules by default shouldn't apply to generated code. But even if they did, I don't want us to add pragmas for warnings that aren't in the .NET SDK; otherwise we open the doors to an unending laundry list of arbitrary 3rd-party analyzers we'd need to start considering.

Note there is a hidden diagnostic issued for this by the C# compiler (CS8019), but pragmas appear to be ineffective at disabling it. @sharwell may know why.

Copy link
Member

@sharwell sharwell Mar 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CS8019

This is a hidden diagnostic used for fading out unnecessary usings in the IDE. It is intentionally reported even in generated files, and will not impact builds. IDE0005 is the complementary rule that is only reported in non-generated files, and might be raised to warning/error in builds.

Do we need a new pragma warnings in the headers though?

As long as you use a hint name ending in .g.cs or have // <auto-generated/> as the first line of the file, it will be treated as generated code and code style analyzers will not run in the file.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks.

It is intentionally reported even in generated files

I was surprised that they were still issued even with:

#pragma warning disable CS8019

at the top of the file. Is that expected?

As long as you use a hint name ending in .g.cs or have // as the first line of the file

We do both.

writer.WriteLine($" using System.CodeDom.Compiler;");
writer.WriteLine($" using System.Collections;");
writer.WriteLine($" using System.ComponentModel;");
writer.WriteLine($" using System.Globalization;");
writer.WriteLine($" using System.Runtime.CompilerServices;");
writer.WriteLine($" using System.Text.RegularExpressions;");
writer.WriteLine($" using System.Threading;");
writer.WriteLine($"");
if (compilationDataAndResults.Right.AllowUnsafe)
{
writer.WriteLine($" [SkipLocalsInit]");
}
writer.WriteLine($" [{s_generatedCodeAttribute}]");
writer.WriteLine($" [EditorBrowsable(EditorBrowsableState.Never)]");
writer.WriteLine($" internal static class {generatedClassName}");
writer.WriteLine($" {{");

// Emit each Regex-derived type.
writer.Indent += 2;
foreach (object? result in results)
{
if (result is ValueTuple<RegexMethod, string, Diagnostic> limitedSupportResult)
{
EmitRegexLimitedBoilerplate(writer, limitedSupportResult.Item1, limitedSupportResult.Item1.GeneratedId, limitedSupportResult.Item2);
writer.WriteLine();
}
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>> regexImpl)
{
EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item1.GeneratedId, regexImpl.Item2);
writer.WriteLine();
}
}
writer.Indent -= 2;

// If any of the Regex-derived types asked for helper methods, emit those now.
if (requiredHelpers.Count != 0)
{
writer.Indent += 2;
writer.WriteLine($"private static class {HelpersTypeName}");
writer.WriteLine($"{{");
writer.Indent++;
foreach (KeyValuePair<string, string[]> helper in requiredHelpers)
{
foreach (string value in helper.Value)
{
writer.WriteLine(value);
}
writer.WriteLine();
}
writer.Indent--;
writer.WriteLine($"}}");
writer.Indent -= 2;
}

context.AddSource("RegexGenerator.g.cs", string.Join(Environment.NewLine, code));
writer.WriteLine($" }}");
writer.WriteLine($"}}");

// Save out the source
context.AddSource("RegexGenerator.g.cs", sw.ToString());
});
}

/// <summary>Computes a hash of the string.</summary>
/// <remarks>
/// Currently an FNV-1a hash function. The actual algorithm used doesn't matter; just something
/// simple to create a deterministic, pseudo-random value that's based on input text.
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
/// </remarks>
private static uint ComputeStringHash(string s)
{
uint hashCode = 2166136261;
foreach (char c in s)
{
hashCode = (c ^ hashCode) * 16777619;
}
return hashCode;
}
}
}
Expand Up @@ -13,6 +13,7 @@
<IsNETCoreAppAnalyzer>true</IsNETCoreAppAnalyzer>
<AnalyzerLanguage>cs</AnalyzerLanguage>
<IsPackable>false</IsPackable>
<LangVersion>Preview</LangVersion>
</PropertyGroup>

<ItemGroup>
Expand Down