diff --git a/.github/actions/spelling/block-delimiters.list b/.github/actions/spelling/block-delimiters.list
new file mode 100644
index 00000000..3d4798ce
--- /dev/null
+++ b/.github/actions/spelling/block-delimiters.list
@@ -0,0 +1,3 @@
+# block ignore
+
+
diff --git a/.github/actions/spelling/expect/README.md.txt b/.github/actions/spelling/expect/README.md.txt
index 160041fa..617efedb 100644
--- a/.github/actions/spelling/expect/README.md.txt
+++ b/.github/actions/spelling/expect/README.md.txt
@@ -1,6 +1,4 @@
 gsutil
-ikea
-microsoft
 spammed
 timeframe
 workflows
diff --git a/action.yml b/action.yml
index 8f9b8994..49cd6b7d 100644
--- a/action.yml
+++ b/action.yml
@@ -208,7 +208,7 @@ inputs:
   warnings:
     description: "List of events that are warnings (items that are neither warnings nor notices will result in an :x:)"
     required: false
-    default: bad-regex,binary-file,deprecated-feature,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration
+    default: bad-regex,binary-file,deprecated-feature,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration,unclosed-block-ignore-begin,unclosed-block-ignore-end
   notices:
     description: "List of events that are notices (items that are neither warnings nor notices will result in an :x:)"
     required: false
diff --git a/lib/CheckSpelling/UnknownWordSplitter.pm b/lib/CheckSpelling/UnknownWordSplitter.pm
index 9405ae4a..bf31e5eb 100644
--- a/lib/CheckSpelling/UnknownWordSplitter.pm
+++ b/lib/CheckSpelling/UnknownWordSplitter.pm
@@ -20,6 +20,9 @@ use CheckSpelling::Util;
 our $VERSION='0.1.0';
 
 my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
+my $begin_block_re = '';
+my @begin_block_list = ();
+my @end_block_list = ();
 my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
 my ($shortest, $longest) = (255, 0);
 my @forbidden_re_list;
@@ -94,6 +97,38 @@ sub not_empty {
   return defined $thing && $thing ne ''
 }
 
+# Parse a block-delimiters file into a flat list of markers.
+# Lines starting with `#` and empty lines are skipped; the remaining lines
+# are expected to alternate begin marker, end marker.
+# Returns an empty list if the file cannot be opened or if the number of
+# markers is odd (after reporting uneven-block-delimiters on STDERR).
+# NOTE(review): the file is slurped in a single read, so `$.` (and hence
+# $last_line) counts that one read rather than the file's lines - confirm
+# whether the reported line number is intended.
+sub parse_block_list {
+  my ($re) = @_;
+  my @file;
+  return @file unless (open(FILE, '<:utf8', $re));
+
+  local $/=undef;
+  my $file=<FILE>;
+  my $last_line = $.;
+  close FILE;
+  for (split /\R/, $file) {
+    next if /^#/;
+    chomp;
+    next unless /^./;
+    push @file, $_;
+  }
+
+  unless ($#file % 2 == 1) {
+    print STDERR "$re:$last_line:Block delimiters must come in pairs (uneven-block-delimiters)\n";
+    @file = ();
+  }
+
+  return @file;
+}
+
 sub valid_word {
   # shortest_word is an absolute
   our ($shortest, $longest, $shortest_word, $longest_word);
@@ -178,6 +213,7 @@ sub hunspell_dictionary {
 sub init {
   my ($configuration) = @_;
   our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
+  our ($begin_block_re, @begin_block_list, @end_block_list);
   our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
   our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
   if ($hunspell_dictionary_path) {
@@ -191,6 +227,23 @@ sub init {
       print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
     }
   }
+
+  if (-e "$configuration/block-delimiters.list") {
+    my @block_delimiters = parse_block_list "$configuration/block-delimiters.list";
+    if (@block_delimiters) {
+      @begin_block_list = ();
+      @end_block_list = ();
+
+      while (@block_delimiters) {
+        my ($begin, $end) = splice @block_delimiters, 0, 2;
+        push @begin_block_list, $begin;
+        push @end_block_list, $end;
+      }
+
+      $begin_block_re = join '|', (map { '('.quote_re("\Q$_\E").')' } @begin_block_list);
+    }
+  }
+
   my (@patterns_re_list, %in_patterns_re_list);
   if (-e "$configuration/patterns.txt") {
     @patterns_re_list = file_to_list "$configuration/patterns.txt";
@@ -306,6 +359,7 @@ sub split_file {
     $unrecognized, $shortest, $largest_file, $words,
     $word_match, %unique, %unique_unrecognized, $forbidden_re,
     @forbidden_re_list, $patterns_re, %dictionary,
+    $begin_block_re, @begin_block_list, @end_block_list,
     $candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file
   );
   our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
@@ -370,8 +424,9 @@ sub split_file {
     local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
     alarm $timeout;
 
+    my ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
     my $offset = 0;
-    while (<FILE>) {
+    LINE: while (<FILE>) {
       $_ = decode_utf8($_, FB_DEFAULT);
       if (/[\x{D800}-\x{DFFF}]/) {
         skip_file($temp_dir, "file contains a UTF-16 surrogate. This is not supported. (utf16-surrogate)\n");
@@ -381,6 +436,35 @@ sub split_file {
       s/^\x{FEFF}// if $. == 1;
       next unless /./;
       my $raw_line = $_;
+      my $parsed_block_markers;
+
+      # hook for custom multiline based text exclusions:
+      # While inside a block ($next_end_marker set), lines are skipped until
+      # the end marker appears; text up to and including it is dropped.
+      # Otherwise the first begin marker on the line opens a block and the
+      # text up to and including it is dropped. A line on which any marker
+      # was consumed is skipped entirely afterwards.
+      if ($begin_block_re) {
+        FIND_END_MARKER: while (1) {
+          while ($next_end_marker ne '') {
+            next LINE unless /\Q$next_end_marker\E/;
+            s/.*?\Q$next_end_marker\E//;
+            ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
+            $parsed_block_markers = 1;
+          }
+          my @captured = (/^.*?$begin_block_re/);
+          last unless (@captured);
+          for my $capture (0 .. $#captured) {
+            if (defined $captured[$capture]) {
+              ($current_begin_marker, $next_end_marker, $start_marker_line) = ($begin_block_list[$capture], $end_block_list[$capture], "$.:1 ... 1");
+              s/^.*?\Q$begin_block_list[$capture]\E//;
+              $parsed_block_markers = 1;
+              next FIND_END_MARKER;
+            }
+          }
+        }
+        next if $parsed_block_markers;
+      }
 
       # hook for custom line based text exclusions:
       if (defined $patterns_re) {
@@ -484,6 +568,12 @@ sub split_file {
         }
       }
     }
+    if ($next_end_marker ne '') {
+      if ($start_marker_line) {
+        print WARNINGS ":$start_marker_line, Warning - failed to find matching end marker for `$current_begin_marker` (unclosed-block-ignore-begin)\n";
+      }
+      print WARNINGS ":$.:1 ... 1, Warning - expected to find end block marker `$next_end_marker` (unclosed-block-ignore-end)\n";
+    }
     alarm 0;
   };
 
diff --git a/sarif.json b/sarif.json
index 7815bcff..c9df93ca 100644
--- a/sarif.json
+++ b/sarif.json
@@ -556,6 +556,58 @@
                   "code-reviews"
                 ]
               }
+            },
+            {
+              "id": "unclosed-block-ignore-begin",
+              "name": "UnclosedBlockIgnoreBegin",
+              "helpUri": "https://github.com/check-spelling/check-spelling/wiki/Event-descriptions#unclosed-block-ignore-begin",
+              "shortDescription": {
+                "text": "Unclosed block ignore (begin)"
+              },
+              "fullDescription": {
+                "text": "A begin block ignore was found but not a corresponding end block ignore. This is associated with the found begin mark."
+              },
+              "help": {
+                "text": "?",
+                "markdown": "**Remediation (click \"Show more\" below)**:\n\n- Check to see if the content has text that should correspond to the end block ignore but does not, if so, correct it.\n- If the begin block ignore is too general, consider making it more specific. See [block ignore examples](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-block-delimiters) and [block ignore feature](https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Block-Ignore)\n\n"
+              },
+              "defaultConfiguration": {
+                "level": "warning"
+              },
+              "properties": {
+                "precision": "low",
+                "problem.severity": "warning",
+                "tags": [
+                  "source-code",
+                  "code-reviews"
+                ]
+              }
+            },
+            {
+              "id": "unclosed-block-ignore-end",
+              "name": "UnclosedBlockIgnoreEnd",
+              "helpUri": "https://github.com/check-spelling/check-spelling/wiki/Event-descriptions#unclosed-block-ignore-end",
+              "shortDescription": {
+                "text": "Unclosed block ignore (end)"
+              },
+              "fullDescription": {
+                "text": "A begin block ignore was found but not a corresponding end block ignore. This is associated with the missing end mark."
+              },
+              "help": {
+                "text": "?",
+                "markdown": "**Remediation (click \"Show more\" below)**:\n\n- Check to see if the content has text that should correspond to the end block ignore but does not, if so, correct it.\n- If the begin block ignore is too general, consider making it more specific. See [block ignore examples](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-block-delimiters) and [block ignore feature](https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Block-Ignore)\n\n"
+              },
+              "defaultConfiguration": {
+                "level": "warning"
+              },
+              "properties": {
+                "precision": "low",
+                "problem.severity": "warning",
+                "tags": [
+                  "source-code",
+                  "code-reviews"
+                ]
+              }
             }
           ]
         }
diff --git a/t/UnknownWordSplitter.t b/t/UnknownWordSplitter.t
index 3e20ffc2..2262c929 100755
--- a/t/UnknownWordSplitter.t
+++ b/t/UnknownWordSplitter.t
@@ -11,7 +11,7 @@ use File::Temp qw/ tempfile tempdir /;
 use IO::Capture::Stderr;
 use Test::More;
 
-plan tests => 42;
+plan tests => 55;
 
 use_ok('CheckSpelling::UnknownWordSplitter');
 
diff --git a/unknown-words.sh b/unknown-words.sh
index 60a62961..097c6f9b 100755
--- a/unknown-words.sh
+++ b/unknown-words.sh
@@ -1085,6 +1085,7 @@ define_variables() {
   patterns="$splitter_configuration/patterns.txt"
   forbidden_path="$splitter_configuration/forbidden.txt"
   candidates_path="$splitter_configuration/candidates.txt"
+  block_delimiters_path="$splitter_configuration/block-delimiters.list"
   excludes="$spellchecker/excludes.txt"
   excludes_path="$temp/excludes.txt"
   only="$spellchecker/only.txt"
@@ -2015,6 +2016,8 @@ set_up_files() {
     fi
     get_project_files line_forbidden.patterns "$forbidden_path"
     get_project_files candidate.patterns "$candidates_path"
+
+    get_project_files block-delimiters.list "$block_delimiters_path"
   fi
   extra_dictionaries_cover_entries="$(mktemp)"
   get_project_files line_masks.patterns "$patterns_path"