Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add warning for BIDI characters in page renders and in diffs #17562

Merged
merged 47 commits into from
Jan 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
dca05ee
Add warning for BIDI characters in page renders and in diffs
zeripath Nov 5, 2021
449cb26
as per review
zeripath Nov 5, 2021
40b8628
Adjust to only put the warning on BIDI lines without RTL chars
zeripath Nov 6, 2021
3a63d9d
Another attempt.
zeripath Nov 7, 2021
cd0bb29
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 9, 2021
5f481cf
placate lint
zeripath Nov 9, 2021
c89c678
another placation
zeripath Nov 9, 2021
7e9871c
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 10, 2021
f563ee9
as per review
zeripath Nov 14, 2021
70d446b
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 14, 2021
65dcc39
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 16, 2021
62345ba
fix broken merge
zeripath Nov 16, 2021
831f189
as per silverwind
zeripath Nov 16, 2021
5a9759c
as per silverwind
zeripath Nov 16, 2021
006a5cd
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 16, 2021
63a5e0f
fix class
silverwind Nov 16, 2021
8a01b22
make message header colors work on both themes
silverwind Nov 16, 2021
6449cad
minor styling tweaks
silverwind Nov 16, 2021
ab03673
fix border-radius on unescape button
silverwind Nov 16, 2021
06b4146
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 18, 2021
b93d0bf
drop buttons as per silverwind
zeripath Nov 18, 2021
cf04f2e
as per fnetx
zeripath Nov 18, 2021
aa4fc5a
hide the unescape button in the wiki
zeripath Nov 18, 2021
62f557d
add warning triangles to view and blame
zeripath Nov 18, 2021
b6ba958
Add warning triangles to diff
zeripath Nov 18, 2021
ea7a04a
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 20, 2021
36dd4bf
Merge branch 'main' into fix-17514-add-warning-bidi-characters
zeripath Nov 21, 2021
19aed47
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 29, 2021
6a2e274
ensure buttons work on loaded diffs
zeripath Nov 29, 2021
0d6e8f6
move escape functions into their own files
zeripath Nov 29, 2021
cb7d19d
extract out functions
zeripath Nov 29, 2021
c55394d
Apply suggestions from code review
zeripath Nov 29, 2021
ae19a60
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Dec 1, 2021
c11bd34
Update options/locale/locale_en-US.ini
zeripath Dec 8, 2021
58a4fcc
move warning triangle to another column
zeripath Dec 8, 2021
3f6057e
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Dec 8, 2021
67d00b5
Merge branch 'main' into fix-17514-add-warning-bidi-characters
6543 Jan 4, 2022
51a1bf1
Merge branch 'main' into fix-17514-add-warning-bidi-characters
6543 Jan 5, 2022
d8ab670
Merge branch 'master' into fix-17514-add-warning-bidi-characters
6543 Jan 6, 2022
0fc5af7
linter ignore bool "suspicious assignment to a by-value method receiv…
6543 Jan 6, 2022
1dc8a21
fix lint
wxiaoguang Jan 6, 2022
6f99bfd
refactoring
wxiaoguang Jan 6, 2022
ab6db78
refactor
wxiaoguang Jan 6, 2022
4e1b449
Apply suggestions from code review
zeripath Jan 6, 2022
aac0e1d
Merge pull request #10 from wxiaoguang/fix-17514-add-warning-bidi-cha…
zeripath Jan 6, 2022
f66923f
Merge branch 'main' into fix-17514-add-warning-bidi-characters
6543 Jan 6, 2022
a28bbbc
Merge branch 'main' into fix-17514-add-warning-bidi-characters
wxiaoguang Jan 7, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
230 changes: 230 additions & 0 deletions modules/charset/escape.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import (
"bytes"
"fmt"
"io"
"strings"
"unicode"
"unicode/utf8"

"golang.org/x/text/unicode/bidi"
)

// EscapeStatus represents the findings of the unicode escaper.
// Every field is a flag that EscapeControlReader raises while scanning its
// input; the zero value means that nothing noteworthy was found.
type EscapeStatus struct {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be simplified to a uint32

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think flagging this would be a good idea.

// Escaped is set whenever at least one rune or byte sequence was wrapped in
// an escaping <span> instead of being copied verbatim.
Escaped bool
// HasError is set when reading the input or writing the output failed.
HasError bool
// HasBadRunes is set when bytes that do not decode as UTF-8 were found.
HasBadRunes bool
// HasControls is set when a rune of Unicode category C was escaped
// (space and BIDI-control runes are tested first and counted separately).
HasControls bool
// HasSpaces is set when a unicode.IsSpace rune other than '\n', '\r', '\t'
// or ' ' was escaped.
HasSpaces bool
// HasMarks is set when a rune of Unicode category M was escaped.
HasMarks bool
// HasBIDI is set when a rune with the Bidi_Control property was escaped.
HasBIDI bool
// BadBIDI is set when some line contains a Bidi_Control rune together with
// left-to-right runes but no right-to-left runes.
BadBIDI bool
// HasRTLScript is set when a rune of bidi class R or AL was seen.
HasRTLScript bool
// HasLTRScript is set when a rune of bidi class L was seen.
HasLTRScript bool
}

// Or merges two EscapeStatus values into one: each flag of the result is set
// when it is set in either status or other (a field-wise logical OR).
// Neither operand is modified.
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
	return EscapeStatus{
		Escaped:      status.Escaped || other.Escaped,
		HasError:     status.HasError || other.HasError,
		HasBadRunes:  status.HasBadRunes || other.HasBadRunes,
		HasControls:  status.HasControls || other.HasControls,
		HasSpaces:    status.HasSpaces || other.HasSpaces,
		HasMarks:     status.HasMarks || other.HasMarks,
		HasBIDI:      status.HasBIDI || other.HasBIDI,
		BadBIDI:      status.BadBIDI || other.BadBIDI,
		HasRTLScript: status.HasRTLScript || other.HasRTLScript,
		HasLTRScript: status.HasLTRScript || other.HasLTRScript,
	}
}

// EscapeControlString runs EscapeControlReader over text and returns the
// resulting EscapeStatus together with the escaped text as a string.
func EscapeControlString(text string) (EscapeStatus, string) {
	var out strings.Builder
	// The error result is discarded here; a failure is still visible to the
	// caller through the HasError flag of the returned status.
	status, _ := EscapeControlReader(strings.NewReader(text), &out)
	return status, out.String()
}

// EscapeControlBytes runs EscapeControlReader over text and returns the
// resulting EscapeStatus together with the escaped text as a []byte.
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
	var out bytes.Buffer
	// The error result is discarded here; a failure is still visible to the
	// caller through the HasError flag of the returned status.
	status, _ := EscapeControlReader(bytes.NewReader(text), &out)
	return status, out.Bytes()
}

// EscapeControlReader streams text to output, wrapping suspicious content in
// HTML <span> elements, and reports what it found.
//
// The following are escaped (everything else is copied verbatim):
//   - bytes that do not decode as UTF-8 (via writeBroken; a decoded
//     utf8.RuneError, including a literal U+FFFD, is treated the same way);
//   - unicode.IsSpace runes other than '\n', '\r', '\t' and ' ';
//   - runes with the Bidi_Control property;
//   - runes of Unicode category C;
//   - runes of Unicode category M.
//
// While scanning, the bidi class of every other rune is looked up so that the
// status records whether left-to-right or right-to-left script was seen. A
// line that contains a BIDI control and left-to-right runes but no
// right-to-left runes sets BadBIDI.
//
// The returned error is nil when the input was consumed up to io.EOF and all
// writes succeeded. Otherwise it is the first read or write error and
// HasError is set in the returned status.
//
// NOTE(review): bytes that are not escaped are written unchanged, so callers
// presumably pass text that is already HTML-safe - confirm at the call sites.
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
	buf := make([]byte, 4096)

	var (
		bs       []byte // bytes being scanned: carried-over prefix + latest read
		i        int    // offset in bs of the rune currently being examined
		writePos int    // offset in bs of the first byte not yet written out
		carried  int    // bytes at the start of buf held over from the last read
	)

	// flushPending writes the not-yet-written, unescaped bytes bs[writePos:i].
	flushPending := func() error {
		if writePos < i {
			if _, werr := output.Write(bs[writePos:i]); werr != nil {
				return werr
			}
			writePos = i
		}
		return nil
	}

	lineHasBIDI := false
	lineHasRTLScript := false
	lineHasLTRScript := false

	// The read error is tracked separately from write errors so that a
	// successful write can never hide io.EOF or a real read failure.
	var readErr error
	for readErr == nil {
		var n int
		n, readErr = text.Read(buf[carried:])
		bs = buf[:carried+n]
		i, writePos = 0, 0

	scan:
		for i < len(bs) {
			r, size := utf8.DecodeRune(bs[i:])
			escapeRune := false

			switch {
			case r == utf8.RuneError:
				if !utf8.FullRune(bs[i:]) {
					// The tail is a valid but incomplete prefix of a
					// multi-byte rune (at most 3 bytes).
					if readErr == nil {
						// More input may complete it: keep it for the next read.
						break scan
					}
					// No more input will arrive: report the whole tail as one
					// broken sequence.
					size = len(bs) - i
				}
				if err = flushPending(); err != nil {
					escaped.HasError = true
					return escaped, err
				}
				escaped.HasBadRunes = true
				escaped.Escaped = true
				if err = writeBroken(output, bs[i:i+size]); err != nil {
					escaped.HasError = true
					return escaped, err
				}
				writePos = i + size
			case r == '\n':
				// End of line: evaluate and reset the per-line BIDI state.
				if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
					escaped.BadBIDI = true
				}
				lineHasBIDI = false
				lineHasRTLScript = false
				lineHasLTRScript = false
			case r == '\r' || r == '\t' || r == ' ':
				// These are acceptable control characters and space characters.
			case unicode.IsSpace(r):
				escaped.HasSpaces = true
				escapeRune = true
			case unicode.Is(unicode.Bidi_Control, r):
				escaped.HasBIDI = true
				lineHasBIDI = true
				escapeRune = true
			case unicode.Is(unicode.C, r):
				escaped.HasControls = true
				escapeRune = true
			case unicode.Is(unicode.M, r):
				escaped.HasMarks = true
				escapeRune = true
			default:
				p, _ := bidi.Lookup(bs[i : i+size])
				switch p.Class() {
				case bidi.R, bidi.AL:
					lineHasRTLScript = true
					escaped.HasRTLScript = true
				case bidi.L:
					lineHasLTRScript = true
					escaped.HasLTRScript = true
				}
			}

			if escapeRune {
				escaped.Escaped = true
				if err = flushPending(); err != nil {
					escaped.HasError = true
					return escaped, err
				}
				if err = writeEscaped(output, r); err != nil {
					escaped.HasError = true
					return escaped, err
				}
				writePos = i + size
			}
			i += size
		}

		// Write everything scanned but not yet written, then move any
		// incomplete trailing rune to the front of buf for the next read.
		if err = flushPending(); err != nil {
			escaped.HasError = true
			return escaped, err
		}
		carried = copy(buf, bs[i:])
	}

	if readErr != io.EOF {
		escaped.HasError = true
		return escaped, readErr
	}

	// The last line may not end with '\n', so evaluate its BIDI state here.
	if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
		escaped.BadBIDI = true
	}
	return escaped, nil
}

// writeBroken writes bs to output as upper-case hexadecimal between HTML-escaped
// angle brackets, wrapped in a <span class="broken-code-point"> element.
// It returns the error reported by the underlying write, if any.
func writeBroken(output io.Writer, bs []byte) error {
	const format = `<span class="broken-code-point">&lt;%X&gt;</span>`
	_, err := fmt.Fprintf(output, format, bs)
	return err
}

// writeEscaped writes r to output wrapped in a <span class="escaped-code-point">
// element whose data-escaped attribute holds the code point as [U+XXXX]
// (at least four upper-case hex digits); the rune itself is kept inside an
// inner <span class="char">.
// It returns the error reported by the underlying write, if any.
func writeEscaped(output io.Writer, r rune) error {
	const format = `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`
	_, err := fmt.Fprintf(output, format, r, r)
	return err
}