
transform/urlreplacers: Support unquoted URLs in canonifyURLs replacer

Fixes #5529
bep committed Dec 17, 2018
1 parent d5a0b6b commit efe0b4e5c0292c1e5e27b0c32fbc368062fde3e8
Showing with 120 additions and 94 deletions.
  1. +104 −93 transform/urlreplacers/absurlreplacer.go
  2. +16 −1 transform/urlreplacers/absurlreplacer_test.go
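For orientation (not part of the commit): the core of the change is that an unquoted attribute value is taken to end at the first '>' or whitespace rune, which is what the new posAfterURL helper in the diff below implements with bytes.IndexFunc. A minimal, self-contained sketch of that scan, with made-up names:

package main

import (
	"bytes"
	"fmt"
	"unicode"
)

// endOfUnquotedValue returns the index where an unquoted attribute value
// ends: at the first '>' or whitespace rune, or -1 if neither is found.
// Hypothetical helper, mirroring the idea behind posAfterURL.
func endOfUnquotedValue(s []byte) int {
	return bytes.IndexFunc(s, func(r rune) bool {
		return r == '>' || unicode.IsSpace(r)
	})
}

func main() {
	fmt.Println(endOfUnquotedValue([]byte("/asdf>ASDF</a>"))) // 5
	fmt.Println(endOfUnquotedValue([]byte("/asdf class=x")))  // 5
}
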
transform/urlreplacers/absurlreplacer.go

@@ -16,6 +16,7 @@ package urlreplacers
 import (
 	"bytes"
 	"io"
+	"unicode"
 	"unicode/utf8"
 
 	"github.com/gohugoio/hugo/transform"
@@ -43,7 +44,7 @@ type absurllexer struct {
 	start int // item start position
 	width int // width of last element
 
-	matchers []absURLMatcher
+	quotes [][]byte
 
 	ms      matchState
 	matches [3]bool // track matches of the 3 prefixes
@@ -140,84 +141,115 @@ func (l *absurllexer) emit() {
 	l.start = l.pos
 }
 
-// handle URLs in src and href.
-func checkCandidateBase(l *absurllexer) {
-	for _, m := range l.matchers {
-		if !bytes.HasPrefix(l.content[l.pos:], m.match) {
-			continue
-		}
-		// check for schemaless URLs
-		posAfter := l.pos + len(m.match)
-		if posAfter >= len(l.content) {
-			return
-		}
-		r, _ := utf8.DecodeRune(l.content[posAfter:])
-		if r == '/' {
-			// schemaless: skip
-			return
-		}
-		if l.pos > l.start {
-			l.emit()
-		}
-		l.pos += len(m.match)
-		l.w.Write(m.quote)
-		l.w.Write(l.path)
-		l.start = l.pos
-	}
-}
+var (
+	relURLPrefix    = []byte("/")
+	relURLPrefixLen = len(relURLPrefix)
+)
+
+func (l *absurllexer) consumeQuote() []byte {
+	for _, q := range l.quotes {
+		if bytes.HasPrefix(l.content[l.pos:], q) {
+			l.pos += len(q)
+			l.emit()
+			return q
+		}
+	}
+	return nil
+}
+
+// handle URLs in src and href.
+func checkCandidateBase(l *absurllexer) {
+	l.consumeQuote()
+
+	if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
+		return
+	}
+
+	// check for schemaless URLs
+	posAfter := l.pos + relURLPrefixLen
+	if posAfter >= len(l.content) {
+		return
+	}
+	r, _ := utf8.DecodeRune(l.content[posAfter:])
+	if r == '/' {
+		// schemaless: skip
+		return
+	}
+	if l.pos > l.start {
+		l.emit()
+	}
+	l.pos += relURLPrefixLen
+	l.w.Write(l.path)
+	l.start = l.pos
+}
+
+func (l *absurllexer) posAfterURL(q []byte) int {
+	if len(q) > 0 {
+		// look for end quote
+		return bytes.Index(l.content[l.pos:], q)
+	}
+
+	return bytes.IndexFunc(l.content[l.pos:], func(r rune) bool {
+		return r == '>' || unicode.IsSpace(r)
+	})
+}
 
 // handle URLs in srcset.
 func checkCandidateSrcset(l *absurllexer) {
-	// special case, not frequent (me think)
-	for _, m := range l.matchers {
-		if !bytes.HasPrefix(l.content[l.pos:], m.match) {
-			continue
-		}
-
-		// check for schemaless URLs
-		posAfter := l.pos + len(m.match)
-		if posAfter >= len(l.content) {
-			return
-		}
-		r, _ := utf8.DecodeRune(l.content[posAfter:])
-		if r == '/' {
-			// schemaless: skip
-			continue
-		}
-
-		posLastQuote := bytes.Index(l.content[l.pos+1:], m.quote)
-
-		// safe guard
-		if posLastQuote < 0 || posLastQuote > 2000 {
-			return
-		}
-
-		if l.pos > l.start {
-			l.emit()
-		}
-
-		section := l.content[l.pos+len(m.quote) : l.pos+posLastQuote+1]
-
-		fields := bytes.Fields(section)
-		l.w.Write(m.quote)
-		for i, f := range fields {
-			if f[0] == '/' {
-				l.w.Write(l.path)
-				l.w.Write(f[1:])
-
-			} else {
-				l.w.Write(f)
-			}
-
-			if i < len(fields)-1 {
-				l.w.Write([]byte(" "))
-			}
-		}
-
-		l.w.Write(m.quote)
-		l.pos += len(section) + (len(m.quote) * 2)
-		l.start = l.pos
-	}
+	q := l.consumeQuote()
+	if q == nil {
+		// srcset needs to be quoted.
+		return
+	}
+
+	// special case, not frequent (me think)
+	if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
+		return
+	}
+
+	// check for schemaless URLs
+	posAfter := l.pos + relURLPrefixLen
+	if posAfter >= len(l.content) {
+		return
+	}
+	r, _ := utf8.DecodeRune(l.content[posAfter:])
+	if r == '/' {
+		// schemaless: skip
+		return
+	}
+
+	posEnd := l.posAfterURL(q)
+
+	// safe guard
+	if posEnd < 0 || posEnd > 2000 {
+		return
+	}
+
+	if l.pos > l.start {
+		l.emit()
+	}
+
+	section := l.content[l.pos : l.pos+posEnd+1]
+
+	fields := bytes.Fields(section)
+	for i, f := range fields {
+		if f[0] == '/' {
+			l.w.Write(l.path)
+			l.w.Write(f[1:])
+
+		} else {
+			l.w.Write(f)
+		}
+
+		if i < len(fields)-1 {
+			l.w.Write([]byte(" "))
+		}
+	}
+
+	l.pos += len(section)
+	l.start = l.pos
 }
 
 // main loop
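
A hedged aside on the quote handling above (identifiers below are invented for the sketch, not taken from the commit): rather than matching fixed quote-plus-slash prefixes, the lexer now consumes an optional opening quote and remembers which one it saw, so the same path serves quoted and unquoted attributes; a nil quote simply means the value was unquoted.

package main

import (
	"bytes"
	"fmt"
)

// splitQuote is a hypothetical stand-in for the consumeQuote idea:
// it peels off a leading quote if present and reports which one it was.
// A nil quote means the attribute value is unquoted.
func splitQuote(s []byte, quotes [][]byte) (quote, rest []byte) {
	for _, q := range quotes {
		if bytes.HasPrefix(s, q) {
			return q, s[len(q):]
		}
	}
	return nil, s
}

func main() {
	quotes := [][]byte{[]byte(`"`), []byte(`'`)}

	q, rest := splitQuote([]byte(`"/css/site.css"`), quotes)
	fmt.Printf("%q %q\n", q, rest) // quoted value

	q, rest = splitQuote([]byte(`/css/site.css`), quotes)
	fmt.Printf("%q %q\n", q, rest) // unquoted value, q is nil
}
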
@@ -262,53 +294,32 @@ func (l *absurllexer) replace() {
 	}
 }
 
-func doReplace(path string, ct transform.FromTo, matchers []absURLMatcher) {
+func doReplace(path string, ct transform.FromTo, quotes [][]byte) {
 
 	lexer := &absurllexer{
 		content: ct.From().Bytes(),
 		w:       ct.To(),
 		path:    []byte(path),
-		matchers: matchers}
+		quotes:  quotes}
 
 	lexer.replace()
 }
 
 type absURLReplacer struct {
-	htmlMatchers []absURLMatcher
-	xmlMatchers  []absURLMatcher
+	htmlQuotes [][]byte
+	xmlQuotes  [][]byte
 }
 
 func newAbsURLReplacer() *absURLReplacer {
-
-	// HTML
-	dqHTMLMatch := []byte("\"/")
-	sqHTMLMatch := []byte("'/")
-
-	// XML
-	dqXMLMatch := []byte("&#34;/")
-	sqXMLMatch := []byte("&#39;/")
-
-	dqHTML := []byte("\"")
-	sqHTML := []byte("'")
-
-	dqXML := []byte("&#34;")
-	sqXML := []byte("&#39;")
-
 	return &absURLReplacer{
-		htmlMatchers: []absURLMatcher{
-			{dqHTMLMatch, dqHTML},
-			{sqHTMLMatch, sqHTML},
-		},
-		xmlMatchers: []absURLMatcher{
-			{dqXMLMatch, dqXML},
-			{sqXMLMatch, sqXML},
-		}}
+		htmlQuotes: [][]byte{[]byte("\""), []byte("'")},
+		xmlQuotes:  [][]byte{[]byte("&#34;"), []byte("&#39;")}}
 }
 
 func (au *absURLReplacer) replaceInHTML(path string, ct transform.FromTo) {
-	doReplace(path, ct, au.htmlMatchers)
+	doReplace(path, ct, au.htmlQuotes)
 }
 
 func (au *absURLReplacer) replaceInXML(path string, ct transform.FromTo) {
-	doReplace(path, ct, au.xmlMatchers)
+	doReplace(path, ct, au.xmlQuotes)
 }
transform/urlreplacers/absurlreplacer_test.go

@@ -156,6 +156,21 @@ func TestAbsURL(t *testing.T) {
 
 }
 
+func TestAbsURLUnqoted(t *testing.T) {
+	tr := transform.New(NewAbsURLTransformer(testBaseURL))
+
+	apply(t.Errorf, tr, []test{
+		test{
+			content:  `Link: <a href=/asdf>ASDF</a>`,
+			expected: `Link: <a href=http://base/asdf>ASDF</a>`,
+		},
+		test{
+			content:  `Link: <a href=/asdf >ASDF</a>`,
+			expected: `Link: <a href=http://base/asdf >ASDF</a>`,
+		},
+	})
+}
+
 func TestRelativeURL(t *testing.T) {
 	tr := transform.New(NewAbsURLTransformer(helpers.GetDottedRelativePath(filepath.FromSlash("/post/sub/"))))
 
@@ -176,7 +191,7 @@ func TestAbsXMLURLSrcSet(t *testing.T) {
 }
 
 func BenchmarkXMLAbsURL(b *testing.B) {
-	tr := transform.New(NewAbsURLInXMLTransformer(""))
+	tr := transform.New(NewAbsURLInXMLTransformer(testBaseURL))
 
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
