Skip to content

Commit

Permalink
parser/pageparser: Add front matter etc. support
Browse files Browse the repository at this point in the history
See #5324
  • Loading branch information
bep committed Oct 22, 2018
1 parent f6863e1 commit 2fdc4a2
Show file tree
Hide file tree
Showing 4 changed files with 344 additions and 67 deletions.
18 changes: 13 additions & 5 deletions parser/pageparser/item.go
Expand Up @@ -73,10 +73,10 @@ func (i Item) String() string {
return i.Val return i.Val
case i.typ > tKeywordMarker: case i.typ > tKeywordMarker:
return fmt.Sprintf("<%s>", i.Val) return fmt.Sprintf("<%s>", i.Val)
case len(i.Val) > 20: case len(i.Val) > 50:
return fmt.Sprintf("%.20q...", i.Val) return fmt.Sprintf("%v:%.20q...", i.typ, i.Val)
} }
return fmt.Sprintf("[%s]", i.Val) return fmt.Sprintf("%v:[%s]", i.typ, i.Val)
} }


type itemType int type itemType int
Expand All @@ -85,6 +85,15 @@ const (
tError itemType = iota tError itemType = iota
tEOF tEOF


// page items
tHTMLLead // <
tSummaryDivider // <!--more-->
tSummaryDividerOrg // # more
tFrontMatterYAML
tFrontMatterTOML
tFrontMatterJSON
tFrontMatterORG

// shortcode items // shortcode items
tLeftDelimScNoMarkup tLeftDelimScNoMarkup
tRightDelimScNoMarkup tRightDelimScNoMarkup
Expand All @@ -95,8 +104,7 @@ const (
tScParam tScParam
tScParamVal tScParamVal


//itemIdentifier tText // plain text
tText // plain text, used for everything outside the shortcodes


// preserved for later - keywords come after this // preserved for later - keywords come after this
tKeywordMarker tKeywordMarker
Expand Down
246 changes: 224 additions & 22 deletions parser/pageparser/pagelexer.go
Expand Up @@ -44,13 +44,15 @@ type lexerShortcodeState struct {
} }


type pageLexer struct { type pageLexer struct {
name string input string
input string stateStart stateFunc
state stateFunc state stateFunc
pos pos // input position pos pos // input position
start pos // item start position start pos // item start position
width pos // width of last element width pos // width of last element
lastPos pos // position of the last item returned by nextItem lastPos pos // position of the last item returned by nextItem

contentSections int


lexerShortcodeState lexerShortcodeState


Expand All @@ -63,18 +65,18 @@ func Parse(s string) *Tokens {
} }


func ParseFrom(s string, from int) *Tokens { func ParseFrom(s string, from int) *Tokens {
lexer := newPageLexer("default", s, pos(from)) lexer := newPageLexer(s, pos(from), lexMainSection) // TODO(bep) 2errors
lexer.run() lexer.run()
return &Tokens{lexer: lexer} return &Tokens{lexer: lexer}
} }


// note: the input position here is normally 0 (start), but // note: the input position here is normally 0 (start), but
// can be set if position of first shortcode is known // can be set if position of first shortcode is known
func newPageLexer(name, input string, inputPosition pos) *pageLexer { func newPageLexer(input string, inputPosition pos, stateStart stateFunc) *pageLexer {
lexer := &pageLexer{ lexer := &pageLexer{
name: name, input: input,
input: input, pos: inputPosition,
pos: inputPosition, stateStart: stateStart,
lexerShortcodeState: lexerShortcodeState{ lexerShortcodeState: lexerShortcodeState{
currLeftDelimItem: tLeftDelimScNoMarkup, currLeftDelimItem: tLeftDelimScNoMarkup,
currRightDelimItem: tRightDelimScNoMarkup, currRightDelimItem: tRightDelimScNoMarkup,
Expand All @@ -88,14 +90,13 @@ func newPageLexer(name, input string, inputPosition pos) *pageLexer {


// main loop // main loop
func (l *pageLexer) run() *pageLexer { func (l *pageLexer) run() *pageLexer {
for l.state = lexTextOutsideShortcodes; l.state != nil; { for l.state = l.stateStart; l.state != nil; {
l.state = l.state(l) l.state = l.state(l)
} }
return l return l
} }


// state functions // Shortcode syntax

const ( const (
leftDelimScNoMarkup = "{{<" leftDelimScNoMarkup = "{{<"
rightDelimScNoMarkup = ">}}" rightDelimScNoMarkup = ">}}"
Expand All @@ -105,6 +106,12 @@ const (
rightComment = "*/" rightComment = "*/"
) )


// Page syntax
const (
summaryDivider = "<!--more-->"
summaryDividerOrg = "# more"
)

func (l *pageLexer) next() rune { func (l *pageLexer) next() rune {
if int(l.pos) >= len(l.input) { if int(l.pos) >= len(l.input) {
l.width = 0 l.width = 0
Expand Down Expand Up @@ -178,11 +185,21 @@ func (l *pageLexer) nextItem() Item {
return item return item
} }


// scans until an opening shortcode opening bracket. func (l *pageLexer) consumeCRLF() bool {
// if no shortcodes, it will keep on scanning until EOF var consumed bool
func lexTextOutsideShortcodes(l *pageLexer) stateFunc { for _, r := range crLf {
if l.next() != r {
l.backup()
} else {
consumed = true
}
}
return consumed
}

func lexMainSection(l *pageLexer) stateFunc {
for { for {
if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) { if l.isShortCodeStart() {
if l.pos > l.start { if l.pos > l.start {
l.emit(tText) l.emit(tText)
} }
Expand All @@ -194,12 +211,79 @@ func lexTextOutsideShortcodes(l *pageLexer) stateFunc {
l.currRightDelimItem = tRightDelimScNoMarkup l.currRightDelimItem = tRightDelimScNoMarkup
} }
return lexShortcodeLeftDelim return lexShortcodeLeftDelim
}


if l.contentSections <= 1 {
if strings.HasPrefix(l.input[l.pos:], summaryDivider) {
if l.pos > l.start {
l.emit(tText)
}
l.contentSections++
l.pos += pos(len(summaryDivider))
l.emit(tSummaryDivider)
} else if strings.HasPrefix(l.input[l.pos:], summaryDividerOrg) {
if l.pos > l.start {
l.emit(tText)
}
l.contentSections++
l.pos += pos(len(summaryDividerOrg))
l.emit(tSummaryDividerOrg)
}
} }
if l.next() == eof {
r := l.next()
if r == eof {
break break
} }

} }

return lexDone

}

// isShortCodeStart reports whether the remaining input begins with one of
// the two shortcode opening delimiters ("{{%" or "{{<").
func (l *pageLexer) isShortCodeStart() bool {
	rest := l.input[l.pos:]
	if strings.HasPrefix(rest, leftDelimScWithMarkup) {
		return true
	}
	return strings.HasPrefix(rest, leftDelimScNoMarkup)
}

// lexIntroSection is the entry state for a full page: it skips leading
// whitespace and decides what kind of document this is from the first
// significant rune:
//   - '+' / '-' open a TOML/YAML front matter fence ("+++" / "---")
//   - '{' opens JSON front matter
//   - '#' opens Org mode front matter ("#+KEY:" lines)
//   - '<' means plain HTML: the rest of the input is emitted as one text item
//
// Any other non-space rune is an error.
func lexIntroSection(l *pageLexer) stateFunc {
LOOP:
	for {
		r := l.next()
		if r == eof {
			break
		}

		switch {
		case r == '+':
			return l.lexFrontMatterSection(tFrontMatterTOML, r, "TOML", "+++")
		case r == '-':
			return l.lexFrontMatterSection(tFrontMatterYAML, r, "YAML", "---")
		case r == '{':
			return lexFrontMatterJSON
		case r == '#':
			return lexFrontMatterOrgMode
		case !isSpace(r) && !isEndOfLine(r):
			if r == '<' {
				l.emit(tHTMLLead)
				// No need to look further. Hugo treats this as plain HTML:
				// no front matter, no shortcodes, no nothing.
				l.pos = pos(len(l.input))
				l.emit(tText)
				break LOOP
			}
			return l.errorf("failed to detect front matter type; got unknown identifier %q", r)
		}
	}

	// Mark that we are already inside the first content section, so
	// lexMainSection will still look for a summary divider.
	l.contentSections = 1

	// Now move on to the shortcodes.
	return lexMainSection
}

func lexDone(l *pageLexer) stateFunc {

// Done! // Done!
if l.pos > l.start { if l.pos > l.start {
l.emit(tText) l.emit(tText)
Expand All @@ -208,6 +292,122 @@ func lexTextOutsideShortcodes(l *pageLexer) stateFunc {
return nil return nil
} }


// lexFrontMatterJSON scans a JSON front matter object. It is entered with
// the opening '{' already consumed; it steps back so the brace is included
// in the emitted item, then tracks brace depth until the object closes.
func lexFrontMatterJSON(l *pageLexer) stateFunc {
	// Include the left delimiter in the emitted item.
	l.backup()

	var inString bool
	depth := 0

	for {
		r := l.next()

		switch r {
		case eof:
			return l.errorf("unexpected EOF parsing JSON front matter")
		case '{':
			if !inString {
				depth++
			}
		case '}':
			if !inString {
				depth--
			}
		case '"':
			inString = !inString
		case '\\':
			// Swallow the escaped rune so an escaped quote cannot toggle
			// the in-string state.
			l.next()
		}

		if depth == 0 {
			break
		}
	}

	l.consumeCRLF()
	l.emit(tFrontMatterJSON)

	return lexMainSection
}

// lexFrontMatterOrgMode scans Org mode front matter: a run of lines that
// each start with "#+", e.g.
//
//	#+TITLE: Test File For chaseadamsio/goorgeous
//	#+AUTHOR: Chase Adams
//	#+DESCRIPTION: Just another golang parser for org content!
func lexFrontMatterOrgMode(l *pageLexer) stateFunc {
	const prefix = "#+"

	// Include the leading '#' in the emitted item.
	l.backup()

	if !strings.HasPrefix(l.input[l.pos:], prefix) {
		// TODO(bep) consider error
		return lexMainSection
	}

	// Consume lines for as long as the next line keeps the "#+" prefix.
	for {
		r := l.next()
		if r == eof {
			break
		}
		if r == '\n' && !strings.HasPrefix(l.input[l.pos:], prefix) {
			break
		}
	}

	l.emit(tFrontMatterORG)

	return lexMainSection
}

// lexFrontMatterSection handles YAML or TOML front matter fenced by delim
// ("---" or "+++"). The caller has already consumed the first delimiter
// rune (delimr); name is only used in error messages. The delimiter lines
// themselves are excluded from the emitted item of type tp.
func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name, delim string) stateFunc {
	// The first delimiter rune was consumed by the caller; expect two more.
	for i := 0; i < 2; i++ {
		if r := l.next(); r != delimr {
			return l.errorf("invalid %s delimiter", name)
		}
	}

	// The opening delimiter must be followed by a line break.
	if !l.consumeCRLF() {
		return l.errorf("invalid %s delimiter", name)
	}

	// We don't care about the delimiters.
	l.ignore()

	for {
		r := l.next()
		if r == eof {
			return l.errorf("EOF looking for end %s front matter delimiter", name)
		}
		if isEndOfLine(r) {
			if strings.HasPrefix(l.input[l.pos:], delim) {
				l.emit(tp)
				// Skip the closing delimiter by its actual length instead of
				// a hard-coded 3, so this also works for delimiters of other
				// lengths. (Both current delimiters happen to be 3 bytes.)
				l.pos += pos(len(delim))
				l.consumeCRLF()
				l.ignore()
				break
			}
		}
	}

	return lexMainSection
}

func lexShortcodeLeftDelim(l *pageLexer) stateFunc { func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
l.pos += pos(len(l.currentLeftShortcodeDelim())) l.pos += pos(len(l.currentLeftShortcodeDelim()))
if strings.HasPrefix(l.input[l.pos:], leftComment) { if strings.HasPrefix(l.input[l.pos:], leftComment) {
Expand All @@ -234,14 +434,14 @@ func lexShortcodeComment(l *pageLexer) stateFunc {
l.ignore() l.ignore()
l.pos += pos(len(l.currentRightShortcodeDelim())) l.pos += pos(len(l.currentRightShortcodeDelim()))
l.emit(tText) l.emit(tText)
return lexTextOutsideShortcodes return lexMainSection
} }


func lexShortcodeRightDelim(l *pageLexer) stateFunc { func lexShortcodeRightDelim(l *pageLexer) stateFunc {
l.closingState = 0 l.closingState = 0
l.pos += pos(len(l.currentRightShortcodeDelim())) l.pos += pos(len(l.currentRightShortcodeDelim()))
l.emit(l.currentRightShortcodeDelimItem()) l.emit(l.currentRightShortcodeDelimItem())
return lexTextOutsideShortcodes return lexMainSection
} }


// either: // either:
Expand Down Expand Up @@ -485,6 +685,8 @@ func isAlphaNumericOrHyphen(r rune) bool {
return isAlphaNumeric(r) || r == '-' return isAlphaNumeric(r) || r == '-'
} }


var crLf = []rune{'\r', '\n'}

func isEndOfLine(r rune) bool { func isEndOfLine(r rune) bool {
return r == '\r' || r == '\n' return r == '\r' || r == '\n'
} }
Expand Down

0 comments on commit 2fdc4a2

Please sign in to comment.