html/htmlutil/tokenizer.go

package htmlutil

import (
	"bytes"
	"errors"
	"io"
	"os"

	"github.com/grokify/mogo/type/stringsutil"
	"golang.org/x/net/html"
)

var (
	ErrTokenNotFound           = errors.New("token(s) not found")
	ErrTokenizerNotInitialized = errors.New("tokenizer not initialized")
)

func NewTokenizerBytes(b []byte) *html.Tokenizer {
	return html.NewTokenizer(bytes.NewReader(b))
}

func NewTokenizerFile(name string) (*html.Tokenizer, error) {
	b, err := os.ReadFile(name)
	if err != nil {
		return nil, err
	}
	return NewTokenizerBytes(b), nil
}

/*
// TokensBetweenAtom returns the tokens that represent the `innerHtml`
// between a start and end tag token.
func TokensBetweenAtom(z *html.Tokenizer, skipErrors, inclusive bool, htmlAtom atom.Atom) ([]html.Token, error) {
	return TokensBetween(z, skipErrors, inclusive,
		TokenFilters{{
			TokenType: html.StartTagToken,
			AtomSet:   NewAtomSet(htmlAtom)}},
		TokenFilters{{
			TokenType: html.EndTagToken,
			AtomSet:   NewAtomSet(htmlAtom)}})
}
*/

/*
func TokensBetween(z *html.Tokenizer, skipErrors, inclusive bool, begin, end TokenFilters) ([]html.Token, error) {
	tokens := []html.Token{}
	tmsBegin, err := NextTokenMatch(z, skipErrors, false, false, begin...)
	if err != nil {
		return tokens, err
	}
	if inclusive {
		tokens = append(tokens, tmsBegin...)
	}
	tokensChain, err := NextTokenMatch(z, skipErrors, true, inclusive, end...)
	if err != nil {
		return tokens, err
	}
	tokens = append(tokens, tokensChain...)
	return tokens, nil
}
*/

/*

func TokensBetweenNew(z *html.Tokenizer, skipErrors, inclusive bool, begin, end []html.Token) ([]html.Token, error) {
	begFilters := Tokens(begin)
	endFilter := Tokens(end)

	tokens := []html.Token{}
	tmsBegin, err := NextToken(z, skipErrors, begin...)
	if err != nil {
		return tokens, err
	}
	if inclusive {
		tokens = append(tokens, tmsBegin...)
	}
	tokensChain, err := NextTokenMatch(z, skipErrors, true, inclusive, end...)
	if err != nil {
		return tokens, err
	}
	tokens = append(tokens, tokensChain...)
	return tokens, nil
}
*/

func NextToken(z *html.Tokenizer, skipErrors bool, tokFilters ...html.Token) (html.Token, error) {
	opts := NextTokensOpts{
		SkipErrors:     skipErrors,
		IncludeChain:   false,
		InclusiveMatch: true,
		StartFilter:    []html.Token{},
		EndFilter:      tokFilters,
	}
	toks, err := NextTokens(z, opts)
	if err != nil {
		return html.Token{}, err
	} else if len(toks) == 0 {
		return html.Token{}, ErrTokenNotFound
	} else if len(toks) > 1 {
		panic("too many tokens (>1) found")
	}
	return toks[0], nil
}

type NextTokensOpts struct {
	SkipErrors               bool
	IncludeChain             bool
	InclusiveMatch           bool
	StartFilter              Tokens
	StartAttributeValueMatch *stringsutil.MatchInfo
	EndFilter                Tokens
}

func NextTokens(z *html.Tokenizer, opts NextTokensOpts) (Tokens, error) {
	// func NextTokens(z *html.Tokenizer, skipErrors, includeChain, includeMatch bool, start, end []html.Token) ([]html.Token, error) {
	matches := []html.Token{}
	if z == nil {
		return matches, ErrTokenizerNotInitialized
	}
	if opts.StartAttributeValueMatch == nil {
		opts.StartAttributeValueMatch = &stringsutil.MatchInfo{
			MatchType: stringsutil.MatchExact,
		}
	}
	foundStartToken := false
	if len(opts.StartFilter) == 0 {
		foundStartToken = true
	}

	for {
		ttThis := z.Next()
		switch ttThis {
		case html.ErrorToken:
			err := z.Err()
			if err == io.EOF {
				return matches, nil
			} else if !opts.SkipErrors {
				return matches, err
			}
		default:
			tok := z.Token()
			if foundStartToken {
				if opts.EndFilter.MatchLeft(tok, nil) {
					if opts.InclusiveMatch {
						matches = append(matches, tok)
					}
					return matches, nil
				} else if opts.IncludeChain {
					matches = append(matches, tok)
				}
			} else {
				if opts.StartFilter.MatchLeft(tok, opts.StartAttributeValueMatch) {
					if opts.InclusiveMatch {
						matches = append(matches, tok)
					}
					foundStartToken = true
				}
			}
		}
	}
}

// NextTextToken uses `NextTokensOpts` specifically for `SkipErrors`, `StartFilter`, and `StartAttributeValueMatch`.
func NextTextToken(z *html.Tokenizer, opts NextTokensOpts) (html.Token, error) {
	if z == nil {
		return html.Token{}, ErrTokenizerNotInitialized
	}
	if opts.StartAttributeValueMatch == nil {
		opts.StartAttributeValueMatch = &stringsutil.MatchInfo{
			MatchType: stringsutil.MatchExact,
		}
	}
	foundStartToken := false
	for {
		tokType := z.Next()
		tok := z.Token()
		if tokType == html.ErrorToken {
			err := z.Err()
			if err == io.EOF {
				return tok, ErrTokenNotFound
			} else if !opts.SkipErrors {
				return tok, err
			}
		} else if tokType == html.TextToken &&
			len(opts.StartFilter) == 0 || foundStartToken {
			return tok, nil
		} else if opts.StartFilter.MatchLeft(tok, opts.StartAttributeValueMatch) {
			foundStartToken = true
		}
	}
}

/*
// NextTokenMatch returns a string of matches. `includeMatch` is only used when `includeChain` is included.
func NextTokenMatch(z *html.Tokenizer, skipErrors, includeChain, includeMatch bool, filters ...TokenFilter) ([]html.Token, error) {
	matches := []html.Token{}
	if len(filters) == 0 {
		return matches, errors.New("no filters provided")
	}
	filtersMore := TokenFilters(filters)
	for {
		tt := z.Next()
		token := z.Token()
		if token.Type == html.ErrorToken {
			break
		}
		filtersForType := filtersMore.ByTokenType(tt)
		if len(filtersForType) > 0 {
			for _, filter := range filtersForType {
				if filter.AtomSet.Len() == 0 {
					if !includeChain || includeMatch {
						matches = append(matches, token)
					}
					return matches, nil
				} else if filter.AtomSet.Exists(token.DataAtom) {
					if !includeChain || includeMatch {
						matches = append(matches, token)
					}
					return matches, nil
				}
			}
		}
		if includeChain {
			matches = append(matches, token)
		}
	}
	return matches, nil
}
*/

/*
func NextStartToken(z *html.Tokenizer, skipErrors bool, htmlAtoms ...atom.Atom) (html.Token, error) {
	if len(htmlAtoms) == 0 {
		return html.Token{}, errors.New("no atoms requested")
	}
	atoms := NewAtomSet(htmlAtoms...)
	for {
		ttThis := z.Next()
		switch ttThis {
		case html.ErrorToken:
			err := z.Err()
			if z.Err() == io.EOF {
				return html.Token{}, ErrTokenNotFound
			} else if !skipErrors {
				return html.Token{}, err
			}
		case html.StartTagToken:
			tok := z.Token()
			if atoms.Exists(tok.DataAtom) {
				return tok, nil
			}
		}
	}
}
*/

/*
func NextTextToken(z *html.Tokenizer, skipErrors bool, htmlAtoms ...atom.Atom) (html.Token, error) {
	atoms := NewAtomSet(htmlAtoms...)
	for {
		tokType := z.Next()
		tok := z.Token()
		if tokType == html.ErrorToken {
			err := z.Err()
			if err == io.EOF {
				return tok, ErrTokenNotFound
			} else if !skipErrors {
				return tok, err
			}
		} else if atoms.Len() == 0 && tokType == html.TextToken {
			return tok, nil
		} else if atoms.Len() > 0 &&
			tokType == html.StartTagToken &&
			atoms.Exists(tok.DataAtom) {
			return NextTextToken(z, skipErrors)
		}
	}
}
*/

/*
func NextTextToken(z *html.Tokenizer, skipErrors bool, start []html.Token) (html.Token, error) {
	_, err := NextToken(z, skipErrors, start...)
	if err != nil {
		return html.Token{}, err
	}
	return NextToken(z, skipErrors, html.Token{Type: html.TextToken})
}
*/