Skip to content

Commit

Permalink
publicsuffix: embed table data
Browse files Browse the repository at this point in the history
Use //go:embed to embed the public suffix tables,
rather than generating .go files containing the data.

In an experiment that created an empty git repo and generated a
commit for each of the last 20 updates to the public suffix list,
the total size of the repository directory, as measured by "du -sh",
decreased from 2.2M to 668K when using embedding.

For golang/go#15518.

Change-Id: Id71759765831a7699e7a182937095b3820bb643b
Reviewed-on: https://go-review.googlesource.com/c/net/+/450935
Run-TryBot: Damien Neil <dneil@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
Reviewed-by: Nigel Tao (INACTIVE; USE @golang.org INSTEAD) <nigeltao@google.com>
  • Loading branch information
neild committed Nov 16, 2022
1 parent ecf091a commit 0833b63
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 11,018 deletions.
Binary file added publicsuffix/data/children
Binary file not shown.
Binary file added publicsuffix/data/nodes
Binary file not shown.
1 change: 1 addition & 0 deletions publicsuffix/data/text

Large diffs are not rendered by default.

200 changes: 97 additions & 103 deletions publicsuffix/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ package main
import (
"bufio"
"bytes"
"encoding/binary"
"flag"
"fmt"
"go/format"
Expand Down Expand Up @@ -55,6 +56,7 @@ const (
)

var (
combinedText string
maxChildren int
maxTextOffset int
maxTextLength int
Expand Down Expand Up @@ -115,11 +117,10 @@ var (
shaRE = regexp.MustCompile(`"sha":"([^"]+)"`)
dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)

comments = flag.Bool("comments", false, "generate table.go comments, for debugging")
subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
v = flag.Bool("v", false, "verbose output (to stderr)")
version = flag.String("version", "", "the effective_tld_names.dat version")
subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
v = flag.Bool("v", false, "verbose output (to stderr)")
version = flag.String("version", "", "the effective_tld_names.dat version")
)

func main() {
Expand Down Expand Up @@ -254,7 +255,33 @@ func main1() error {
}
sort.Strings(labelsList)

if err := generate(printReal, &root, "table.go"); err != nil {
combinedText = combineText(labelsList)
if combinedText == "" {
return fmt.Errorf("internal error: combineText returned no text")
}
for _, label := range labelsList {
offset, length := strings.Index(combinedText, label), len(label)
if offset < 0 {
return fmt.Errorf("internal error: could not find %q in text %q", label, combinedText)
}
maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
if offset >= 1<<nodesBitsTextOffset {
return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
}
if length >= 1<<nodesBitsTextLength {
return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
}
labelEncoding[label] = uint64(offset)<<nodesBitsTextLength | uint64(length)
}

if err := root.walk(assignIndexes); err != nil {
return err
}

if err := generate(printMetadata, &root, "table.go"); err != nil {
return err
}
if err := generateBinaryData(&root, combinedText); err != nil {
return err
}
if err := generate(printTest, &root, "table_test.go"); err != nil {
Expand Down Expand Up @@ -307,18 +334,63 @@ func printTest(w io.Writer, n *node) error {
fmt.Fprintf(w, "%q,\n", rule)
}
fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
if err := n.walk(w, printNodeLabel); err != nil {
if err := n.walk(func(n *node) error {
return printNodeLabel(w, n)
}); err != nil {
return err
}
fmt.Fprintf(w, "}\n")
return nil
}

func printReal(w io.Writer, n *node) error {
// generateBinaryData writes the three data files, in this order:
//
//   - data/text holds combinedText verbatim.
//   - data/nodes holds the appendNodeEncoding bytes of every child of every
//     node visited by root.walk, in visiting order.
//   - data/children holds each childrenEncoding entry as a big-endian uint32.
//
// It stops at, and returns, the first error from walking or writing; files
// written before the failure are left in place.
func generateBinaryData(root *node, combinedText string) error {
	const perm = 0666

	if err := os.WriteFile("data/text", []byte(combinedText), perm); err != nil {
		return err
	}

	var nodeBytes []byte
	collectChildren := func(parent *node) error {
		for _, child := range parent.children {
			nodeBytes = appendNodeEncoding(nodeBytes, child)
		}
		return nil
	}
	if err := root.walk(collectChildren); err != nil {
		return err
	}
	if err := os.WriteFile("data/nodes", nodeBytes, perm); err != nil {
		return err
	}

	var childBytes []byte
	for i := range childrenEncoding {
		childBytes = binary.BigEndian.AppendUint32(childBytes, childrenEncoding[i])
	}
	return os.WriteFile("data/children", childBytes, perm)
}

// appendNodeEncoding appends the packed encoding of n to b and returns the
// extended slice.
//
// The packed value starts from labelEncoding[n.label] in the low bits, sets
// the ICANN bit directly above the text offset/length fields when n.icann is
// true, and places n.childrenIndex above the ICANN field. It is emitted one
// byte at a time, most significant byte first, starting at bit nodesBits-8.
func appendNodeEncoding(b []byte, n *node) []byte {
	icannShift := nodesBitsTextLength + nodesBitsTextOffset
	childrenShift := icannShift + nodesBitsICANN

	packed := labelEncoding[n.label]
	if n.icann {
		packed |= 1 << icannShift
	}
	packed |= uint64(n.childrenIndex) << childrenShift

	for shift := nodesBits - 8; shift >= 0; shift -= 8 {
		b = append(b, byte(packed>>shift))
	}
	return b
}

func printMetadata(w io.Writer, n *node) error {
const header = `// generated by go run gen.go; DO NOT EDIT
package publicsuffix
import _ "embed"
const version = %q
const (
Expand All @@ -343,74 +415,36 @@ const (
// numTLD is the number of top level domains.
const numTLD = %d
// text is the combined text of all labels.
//
//go:embed data/text
var text string
`
fmt.Fprintf(w, header, *version,
nodesBits,
nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))

text := combineText(labelsList)
if text == "" {
return fmt.Errorf("internal error: makeText returned no text")
}
for _, label := range labelsList {
offset, length := strings.Index(text, label), len(label)
if offset < 0 {
return fmt.Errorf("internal error: could not find %q in text %q", label, text)
}
maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
if offset >= 1<<nodesBitsTextOffset {
return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
}
if length >= 1<<nodesBitsTextLength {
return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
}
labelEncoding[label] = uint64(offset)<<nodesBitsTextLength | uint64(length)
}
fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
for len(text) > 0 {
n, plus := len(text), ""
if n > 64 {
n, plus = 64, " +"
}
fmt.Fprintf(w, "%q%s\n", text[:n], plus)
text = text[n:]
}

if err := n.walk(w, assignIndexes); err != nil {
return err
}

fmt.Fprintf(w, `
// nodes is the list of nodes. Each node is represented as a %v-bit integer,
// which encodes the node's children, wildcard bit and node type (as an index
// into the children array), ICANN bit and text.
//
// If the table was generated with the -comments flag, there is a //-comment
// after each node's data. In it is the nodes-array indexes of the children,
// formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
// nodeType is printed as + for normal, ! for exception, and o for parent-only
// nodes that have children but don't match a domain label in their own right.
// An I denotes an ICANN domain.
//
// The layout within the node, from MSB to LSB, is:
// [%2d bits] unused
// [%2d bits] children index
// [%2d bits] ICANN bit
// [%2d bits] text index
// [%2d bits] text length
var nodes = [...]uint8{
//
//go:embed data/nodes
var nodes uint40String
`,
nodesBits,
nodesBits-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
if err := n.walk(w, printNode); err != nil {
return err
}
fmt.Fprintf(w, `}
fmt.Fprintf(w, `
// children is the list of nodes' children, the parent's wildcard bit and the
// parent's node type. If a node has no children then their children index
// will be in the range [0, 6), depending on the wildcard bit and node type.
Expand All @@ -421,27 +455,13 @@ var nodes = [...]uint8{
// [%2d bits] node type
// [%2d bits] high nodes index (exclusive) of children
// [%2d bits] low nodes index (inclusive) of children
var children=[...]uint32{
//
//go:embed data/children
var children uint32String
`,
32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
for i, c := range childrenEncoding {
s := "---------------"
lo := c & (1<<childrenBitsLo - 1)
hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
if lo != hi {
s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
}
nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
if *comments {
fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
} else {
fmt.Fprintf(w, "0x%x,\n", c)
}
}
fmt.Fprintf(w, "}\n\n")

fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
Expand All @@ -465,12 +485,12 @@ type node struct {
children []*node
}

func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
if err := f(w, n); err != nil {
func (n *node) walk(f func(*node) error) error {
if err := f(n); err != nil {
return err
}
for _, c := range n.children {
if err := c.walk(w, f); err != nil {
if err := c.walk(f); err != nil {
return err
}
}
Expand Down Expand Up @@ -516,7 +536,7 @@ var childrenEncoding = []uint32{

var firstCallToAssignIndexes = true

func assignIndexes(w io.Writer, n *node) error {
func assignIndexes(n *node) error {
if len(n.children) != 0 {
// Assign nodesIndex.
n.firstChild = nextNodesIndex
Expand Down Expand Up @@ -561,32 +581,6 @@ func assignIndexes(w io.Writer, n *node) error {
return nil
}

// printNode writes one line of Go source to w for each child of n.
//
// Each line holds the child's packed node encoding as comma-separated hex
// byte literals, most significant byte first. When the -comments flag is
// set, the line ends with a //-comment giving the child's nodes index,
// children index, the nodes-index range of its own children (or a dashed
// placeholder when it has none), wildcard marker, node type, ICANN marker
// and label. It always returns nil.
func printNode(w io.Writer, n *node) error {
	for _, child := range n.children {
		packed := labelEncoding[child.label]
		if child.icann {
			packed |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
		}
		packed |= uint64(child.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
		for shift := nodesBits - 8; shift >= 0; shift -= 8 {
			fmt.Fprintf(w, "0x%02x, ", (packed>>shift)&0xff)
		}

		if !*comments {
			fmt.Fprintf(w, "\n")
			continue
		}

		// The range text is only needed for the debugging comment.
		childRange := "---------------"
		if count := len(child.children); count != 0 {
			childRange = fmt.Sprintf("n0x%04x-n0x%04x", child.firstChild, child.firstChild+count)
		}
		fmt.Fprintf(w, "// n0x%04x c0x%04x (%s)%s %s %s %s\n",
			child.nodesIndex, child.childrenIndex, childRange, wildcardStr(child.wildcard),
			nodeTypeStr(child.nodeType), icannStr(child.icann), child.label,
		)
	}
	return nil
}

func printNodeLabel(w io.Writer, n *node) error {
for _, c := range n.children {
fmt.Fprintf(w, "%q,\n", c.label)
Expand Down
36 changes: 24 additions & 12 deletions publicsuffix/list.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,10 @@ loop:
break
}

u := uint32(nodeValue(f) >> (nodesBitsTextOffset + nodesBitsTextLength))
u := uint32(nodes.get(f) >> (nodesBitsTextOffset + nodesBitsTextLength))
icannNode = u&(1<<nodesBitsICANN-1) != 0
u >>= nodesBitsICANN
u = children[u&(1<<nodesBitsChildren-1)]
u = children.get(u & (1<<nodesBitsChildren - 1))
lo = u & (1<<childrenBitsLo - 1)
u >>= childrenBitsLo
hi = u & (1<<childrenBitsHi - 1)
Expand Down Expand Up @@ -154,18 +154,9 @@ func find(label string, lo, hi uint32) uint32 {
return notFound
}

// nodeValue returns the i'th entry of the nodes table as a big-endian
// integer assembled from five consecutive bytes.
//
// The byte offset is computed in uint64 so that a very large i cannot wrap
// around in uint32 arithmetic and silently read a different entry; an
// out-of-range i panics with an index-out-of-range error instead.
func nodeValue(i uint32) uint64 {
	off := uint64(i) * (nodesBits / 8)
	return uint64(nodes[off])<<32 |
		uint64(nodes[off+1])<<24 |
		uint64(nodes[off+2])<<16 |
		uint64(nodes[off+3])<<8 |
		uint64(nodes[off+4])
}

// nodeLabel returns the label for the i'th node.
func nodeLabel(i uint32) string {
x := nodeValue(i)
x := nodes.get(i)
length := x & (1<<nodesBitsTextLength - 1)
x >>= nodesBitsTextLength
offset := x & (1<<nodesBitsTextOffset - 1)
Expand All @@ -189,3 +180,24 @@ func EffectiveTLDPlusOne(domain string) (string, error) {
}
return domain[1+strings.LastIndex(domain[:i], "."):], nil
}

// uint32String is a string viewed as a packed sequence of big-endian
// uint32 values, four bytes per entry.
type uint32String string

// get returns the i'th big-endian uint32 stored in u.
//
// The byte offset is computed in uint64 so that i >= 1<<30 cannot wrap
// around in uint32 arithmetic and silently alias an earlier entry; an
// out-of-range i panics with an index-out-of-range error instead.
func (u uint32String) get(i uint32) uint32 {
	off := uint64(i) * 4
	return (uint32(u[off])<<24 |
		uint32(u[off+1])<<16 |
		uint32(u[off+2])<<8 |
		uint32(u[off+3]))
}

// uint40String is a string viewed as a packed sequence of big-endian
// integers, nodesBits/8 bytes per entry.
type uint40String string

// get returns the i'th entry of u, assembled big-endian from five
// consecutive bytes.
//
// The byte offset is computed in uint64 so that a very large i cannot wrap
// around in uint32 arithmetic and silently read a different entry; an
// out-of-range i panics with an index-out-of-range error instead.
func (u uint40String) get(i uint32) uint64 {
	off := uint64(i) * (nodesBits / 8)
	return uint64(u[off])<<32 |
		uint64(u[off+1])<<24 |
		uint64(u[off+2])<<16 |
		uint64(u[off+3])<<8 |
		uint64(u[off+4])
}
Loading

0 comments on commit 0833b63

Please sign in to comment.