regexp.go

package util

import (
	"regexp"
	"sort"
	"strings"
	"unicode/utf8"
)

var level = 0

// func init() { log.SetFlags(0) }
//
// func debugf(f string, vs ...interface{}) {
// 	log.Printf(strings.Repeat("  ", level)+f, vs...)
// }

// ShortRegexpString tries to construct a short regexp that matches exactly the
// provided strings and nothing else.
//
// Warning: the current implementation may use a lot of time of memory.
func ShortRegexpString(vs ...string) (res string) {
	cache := make(map[string][]string)
	return render(shortRegexpString(vs, cache), false)
}

func shortRegexpString(vs []string, cache map[string][]string) (res []string) {
	// Canonicalize (might turn the input into one of the trivial cases below)
	if len(vs) > 1 {
		sort.Strings(vs)
		vs = removeDups(vs)
	}

	// Trivial cases.
	switch len(vs) {
	case 0:
		return nil
	case 1:
		return []string{regexp.QuoteMeta(vs[0])} // Nothing else to do.
	}

	// level++
	// defer func(s string) {
	// 	level--
	// 	debugf("ShortRegexpString(%s) = %#q", s, res)
	// }(fmt.Sprintf("%#q", vs))

	// The one to beat: just put ORs between them (after escaping meta-characters)
	best := make([]string, len(vs))
	for i := range vs {
		best[i] = regexp.QuoteMeta(vs[i])
	}

	cacheKey := render(best, false)
	bestCost := len(cacheKey)

	if cached, ok := cache[cacheKey]; ok {
		return cached
	}
	defer func(key string) {
		// Put clauses in a canonical order and cache them.
		sort.Strings(res)
		cache[key] = res
	}(cacheKey)

	recurse := func(prefix, suffix string, data commonSs) (result []string) {
		// debugf("> recurse(%#q, %#q, %v) on %#q", prefix, suffix, data, vs)
		// defer func() {
		// 	debugf("  recurse(%#q, %#q, %v) on %#q = %#q", prefix, suffix, data, vs, result)
		// }()

		//debugf("%v/%#q/%#q: %v\n", vs, prefix, suffix, data)
		varying := make([]string, data.end-data.start)
		allExist := true
		var preExistingIndices []int
		for i := data.start; i < data.end; i++ {
			substr := vs[i][len(prefix) : len(vs[i])-len(suffix)]
			varying[i-data.start] = substr
			if allExist {
				found := false
				for i := 0; i < len(vs); i++ {
					if i == data.start {
						i = data.end - 1
						continue
					}
					if substr == vs[i] {
						found = true
						preExistingIndices = append(preExistingIndices, i)
						break
					}
				}
				allExist = found
			}
		}

		var others []string
		// combined := make([]string, 0, len(preExistingIndices))
		if allExist && (prefix == "" || suffix == "") {
			others = make([]string, 0, len(vs)-2*len(preExistingIndices))
			sort.Ints(preExistingIndices)
			for i, k := 0, 0; i < len(vs) && k < len(preExistingIndices); i++ {
				if i == data.start {
					i = data.end - 1
					continue
				} else if i == preExistingIndices[k] {
					// combined = append(combined, vs[i])
					// debugf("Eliminating %#q", vs[i])
					k++
				} else {
					others = append(others, vs[i])
				}
			}
		} else {
			others = make([]string, len(vs)-(data.end-data.start))
			copy(others, vs[:data.start])
			copy(others[data.start:], vs[data.end:])
		}

		middle := render(shortRegexpString(varying, cache), true)
		// debugf(">> ShortRegexpString(%#q) = %#q", varying, middle)

		prefix, suffix = regexp.QuoteMeta(prefix), regexp.QuoteMeta(suffix)
		var cur string
		switch {
		case allExist && prefix == "": // M . S | M ==> M . S?
			cur = middle + optional(suffix)
		case allExist && suffix == "": // P . M | M ==> P? . M
			cur = optional(prefix) + middle
		default:
			cur = prefix + middle + suffix
		}
		return append([]string{cur}, shortRegexpString(others, cache)...)
	}

	// Note that vs is still sorted here.
	// debugf("Sorted: %#q", vs)
	for prefix, preLoc := range commonPrefixes(vs, 1) {
		suffix := sharedSuffix(len(prefix), vs[preLoc.start:preLoc.end])
		strs := recurse(prefix, suffix, preLoc)
		if c := cost(strs); c < bestCost { // || (c == len(best) && str < best) {
			best = strs
			bestCost = c
		} else {
			//debugf("! rejected %#q", str)
			//debugf("  because: %#q", best)
		}
	}

	sort.Sort(reverseStrings(vs))
	// debugf("Reverse-sorted: %#q", vs)
	for suffix, sufLoc := range commonSuffixes(vs, 1) {
		// sufLoc := suffixes[suffix]
		prefix := sharedPrefix(len(suffix), vs[sufLoc.start:sufLoc.end])
		strs := recurse(prefix, suffix, sufLoc)
		if c := cost(strs); c < bestCost { //|| (len(str) == len(best) && str < best) {
			best = strs
			bestCost = c
		} else {
			//debugf("! rejected %#q", str)
			//debugf("  because: %#q", best)
		}
	}

	singleChar := true
	optional := ""
	for i := range vs {
		if len(vs[i]) == 0 {
			optional = "?"
		} else if len(vs[i]) != 1 {
			// FIXME: should allow single non-ASCII characters
			singleChar = false
			break
		}
	}
	if singleChar {
		// Construct an array of characters in the right order:
		//   ']' first, '-' last, rest alphabetically
		class := make([]byte, 0, len(vs))
		last := ""
		for i, s := range vs {
			if s == "]" {
				// Must be first
				class = append(class, ']')
				vs[i] = "" // delete
			} else if s == "-" {
				// Must be last
				last = s
				vs[i] = "" // delete
			}
		}
		sortFirst := len(class)
		for _, s := range vs {
			class = append(class, s...)
		}
		sort.Sort(sortBytes(class[sortFirst:]))
		class = append(class, last...)

		// Collapse character ranges
		w := 0
		first := -1
		for i := 0; i < len(class); i++ {
			if first >= 0 {
				// Do we need to finish the range?
				if class[i] != class[i-1]+1 {
					// Does it pay to use a range?
					if i-first > 3 {
						// Build a range
						class[w-(i-first-1)] = '-'
						class[w-(i-first-1)+1] = class[i-1]
						// Rewind the write position
						w = w - (i - first - 1) + 2
						first = i
					}
				}
			} else {
				first = i
			}
			// Write the current character
			class[w] = class[i]
			w++
		}
		class = class[:w]

		if len(class) == 1 {
			str := regexp.QuoteMeta(string(class)) + optional
			if len(str) <= bestCost {
				best = []string{str}
				bestCost = len(str)
			}
		}
		if cost := len(class) + 2 + len(optional); cost <= bestCost {
			best = []string{"[" + string(class) + "]" + optional}
			bestCost = cost
		}
	}

	return best
}

func render(clauses []string, asSingle bool) string {
	switch len(clauses) {
	case 0:
		return "$.^" // Unmatchable?
	case 1:
		return clauses[0]
	default:
		if len(clauses[0]) == 0 {
			clauses = clauses[1:]
			if len(clauses) == 1 {
				return optional(clauses[0])
			}
			return render(clauses, true) + "?"
		}

		result := strings.Join(clauses, "|")
		if asSingle {
			result = "(" + result + ")"
		}
		return result
	}
}

func cost(clauses []string) int {
	// TODO: real implementation
	return len(render(clauses, false))
}

func optional(s string) string {
	if len(s) > 1 {
		s = "(" + s + ")?"
	} else if s != "" {
		s += "?"
	}
	return s
}

// removeDups removes duplicate strings from vs and returns it.
// It assumes that vs has been sorted such that duplicates are next to each
// other.
func removeDups(vs []string) []string {
	insertPos := 1
	for i := 1; i < len(vs); i++ {
		if vs[i-1] != vs[i] {
			vs[insertPos] = vs[i]
			insertPos++
		}
	}
	return vs[:insertPos]
}

func dup(vs []string) []string {
	result := make([]string, len(vs))
	copy(result, vs)
	return result
}

// reverseStrings is a sort.Interface that sort strings by their reverse values.
type reverseStrings []string

func (rs reverseStrings) Less(i, j int) bool {
	for m, n := len(rs[i])-1, len(rs[j])-1; m >= 0 && n >= 0; m, n = m-1, n-1 {
		if rs[i][m] != rs[j][n] {
			// We want to compare runes, not bytes. So find the start of the
			// current runes and decode them.
			for ; m > 0 && !utf8.RuneStart(rs[i][m]); m-- {
			}
			for ; n > 0 && !utf8.RuneStart(rs[j][n]); n-- {
			}
			ri, _ := utf8.DecodeRuneInString(rs[i][m:])
			rj, _ := utf8.DecodeRuneInString(rs[j][n:])
			return ri < rj
		}
	}
	return len(rs[i]) < len(rs[j])
}
func (rs reverseStrings) Swap(i, j int) { rs[i], rs[j] = rs[j], rs[i] }
func (rs reverseStrings) Len() int      { return len(rs) }

// sortBytes is a sort.Interface that sort bytes.
type sortBytes []byte

func (sb sortBytes) Less(i, j int) bool { return sb[i] < sb[j] }
func (sb sortBytes) Swap(i, j int)      { sb[i], sb[j] = sb[j], sb[i] }
func (sb sortBytes) Len() int           { return len(sb) }

// commonSs holds information on where to find a common substring.
type commonSs struct {
	start, end int
}

// commonPrefixes returns a map from prefixes to number of occurrences. Not all
// strings in vs need to have a prefix for it to be returned.
// Assumes vs to have been sorted with sort.Strings()
func commonPrefixes(vs []string, minLength int) (result map[string]commonSs) {
	result = make(map[string]commonSs)
	for i := 0; i < len(vs)-1; i++ {
		j := i + 1
		k := 0
		for ; k < len(vs[i]) && k < len(vs[j]); k++ {
			if vs[i][k] != vs[j][k] {
				break
			}
		}
		if k < minLength {
			continue
		}
		prefix := vs[i][:k]
		if _, exists := result[prefix]; !exists {
			first := prefixStart(vs[:i], prefix)
			//debugf("prefixStart(%#q, %#q) == %v", vs[:i], prefix, first)
			// prefixEnd(vs, prefix) - first + 1
			// == prefixEnd(vs[first:], prefix) + 1
			// == prefixEnd(vs[first+1:], prefix) + 2
			end := first + 1 + prefixEnd(vs[first+1:], prefix)
			result[prefix] = commonSs{
				first, end,
			}
			//debugf("prefixEnd(%#q, %#q) == %v", vs, prefix, result[prefix].end)
		}
	}
	// debugf("# %v..", result)
	return result
}

func prefixStart(vs []string, prefix string) int {
	if prefix == "" {
		return 0
	}
	return findFirst(vs, func(s string) bool {
		return strings.HasPrefix(s, prefix)
	})
}

func prefixEnd(vs []string, prefix string) int {
	if prefix == "" {
		return len(vs)
	}
	//debugf("prefixEnd(%v, %#q)", vs, prefix)
	return findFirst(vs, func(s string) bool {
		return !strings.HasPrefix(s, prefix)
	})
}

// commonSuffixes returns a map from suffixes to number of occurrences. Not all
// strings in vs need to have a suffix for it to be returned.
// Assumes vs to have been sorted using sort.Sort(reverseStrings(vs))
func commonSuffixes(vs []string, minLength int) (result map[string]commonSs) {
	result = make(map[string]commonSs)
	for i := 0; i < len(vs)-1; i++ {
		j := i + 1
		k := 0
		for ; k < len(vs[i]) && k < len(vs[j]); k++ {
			if vs[i][len(vs[i])-k-1] != vs[j][len(vs[j])-k-1] {
				break
			}
		}
		if k < minLength {
			continue
		}
		suffix := vs[i][len(vs[i])-k:]
		if _, exists := result[suffix]; !exists {
			first := suffixStart(vs[:i], suffix)
			//debugf("suffixStart<%#q>(%#q) == %v", suffix, vs[:i], first)
			// suffixEnd(vs, suffix) - first + 1
			// == suffixEnd(vs[first:], suffix) + 1
			// == suffixEnd(vs[first+1:], suffix) + 2
			end := first + 1 + suffixEnd(vs[first+1:], suffix)
			result[suffix] = commonSs{
				first, end,
			}
			//debugf("suffixEnd  <%#q>(%#q) == %v", suffix, vs, result[suffix].end)
			//debugf("selected(%#q): %q\n\n", suffix, vs[first:result[suffix].end])
		}
	}
	// debugf("# ..%v", result)
	return result
}

func suffixStart(vs []string, suffix string) int {
	// //debugf("suffixStart(%#q, %#q)", vs, suffix)
	if suffix == "" {
		return 0
	}
	return findFirst(vs, func(s string) bool {
		return strings.HasSuffix(s, suffix)
	})
}

func suffixEnd(vs []string, suffix string) int {
	// //debugf("suffixEnd  (%#q, %#q)", vs, suffix)
	if suffix == "" {
		return len(vs)
	}
	return findFirst(vs, func(s string) bool {
		return !strings.HasSuffix(s, suffix)
	})
}

// findFirst finds the first element of vs that satisfies the predicate.
// It assumes that the first N strings don't match the predicate, and the rest
// do. If all of the strings satisfy the predicate, it returns 0, and if none
// do it returns len(vs).
func findFirst(vs []string, predicate func(string) bool) int {
	l, h := -1, len(vs)
	// Invariant: vs[l] does not match, vs[h] does.
	// -1 and len(vs) are sentinal values, never tested but assumed to mismatch and match, respectively.
	for l+1 < h {
		m := (l + h) / 2 // Must now be a valid value
		// //debugf("%d %d %d", l, m, h)
		if predicate(vs[m]) {
			h = m
		} else {
			l = m
		}
	}
	//debugf("==> %d", h)
	return h
}

// sharedPrefix returns the longest prefix which all the parameters share but
// ignores a number of characters at the end of each string.
func sharedPrefix(ignore int, vs []string) (result string) {
	//debugf("sharedPrefix(%d, %#q)", ignore, vs)
	// defer func() {
	//debugf("==> %#q", result)
	// }()
	switch len(vs) {
	case 0:
		return ""
	case 1:
		return vs[0]
	}
	for i := 0; i < len(vs[0])-ignore; i++ {
		for n := 1; n < len(vs); n++ {
			if i >= len(vs[n])-ignore || vs[0][i] != vs[n][i] {
				return vs[0][:i]
			}
		}
	}
	return vs[0][:len(vs[0])-ignore]
}

// sharedSuffix returns the longest suffix which all the parameters share but
// ignores a number of characters at the start of each string.
func sharedSuffix(ignore int, vs []string) (result string) {
	//debugf("sharedSuffix(%d, %#q)", ignore, vs)
	// defer func() {
	//debugf("==> %#q", result)
	// }()
	switch len(vs) {
	case 0:
		return ""
	case 1:
		return vs[0]
	}
	first := vs[0]
	for i := 0; i < len(first)-ignore; i++ {
		for n := 1; n < len(vs); n++ {
			cur := vs[n]
			if i == len(cur)-ignore {
				return cur[ignore:]
			}
			if first[len(first)-i-1] != cur[len(cur)-i-1] {
				return first[len(first)-i:]
			}
		}
	}
	return first[ignore:]
}