/
langset.go
118 lines (105 loc) · 2.72 KB
/
langset.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package fontscan
import (
"encoding/binary"
"errors"
"strings"
"github.com/go-text/typesetting/language"
)
// LangID is a compact representation of a language
// this package has orthographic knowledge of.
//
// It is the index of the language in the languagesRunes table.
type LangID uint16
// NewLangID returns the compact index of the given language,
// or false if it is not supported by this package.
//
// Derived languages not exactly supported are mapped to their primary part: for instance,
// 'fr-be' is mapped to 'fr'.
//
// It never panics, even when l sorts after every language of the table.
func NewLangID(l language.Language) (LangID, bool) {
	const N = len(languagesRunes)

	// Binary search for an exact match; this relies on languagesRunes
	// being sorted by increasing lang.
	i, j := 0, N
	for i < j {
		h := i + (j-i)/2
		entry := languagesRunes[h]
		switch {
		case l < entry.lang:
			j = h
		case entry.lang < l:
			i = h + 1
		default:
			// exact match
			return LangID(h), true
		}
	}

	// No exact match: i is the index where l should be inserted, in [0, N].
	// When l sorts after every known language, i == N, which is not a valid
	// index: start the backward scan from the last entry instead.
	if i >= N {
		i = N - 1
	}

	// Walk backward, trying to match the primary part.
	root := l.Primary()
	for ; i >= 0; i-- {
		lang := languagesRunes[i].lang
		switch {
		case lang > root:
			// still after the root: keep going
		case lang < root:
			// we went past the place where the root should be: no match
			return 0, false
		default:
			// found the root
			return LangID(i), true
		}
	}
	return 0, false
}
// langSet is a bit set for 512 languages.
//
// It works as a map[LangID]bool, with the limitation
// that only the 9 low bits of a LangID are used.
// More precisely, the page (the index in the array) of a LangID l is given
// by its 3 "higher" bits (8-6), and the bit position inside the page
// by its 6 lower bits (5-0).
type langSet [8]uint64
// newLangsetFromCoverage compiles the set of languages supported by the given
// rune coverage: a language is added when rs includes all the runes
// registered for it in languagesRunes.
func newLangsetFromCoverage(rs runeSet) (out langSet) {
	for idx := range languagesRunes {
		if !rs.includes(languagesRunes[idx].runes) {
			continue
		}
		out.add(LangID(idx))
	}
	return out
}
// String returns the languages of the set, in increasing LangID order,
// separated by '|' and enclosed in braces.
func (ls langSet) String() string {
	var names []string
	for id := 0; id < 64*len(ls); id++ {
		// the page is given by the high bits of id, the position by its 6 low bits
		page, pos := ls[id>>6], uint(id&63)
		if (page>>pos)&1 == 0 {
			continue
		}
		names = append(names, string(languagesRunes[id].lang))
	}
	return "{" + strings.Join(names, "|") + "}"
}
// add inserts l into the set.
// Only the 9 low bits of l are used: bits 8-6 select the page
// and bits 5-0 the position inside the page.
func (ls *langSet) add(l LangID) {
	const (
		pageShift = 6
		idMask    = 1<<9 - 1
		bitMask   = 1<<pageShift - 1
	)
	page := (l & idMask) >> pageShift
	ls[page] |= uint64(1) << (l & bitMask)
}
// contains reports whether l is in the set.
// Only the 9 low bits of l are used: bits 8-6 select the page
// and bits 5-0 the position inside the page.
func (ls langSet) contains(l LangID) bool {
	id := l & 0x1FF
	page, mask := ls[id>>6], uint64(1)<<(id&0x3F)
	return page&mask != 0
}
// langSetSize is the size in bytes of the binary form of a langSet:
// 8 pages of 8 bytes each.
const langSetSize = 8 * 8
// serialize returns the binary form of the set: its pages, in order,
// each written as a big endian uint64, for a total of langSetSize bytes.
func (ls langSet) serialize() []byte {
	out := make([]byte, langSetSize)
	dst := out
	for _, page := range ls {
		binary.BigEndian.PutUint64(dst, page)
		dst = dst[8:]
	}
	return out
}
// deserializeFrom reads the binary format produced by serialize:
// langSetSize bytes, storing each page as a big endian uint64.
//
// It returns the number of bytes read from `data`, or an error
// if `data` is too short, in which case the set is left untouched.
func (ls *langSet) deserializeFrom(data []byte) (int, error) {
	if len(data) < langSetSize {
		return 0, errors.New("invalid lang set (EOF)")
	}
	src := data[:langSetSize] // extra trailing bytes are ignored
	for page := 0; page < len(ls); page++ {
		ls[page] = binary.BigEndian.Uint64(src[page*8 : page*8+8])
	}
	return langSetSize, nil
}