/
features.go
151 lines (129 loc) · 3.82 KB
/
features.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
package token
import (
"strings"
"unicode"
"github.com/gnames/gnfinder/io/dict"
)
// Features keeps the properties of a token that describe it as a possible
// candidate for a name part.
type Features struct {
// IsCapitalized is true if the first rune that is a letter is capitalized.
IsCapitalized bool
// HasDash is true if the token contains a dash.
HasDash bool
// HasStartParens is true if the token starts with '('.
HasStartParens bool
// HasEndParens is true if the token ends with ')'.
HasEndParens bool
// Abbr is true if the token looks like an abbreviation: setAbbr sets it
// when the cleaned part is shorter than 4 runes and the raw token ends with
// a letter followed by a period.
Abbr bool
// PotentialBinomialGenus is true if the token might be the genus of a name.
PotentialBinomialGenus bool
// StartsWithLetter is true if the token has the qualities needed to start
// the species part of a binomial. setStartsWithLetter sets it when the
// cleaned part begins at the first rune of the raw token and is two runes
// or longer; the token is assumed to be lower-case.
StartsWithLetter bool
// EndsWithLetter is true if the token has the quality needed to be the
// species part of a trinomial. setEndsWithLetter sets it when the cleaned
// part runs up to the last rune of the raw token.
EndsWithLetter bool
// RankLike is true if the token is a known infraspecific rank.
RankLike bool
// UninomialDict defines which Genera or Uninomials dictionary (if any)
// contained the token.
UninomialDict dict.DictionaryType
// SpeciesDict defines which Species dictionary (if any) contained the token.
SpeciesDict dict.DictionaryType
// GenSpGreyDict shows how many specific/infraspecific epithets of a putative
// name matched bi-/tri-nomials in a full name dictionary for grey genera.
// For example, the name "Bubo bubo" would set it to 1, and "Bubo bubo bubo"
// would set it to 2.
GenSpGreyDict int
}
// setAbbr stores in p.Abbr whether the token looks like an abbreviation.
//
// raw is the unmodified token; start and end are presumably the inclusive
// indices of its cleaned part (its length is taken as end-start+1). The flag
// is true only when that length is below 4 and raw ends with a letter
// immediately followed by a period. Any previous value of p.Abbr is
// overwritten.
func (p *Features) setAbbr(raw []rune, start, end int) {
	size := len(raw)
	short := end-start+1 < 4

	// The size > 1 test must stay ahead of the index expressions: it is what
	// keeps raw[size-2] in range for empty and one-rune tokens.
	p.Abbr = short && size > 1 &&
		raw[size-1] == '.' && unicode.IsLetter(raw[size-2])
}
// setPotentialBinomialGenus decides whether the token could be the genus of
// a binomial and stores the answer in p.PotentialBinomialGenus.
//
// It assumes, as a precondition, that the first letter is capitalized, and
// it reads p.Abbr, so setAbbr has to run first. The decision depends on the
// length of the cleaned part (end-start+1) and on whether the cleaned part
// reaches the last rune of raw:
//
//   - length 0: never a genus;
//   - length 1: only if the token is an abbreviation;
//   - length 2 or 3: an abbreviation, or a token with nothing after the
//     cleaned part;
//   - anything else: only a token with nothing after the cleaned part.
func (p *Features) setPotentialBinomialGenus(
	raw []rune,
	start, end int,
) {
	endsClean := end+1 == len(raw)

	var candidate bool
	switch size := end - start + 1; {
	case size == 0:
		// candidate stays false
	case size == 1:
		candidate = p.Abbr
	case size == 2 || size == 3:
		candidate = p.Abbr || endsClean
	default:
		candidate = endsClean
	}
	p.PotentialBinomialGenus = candidate
}
// setStartsWithLetter turns p.StartsWithLetter on when the cleaned part of
// the token begins at index 0 and is at least two positions long (its length
// is taken as end-start+1). The flag is only ever raised here; an existing
// true value is never cleared.
func (p *Features) setStartsWithLetter(start, end int) {
	if start != 0 || end-start+1 < 2 {
		return
	}
	p.StartsWithLetter = true
}
// setEndsWithLetter stores in p.EndsWithLetter whether end points at the
// last rune of raw, i.e. nothing follows the cleaned part of the token.
// The start argument is accepted but not used. Any previous value of the
// flag is overwritten.
func (p *Features) setEndsWithLetter(raw []rune, start, end int) {
	last := len(raw) - 1
	p.EndsWithLetter = end == last
}
// SetUninomialDict records in p.UninomialDict which genus/uninomial
// dictionary of d contains the cleaned token.
//
// The method does nothing if p.UninomialDict has already been set. Otherwise
// the dictionaries are consulted in a fixed order and the first hit wins:
// white genera, grey genera, white uninomials and grey uninomials are
// matched against cleaned as given; black uninomials and common words are
// matched against its lower-cased form. If nothing matches, the field is set
// to dict.NotInDictionary.
func (p *Features) SetUninomialDict(cleaned string, d *dict.Dictionary) {
	if p.UninomialDict != dict.NotSet {
		return
	}

	has := func(set map[string]struct{}, key string) bool {
		_, found := set[key]
		return found
	}

	// Case-sensitive lookups.
	if has(d.WhiteGenera, cleaned) {
		p.UninomialDict = dict.WhiteGenus
		return
	}
	if has(d.GreyGenera, cleaned) {
		p.UninomialDict = dict.GreyGenus
		return
	}
	if has(d.WhiteUninomials, cleaned) {
		p.UninomialDict = dict.WhiteUninomial
		return
	}
	if has(d.GreyUninomials, cleaned) {
		p.UninomialDict = dict.GreyUninomial
		return
	}

	// Lookups against the lower-cased token.
	lowered := strings.ToLower(cleaned)
	if has(d.BlackUninomials, lowered) {
		p.UninomialDict = dict.BlackUninomial
		return
	}
	if has(d.CommonWords, lowered) {
		p.UninomialDict = dict.CommonWords
		return
	}

	p.UninomialDict = dict.NotInDictionary
}
// SetSpeciesDict records in p.SpeciesDict which species dictionary of d
// contains the cleaned token.
//
// The method does nothing if p.SpeciesDict has already been set. Otherwise
// white species, grey species, black species and common words are checked in
// that order, each against cleaned exactly as given, and the first hit wins.
// If nothing matches, the field is set to dict.NotInDictionary.
func (p *Features) SetSpeciesDict(cleaned string, d *dict.Dictionary) {
	if p.SpeciesDict != dict.NotSet {
		return
	}

	has := func(set map[string]struct{}) bool {
		_, found := set[cleaned]
		return found
	}

	if has(d.WhiteSpecies) {
		p.SpeciesDict = dict.WhiteSpecies
	} else if has(d.GreySpecies) {
		p.SpeciesDict = dict.GreySpecies
	} else if has(d.BlackSpecies) {
		p.SpeciesDict = dict.BlackSpecies
	} else if has(d.CommonWords) {
		p.SpeciesDict = dict.CommonWords
	} else {
		p.SpeciesDict = dict.NotInDictionary
	}
}
// SetRank raises p.RankLike when raw is a key of d.Ranks. The flag is left
// untouched when raw is not found, so an existing true value is never
// cleared by this method.
func (p *Features) SetRank(raw string, d *dict.Dictionary) {
	_, known := d.Ranks[raw]
	if !known {
		return
	}
	p.RankLike = true
}