/
features.go
149 lines (134 loc) · 3.97 KB
/
features.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
package token
import (
"strings"
"unicode"
"github.com/gnames/gnfinder/dict"
)
// Features keeps the properties of a token that describe it as a possible
// candidate for a part of a scientific name.
// NOTE(review): the methods in this file set these fields through *Token, so
// Features is presumably embedded in Token -- confirm against the Token type.
type Features struct {
// NameStartCandidate marks a candidate for the start of a uninomial or
// binomial.
NameStartCandidate bool
// PotentialBinomialGenus is true when the token looks like a possible genus
// name (see setPotentialBinomialGenus).
PotentialBinomialGenus bool
// StartsWithLetter is a necessary quality for the start of a binomial.
// setStartsWithLetter sets it when the cleaned span is at least 2 runes long
// and begins at index 0.
StartsWithLetter bool
// EndsWithLetter is a necessary quality for the species part of a trinomial.
// setEndsWithLetter sets it when the cleaned span ends on the last rune of
// the raw token.
EndsWithLetter bool
// Capitalized feature: the first alphabetic character is upper case.
Capitalized bool
// CapitalizedSpecies -- the first alphabetic character of the species is
// capitalized.
CapitalizedSpecies bool
// HasDash -- information if '-' character is part of the word.
HasDash bool
// ParensStart feature: token starts with an opening parenthesis '('.
ParensStart bool
// ParensEnd feature: token ends with a closing parenthesis ')'.
ParensEnd bool
// ParensEndSpecies feature: species token ends with parentheses.
ParensEndSpecies bool
// Abbr feature: token looks like an abbreviation. setAbbr sets it when the
// cleaned span is shorter than 4 runes and the raw token ends with a letter
// followed by a period.
Abbr bool
// RankLike is true if the token is a known infraspecific rank, i.e. its raw
// value is a key of the dictionary's Ranks (see SetRank).
RankLike bool
// UninomialDict defines which Genera or Uninomials dictionary (if any)
// contained the token.
UninomialDict dict.DictionaryType
// SpeciesDict defines which Species dictionary (if any) contained the token.
SpeciesDict dict.DictionaryType
}
// setParensStart records in ParensStart whether firstRune is an opening
// parenthesis. The flag is overwritten on every call.
func (t *Token) setParensStart(firstRune rune) {
	const openParen = '('
	t.ParensStart = firstRune == openParen
}
// setParensEnd records in ParensEnd whether lastRune is a closing
// parenthesis. The flag is overwritten on every call.
func (t *Token) setParensEnd(lastRune rune) {
	const closeParen = ')'
	t.ParensEnd = lastRune == closeParen
}
// setHasDash flags the token as containing a dash. It only ever raises the
// flag; nothing in this method clears it.
func (t *Token) setHasDash() {
	t.HasDash = true
}
// setCapitalized stores in Capitalized whether firstAlphabetRune is an
// upper-case letter according to unicode.IsUpper.
func (t *Token) setCapitalized(firstAlphabetRune rune) {
	upper := unicode.IsUpper(firstAlphabetRune)
	t.Capitalized = upper
}
// setAbbr raises the Abbr flag when the token looks like an abbreviation:
// the cleaned span described by startEnd (inclusive indices) is shorter than
// 4 runes and raw ends with a letter immediately followed by a period.
// The flag is never cleared here.
func (t *Token) setAbbr(raw []rune, startEnd *[2]int) {
	n := len(raw)
	cleanLen := startEnd[1] - startEnd[0] + 1
	if cleanLen >= 4 {
		return
	}
	// At least two runes are needed: a letter and the trailing period.
	if n <= 1 {
		return
	}
	if !unicode.IsLetter(raw[n-2]) || raw[n-1] != '.' {
		return
	}
	t.Abbr = true
}
// setPotentialBinomialGenus decides whether the token may serve as the genus
// of a binomial and stores the verdict in PotentialBinomialGenus.
//
// The decision depends on the length of the cleaned span given by startEnd
// (inclusive indices into raw):
//   - 0 runes: never a genus;
//   - 1 rune: only if the token is already flagged as an abbreviation;
//   - 2 or 3 runes: an abbreviation, or the cleaned span reaches the last
//     rune of raw;
//   - anything else: the cleaned span must reach the last rune of raw.
func (t *Token) setPotentialBinomialGenus(startEnd *[2]int, raw []rune) {
	endsClean := len(raw) == startEnd[1]+1
	cleanLen := startEnd[1] - startEnd[0] + 1

	var genus bool
	switch {
	case cleanLen == 0:
		genus = false
	case cleanLen == 1:
		genus = t.Abbr
	case cleanLen == 2 || cleanLen == 3:
		genus = t.Abbr || endsClean
	default:
		genus = endsClean
	}
	t.PotentialBinomialGenus = genus
}
// setStartsWithLetter raises the StartsWithLetter flag when the cleaned span
// given by startEnd (inclusive indices) begins at index 0 and is at least two
// runes long. The flag is never cleared here.
func (t *Token) setStartsWithLetter(startEnd *[2]int) {
	first, last := startEnd[0], startEnd[1]
	if first != 0 {
		return
	}
	if last-first+1 < 2 {
		return
	}
	t.StartsWithLetter = true
}
// setEndsWithLetter stores in EndsWithLetter whether the cleaned span given
// by startEnd (inclusive indices) ends on the last rune of raw, i.e. nothing
// trails the cleaned part. The flag is overwritten on every call.
func (t *Token) setEndsWithLetter(startEnd *[2]int, raw []rune) {
	cleanStop := startEnd[1] + 1
	t.EndsWithLetter = cleanStop == len(raw)
}
// SetUninomialDict classifies the cleaned token against the genus and
// uninomial dictionaries of d and stores the outcome in UninomialDict.
//
// Nothing happens when UninomialDict already differs from dict.NotSet.
// Otherwise the dictionaries are consulted in a fixed order and the first
// match wins: white genera, grey genera, white uninomials and grey uninomials
// are matched against the cleaned token as is; black uninomials and common
// words are matched against its lower-cased form. A token found nowhere is
// marked dict.NotInDictionary.
func (t *Token) SetUninomialDict(d *dict.Dictionary) {
	if t.UninomialDict != dict.NotSet {
		return
	}

	word := t.Cleaned
	// has reports membership of word in set, lower-casing it first when
	// lower is true.
	has := func(set map[string]struct{}, lower bool) bool {
		key := word
		if lower {
			key = strings.ToLower(word)
		}
		_, found := set[key]
		return found
	}

	switch {
	case has(d.WhiteGenera, false):
		t.UninomialDict = dict.WhiteGenus
	case has(d.GreyGenera, false):
		t.UninomialDict = dict.GreyGenus
	case has(d.WhiteUninomials, false):
		t.UninomialDict = dict.WhiteUninomial
	case has(d.GreyUninomials, false):
		t.UninomialDict = dict.GreyUninomial
	case has(d.BlackUninomials, true):
		t.UninomialDict = dict.BlackUninomial
	case has(d.CommonWords, true):
		t.UninomialDict = dict.CommonWords
	default:
		t.UninomialDict = dict.NotInDictionary
	}
}
// SetSpeciesDict classifies the lower-cased cleaned token against the species
// dictionaries of d and stores the outcome in SpeciesDict.
//
// Nothing happens when SpeciesDict already differs from dict.NotSet.
// Otherwise the dictionaries are consulted in a fixed order and the first
// match wins: white species, grey species, black species, common words. A
// token found nowhere is marked dict.NotInDictionary.
func (t *Token) SetSpeciesDict(d *dict.Dictionary) {
	if t.SpeciesDict != dict.NotSet {
		return
	}

	key := strings.ToLower(t.Cleaned)
	// has reports whether the lower-cased token is a member of set.
	has := func(set map[string]struct{}) bool {
		_, found := set[key]
		return found
	}

	switch {
	case has(d.WhiteSpecies):
		t.SpeciesDict = dict.WhiteSpecies
	case has(d.GreySpecies):
		t.SpeciesDict = dict.GreySpecies
	case has(d.BlackSpecies):
		t.SpeciesDict = dict.BlackSpecies
	case has(d.CommonWords):
		t.SpeciesDict = dict.CommonWords
	default:
		t.SpeciesDict = dict.NotInDictionary
	}
}
// SetRank raises the RankLike flag when the raw token, converted to a string,
// is a key of d.Ranks. The flag is never cleared here.
func (t *Token) SetRank(d *dict.Dictionary) {
	_, known := d.Ranks[string(t.Raw)]
	if !known {
		return
	}
	t.RankLike = true
}