forked from ryanbressler/CloudForest
-
Notifications
You must be signed in to change notification settings - Fork 1
/
tree.go
230 lines (194 loc) · 6.41 KB
/
tree.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
package CloudForest
import ()
//Tree represents a single decision tree.
type Tree struct {
//Tree int
Root *Node
Target string
Weight float64
}
func NewTree() *Tree {
return &Tree{new(Node), "", -1.0}
}
//AddNode adds a node a the specified path with the specified pred value and/or
//Splitter. Paths are specified in the same format as in rf-aces sf files, as a
//string of 'L' and 'R'. Nodes must be added from the root up as the case where
//the path specifies a node whose parent does not already exist in the tree is
//not handled well.
func (t *Tree) AddNode(path string, pred string, splitter *Splitter) {
n := new(Node)
n.Pred = pred
n.Splitter = splitter
if t.Root == nil {
t.Root = n
} else {
loc := t.Root
for i := 0; i < len(path); i++ {
switch path[i : i+1] {
case "L":
if loc.Left == nil {
loc.Left = n
}
loc = loc.Left
case "R":
if loc.Right == nil {
loc.Right = n
}
loc = loc.Right
case "M":
if loc.Missing == nil {
loc.Missing = n
}
loc = loc.Missing
}
}
}
}
/*
tree.Grow grows the receiver tree through recursion. It uses impurity decrease to select splitters at
each node as in Brieman's Random Forest. It should be called on a tree with only a root node defined.
fm is a feature matrix of training data.
target is the feature to predict via regression or classification as determined by feature type.
cases specifies the cases to calculate impurity decrease over and can contain repeated values
to allow for sampling of cases with replacement as in RF.
mTry specifies the number of candidate features to evaluate for each split.
leafSize specifies the minimum number of cases at a leafNode.
*/
func (t *Tree) Grow(fm *FeatureMatrix,
target Target,
cases []int,
candidates []int,
mTry int,
leafSize int,
splitmissing bool,
importance *[]*RunningMean,
allocs *BestSplitAllocs) {
t.Root.Recurse(func(n *Node, innercases []int, depth int) {
if (2 * leafSize) <= len(innercases) {
SampleFirstN(&candidates, mTry)
best, impDec := fm.BestSplitter(target, innercases, candidates[:mTry], allocs)
if best != nil && impDec > minImp {
if importance != nil {
(*importance)[fm.Map[best.Feature]].Add(impDec)
}
//not a leaf node so define the splitter and left and right nodes
//so recursion will continue
n.Splitter = best
n.Pred = ""
n.Left = new(Node)
n.Right = new(Node)
if splitmissing {
n.Missing = new(Node)
}
return
}
}
//Leaf node so find the predictive value and set it in n.Pred
n.Splitter = nil
n.Pred = target.FindPredicted(innercases)
}, fm, cases, 0)
}
//GetSplits returns the arrays of all Numeric splitters of a tree.
func (t *Tree) GetSplits(fm *FeatureMatrix, fbycase *SparseCounter, relativeSplitCount *SparseCounter) []Splitter {
splitters := make([]Splitter, 0)
ncases := len(fm.Data[0].Missing) // grab the number of samples for the first feature
cases := make([]int, ncases) //make an array that large
for i, _ := range cases {
cases[i] = i
}
t.Root.Recurse(func(n *Node, cases []int, depth int) {
//if we're on a splitting node
if fbycase != nil && n.Splitter != nil && n.Splitter.Numerical == true {
//add this splitter to the list
splitters = append(splitters, Splitter{n.Splitter.Feature,
n.Splitter.Numerical,
n.Splitter.Value,
n.Splitter.Left})
f_id := n.Splitter.Feature //get the feature at this splitter
f := fm.Data[fm.Map[n.Splitter.Feature]] //get the feature at this splitter
for _, c := range cases { //for each case
if f.Missing[c] == false { //if there isa value for this case
fbycase.Add(c, fm.Map[f_id], 1) //count the number of times each case is present for a split by a feature
switch f.NumData[c] <= n.Splitter.Value {
case true: //if the value was split to the left
relativeSplitCount.Add(c, fm.Map[f_id], -1) //subtract one
case false:
relativeSplitCount.Add(c, fm.Map[f_id], 1) //add one
}
}
}
}
}, fm, cases, 0)
return splitters //return the array of all splitters from the tree
}
//GetLeaves is called by the leaf count utility to
//gather statistics about the nodes of a tree including the sets of cases at
//"leaf" nodes that aren't split further and the number of times each feature
//is used to split away each case.
func (t *Tree) GetLeaves(fm *FeatureMatrix, fbycase *SparseCounter) []Leaf {
leaves := make([]Leaf, 0)
ncases := len(fm.Data[0].Missing)
cases := make([]int, 0, ncases)
for i := 0; i < ncases; i++ {
cases = append(cases, i)
}
t.Root.Recurse(func(n *Node, cases []int, depth int) {
if n.Left == nil && n.Right == nil { // I'm in a leaf node
leaves = append(leaves, Leaf{cases, n.Pred})
}
if fbycase != nil && n.Splitter != nil { //I'm not in a leaf node?
for _, c := range cases {
fbycase.Add(c, fm.Map[n.Splitter.Feature], 1)
}
}
}, fm, cases, 0)
return leaves
}
func (t *Tree) Partition(fm *FeatureMatrix) *[][]int {
leaves := make([][]int, 0)
ncases := len(fm.Data[0].Missing)
cases := make([]int, 0, ncases)
for i := 0; i < ncases; i++ {
cases = append(cases, i)
}
t.Root.Recurse(func(n *Node, cases []int, depth int) {
if n.Left == nil && n.Right == nil { // I'm in a leaf node
leaves = append(leaves, cases)
}
}, fm, cases, 0)
return &leaves
}
//Leaf is a struct for storing the index of the cases at a terminal "Leaf" node
//along with the Numeric predicted value.
type Leaf struct {
Cases []int
Pred string
}
//Tree.Vote casts a vote for the predicted value of each case in fm *FeatureMatrix.
//into bb *BallotBox. Since BallotBox is not thread safe trees should not vote
//into the same BallotBox in parallel.
func (t *Tree) Vote(fm *FeatureMatrix, bb VoteTallyer) {
ncases := len(fm.Data[0].Missing)
cases := make([]int, 0, ncases)
for i := 0; i < ncases; i++ {
cases = append(cases, i)
}
t.VoteCases(fm, bb, cases)
}
//Tree.VoteCases casts a vote for the predicted value of each case in fm *FeatureMatrix.
//into bb *BallotBox. Since BallotBox is not thread safe trees should not vote
//into the same BallotBox in parallel.
func (t *Tree) VoteCases(fm *FeatureMatrix, bb VoteTallyer, cases []int) {
weight := 1.0
if t.Weight >= 0.0 {
weight = t.Weight
}
t.Root.Recurse(func(n *Node, cases []int, depth int) {
if n.Left == nil && n.Right == nil {
// I'm in a leaf node
for i := 0; i < len(cases); i++ {
bb.Vote(cases[i], n.Pred, weight)
}
}
}, fm, cases, 0)
}