-
Notifications
You must be signed in to change notification settings - Fork 0
/
algokfold.go
240 lines (178 loc) · 7.03 KB
/
algokfold.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
package algorithms
import (
"errors"
"fmt"
"math"
"math/rand"
"sort"
"time"
"../diabetesdata"
"../logging"
"../support"
)
// kFoldMeasure pairs a dataset record index with the similarity score that was
// computed for that record. DoKFoldSplit collects these in kfoldSimilarityTable.
type kFoldMeasure struct {
Similarity float64 // similarity score; DoKFoldSplit stores the cosine similarity here
Index int // index of the scored record within the dataset slice
}
// Package-level working state shared by the k-fold routines in this file.
var (
kfoldFolds [][]int // record indices grouped by fold; assigned by splitDataSetIntoEvenFolds
numberOfFolds int // number of folds to divide into; shadowed by the parameter of the same name in DoKFoldSplit
kfoldSimilarityTable []kFoldMeasure // best-matching training records for the test record being scored, kept sorted by descending similarity
// Running confusion-matrix tallies; cleared by resetTestCounters and
// incremented by calculateKFoldMetrics.
truePositiveCount,
trueNegativeCount,
falsePositiveCount,
falseNegativeCount int // default counts to zero
)
// resetTestCounters zeroes the four package-level confusion-matrix tallies so
// that the next round of scoring starts from a clean slate.
func resetTestCounters() {
	truePositiveCount, falsePositiveCount = 0, 0
	trueNegativeCount, falseNegativeCount = 0, 0
}
// splitDataSetIntoEvenFolds randomly partitions the record indices of dataset
// into the requested number of folds.
//
// Every index 0..len(dataset)-1 is placed in exactly one fold, fold sizes
// differ by at most one, and the indices inside each fold are in ascending
// order. The result is also stored in the package-level kfoldFolds.
//
// An error (with an empty slice) is returned when folds is not a positive
// integer. When folds exceeds len(dataset) the surplus folds are left empty.
func splitDataSetIntoEvenFolds(dataset []diabetesdata.PimaDiabetesRecord, folds int) ([][]int, error) {
	// Zero would divide by zero and a negative count would make the
	// allocation below panic, so reject anything not strictly positive.
	if folds <= 0 {
		return [][]int{}, errors.New("Invalid number of folds specified")
	}

	numberOfRecords := len(dataset)
	kfoldFolds = make([][]int, folds)

	// A locally seeded generator yields a different split on each call
	// without re-seeding the shared global source.
	rng := rand.New(rand.NewSource(time.Now().UTC().UnixNano()))

	// Hand every record a unique random slot number and map slots onto folds
	// round-robin. Because the slots are a permutation of 0..n-1, each fold
	// receives either floor(n/folds) or ceil(n/folds) records, and the work
	// is linear instead of retrying random folds until one has room.
	for record, slot := range rng.Perm(numberOfRecords) {
		foldID := slot % folds
		kfoldFolds[foldID] = append(kfoldFolds[foldID], record)
	}
	return kfoldFolds, nil
}
// calculateKFoldMetrics classifies one test record by majority vote among its
// nearest neighbours and folds the outcome into the confusion-matrix tallies.
//
// The neighbours are taken from the front of the package-level
// kfoldSimilarityTable, which the caller is expected to have filled and sorted
// with the most similar record first. At most support.GetNumberOfNeighbours()
// entries are consulted, and never more than the table actually holds.
//
// Parameters:
//   - dataset: the records that the table's Index values refer to.
//   - foldIndex: the entry of Metrics that receives the running tallies.
//   - actualOutcomeValue: the known label of the test record (1 = positive,
//     0 = negative); any other value leaves the tallies untouched.
//
// A tied vote is predicted as negative (0).
func calculateKFoldMetrics(dataset []diabetesdata.PimaDiabetesRecord, foldIndex int, actualOutcomeValue int) {
	if len(kfoldSimilarityTable) == 0 { // nothing to vote with
		return
	}

	// The table can be shorter than N (for example when a training fold has
	// fewer than N records), so clamp rather than index past its end.
	neighbours := support.GetNumberOfNeighbours()
	if neighbours > len(kfoldSimilarityTable) {
		neighbours = len(kfoldSimilarityTable)
	}

	// Collect the consensus of the nearest records.
	positiveVotes, negativeVotes := 0, 0
	for i := 0; i < neighbours; i++ {
		if dataset[kfoldSimilarityTable[i].Index].TestedPositive == 1 {
			positiveVotes++
		} else {
			negativeVotes++
		}
	}

	// Default to negative. The prediction must not be seeded from the actual
	// label, otherwise every tie would be scored as a correct prediction.
	predictedOutcomeValue := 0
	if positiveVotes > negativeVotes {
		predictedOutcomeValue = 1
	}

	switch {
	case predictedOutcomeValue == 1 && actualOutcomeValue == 1: // TP
		truePositiveCount++
	case predictedOutcomeValue == 1 && actualOutcomeValue == 0: // FP
		falsePositiveCount++
	case predictedOutcomeValue == 0 && actualOutcomeValue == 0: // TN
		trueNegativeCount++
	case predictedOutcomeValue == 0 && actualOutcomeValue == 1: // FN
		falseNegativeCount++
	}

	// Publish the running totals for this fold.
	Metrics[foldIndex].TruePositiveCount = truePositiveCount
	Metrics[foldIndex].FalsePositiveCount = falsePositiveCount
	Metrics[foldIndex].TrueNegativeCount = trueNegativeCount
	Metrics[foldIndex].FalseNegativeCount = falseNegativeCount
}
// DoKFoldSplit splits dataset into numberOfFolds folds and, for every ordered
// pair of distinct (test fold, training fold), compares each test record with
// each training record using cosine similarity.
//
// For every test record the most similar training records are kept in
// kfoldSimilarityTable (capped at support.GetNumberOfNeighbours() entries) and
// handed to calculateKFoldMetrics together with the test record's own label.
// A mean similarity is logged per test fold and an overall average is printed
// at the end.
//
// It returns dataset unchanged, or an empty slice and the error from
// splitDataSetIntoEvenFolds when the fold count is rejected.
//
// NOTE(review): the similarity totals and the confusion-matrix counters are
// cleared for every training fold, so the figures that survive for a test fold
// are those of the last training fold processed — confirm this is intended.
func DoKFoldSplit(dataset []diabetesdata.PimaDiabetesRecord, numberOfFolds int) ([]diabetesdata.PimaDiabetesRecord, error) {
	str := fmt.Sprintf("Number of folds : %d\n", numberOfFolds)
	logging.DoWriteString(str, true, true)

	splitDataset, err := splitDataSetIntoEvenFolds(dataset, numberOfFolds)
	if err != nil {
		return []diabetesdata.PimaDiabetesRecord{}, err
	}

	similarityTotals := make([]float64, numberOfFolds)
	similarityAverages := make([]float64, numberOfFolds)

	// Each fold takes a turn as the test fold.
	for testIndex := 0; testIndex < numberOfFolds; testIndex++ {
		resetTestCounters() // reset all counters for this fold
		testFold := splitDataset[testIndex]

		for trainingIndex := 0; trainingIndex < numberOfFolds; trainingIndex++ {
			if testIndex == trainingIndex { // a fold is never compared with itself
				continue
			}
			trainingFold := splitDataset[trainingIndex]

			similarityTotals[testIndex] = 0.0
			similarityAverages[testIndex] = 0.0

			for _, testRecordIndex := range testFold {
				testRecord := dataset[testRecordIndex]
				// The label must come from the very record under test; deriving
				// it from the loop positions reads an unrelated record and can
				// run past the end of the dataset.
				expectedValue := testRecord.TestedPositive

				for _, trainingRecordIndex := range trainingFold {
					vector1 := anonymiseDiabetesRecord(testRecord)                   // test vector
					vector2 := anonymiseDiabetesRecord(dataset[trainingRecordIndex]) // training vector

					// Compare only as many elements as both vectors have.
					elementsToCompare := math.Min(float64(len(vector1)), float64(len(vector2)))
					similarity := support.CosineSimilarity(vector1, vector2, int(elementsToCompare))
					similarityTotals[testIndex] += similarity

					newRecord := kFoldMeasure{Similarity: similarity, Index: trainingRecordIndex}

					// Keep the table no larger than the number of neighbours
					// needed: once full, a better match evicts the current
					// weakest entry, which the sort below keeps in last place.
					maxNecessaryTableSize := support.GetNumberOfNeighbours()
					if len(kfoldSimilarityTable) == maxNecessaryTableSize {
						if newRecord.Similarity > kfoldSimilarityTable[maxNecessaryTableSize-1].Similarity {
							kfoldSimilarityTable[maxNecessaryTableSize-1] = newRecord
						}
					} else {
						kfoldSimilarityTable = append(kfoldSimilarityTable, newRecord)
					}

					// Most similar first.
					sort.Slice(kfoldSimilarityTable, func(i, j int) bool {
						return kfoldSimilarityTable[i].Similarity > kfoldSimilarityTable[j].Similarity
					})
				}

				// Score this test record (TP, FP, TN, FN), then empty the table
				// for the next one while keeping its backing storage.
				calculateKFoldMetrics(dataset, testIndex, expectedValue)
				kfoldSimilarityTable = kfoldSimilarityTable[:0]

				vectorsCompared := len(testFold) * len(trainingFold)
				similarityAverages[testIndex] = similarityTotals[testIndex] / float64(vectorsCompared)
			}
			resetTestCounters()
		}

		// Dump the similarity average for the current fold.
		str = fmt.Sprintf("Test Fold Index %02d - Mean Similarity: %0.2f%%\n", testIndex+1, 100.0*similarityAverages[testIndex])
		logging.DoWriteString(str, true, true)
	}

	// Summary section: mean of the per-fold averages.
	overallConsistency := 0.0
	for _, average := range similarityAverages {
		overallConsistency += average
	}
	overallConsistency = overallConsistency / float64(numberOfFolds)
	fmt.Printf("\nOverall Average Similarity = %0.2f%%\n", 100.0*overallConsistency)

	return dataset, nil
}