-
Notifications
You must be signed in to change notification settings - Fork 0
/
algomodal.go
162 lines (132 loc) · 4.8 KB
/
algomodal.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
package algorithms
import (
"fmt"
"sort"
"../diabetesdata"
"../logging"
"../support"
)
// valueCount pairs one distinct feature value with the number of times that
// value has been counted for a feature column.
type valueCount struct {
// Value is the feature value being counted.
Value float64
// Count is the number of occurrences recorded for Value.
Count int
}
// valueExistsForFeature reports whether value is already present in list.
// On a hit it returns true together with the index of the first matching
// entry; on a miss it returns false and -1.
func valueExistsForFeature(list []valueCount, value float64) (bool, int) {
	for pos, entry := range list {
		if entry.Value != value {
			continue
		}
		return true, pos
	}
	return false, -1
}
// replaceMissingValuesWithModal implements missing-value strategy algo=3.
//
// A feature value of 0 is treated as missing. For every feature column the
// modal (most frequent) non-zero value is computed from the complete records
// only (those for which support.IsIncompleteRecord is false), and each
// missing value in dataset is replaced by its column's modal value. The input
// slice is not modified; a new slice of the same length is returned.
//
// All features except Age are passed through support.RoundFloat64(_, 2);
// Age and TestedPositive are copied as they are (TestedPositive may
// legitimately be zero and is never imputed).
//
// Column indices follow the order used when the values are written back:
// 0 pregnancies, 1 plasma glucose, 2 diastolic blood pressure, 3 triceps
// skinfold, 4 serum insulin, 5 BMI, 6 pedigree function, 7 age.
//
// An empty dataset yields an empty result. An error is returned when the
// record describes fewer than eight features, or when a column has no
// non-zero value among the complete records so that no modal value exists.
func replaceMissingValuesWithModal(dataset []diabetesdata.PimaDiabetesRecord) ([]diabetesdata.PimaDiabetesRecord, error) {
	// Number of feature columns this function reads and writes explicitly.
	const knownFeatures = 8

	numberOfFields := support.SizeOfPimaDiabetesRecord() - 1
	numberOfRecords := len(dataset)
	resultSet := make([]diabetesdata.PimaDiabetesRecord, numberOfRecords)

	if numberOfRecords == 0 {
		// Nothing to impute; avoids indexing into empty count tables below.
		return resultSet, nil
	}
	if numberOfFields < knownFeatures {
		return nil, fmt.Errorf("modal replacement needs %d feature columns, record has %d", knownFeatures, numberOfFields)
	}

	columnCount := make([][]valueCount, numberOfFields)
	columnModal := make([]valueCount, numberOfFields)

	// Count how often each distinct value occurs per column, using complete
	// records only.
	for index := range dataset {
		r := dataset[index]
		if support.IsIncompleteRecord(r) {
			continue
		}

		// Declared outside the field loop on purpose: a field index with no
		// case below keeps the previous value, exactly as before.
		var value float64
		for field := 0; field < numberOfFields; field++ {
			// The indices must match the columnModal indices used in the
			// write-back phase below (1 = glucose, 2 = blood pressure).
			switch field {
			case 0:
				value = r.NumberOfTimesPregnant
			case 1:
				value = r.PlasmaGlucoseConcentration
			case 2:
				value = r.DiastolicBloodPressure
			case 3:
				value = r.TricepsSkinfoldThickness
			case 4:
				value = r.SeriumInsulin
			case 5:
				value = r.BodyMassIndex
			case 6:
				value = r.DiabetesPedigreeFunction
			case 7:
				value = r.Age
			}

			if exists, pos := valueExistsForFeature(columnCount[field], value); exists {
				columnCount[field][pos].Count++
			} else {
				columnCount[field] = append(columnCount[field], valueCount{Value: value, Count: 1})
			}
		}
	}

	// Pick the modal value of each column: the most frequent value that is
	// not 0, since 0 marks a gap and cannot serve as a replacement.
	for field := 0; field < numberOfFields; field++ {
		counts := columnCount[field]
		// Stable sort so that equally frequent values keep first-seen order,
		// making the chosen mode deterministic.
		sort.SliceStable(counts, func(i, j int) bool {
			return counts[i].Count > counts[j].Count
		})

		found := false
		for _, candidate := range counts {
			if candidate.Value != 0 {
				columnModal[field] = candidate
				found = true
				break
			}
		}
		if !found {
			return nil, fmt.Errorf("no modal value for column %d: no complete record has a non-zero value", field)
		}
	}

	// Dump all the column modal values.
	for index := 0; index < numberOfFields; index++ {
		str := fmt.Sprintf("Modal (%s) = %0.2f\n", textNameforColumn(index), columnModal[index].Value)
		logging.DoWriteString(str, true, true)
	}

	// fill returns the column's modal value when original is missing (0),
	// otherwise original itself.
	fill := func(original float64, field int) float64 {
		if original == 0 {
			return columnModal[field].Value
		}
		return original
	}

	// Now that every column has a modal value, build the imputed data set.
	for index := range dataset {
		in := &dataset[index]
		out := &resultSet[index]

		out.NumberOfTimesPregnant = support.RoundFloat64(fill(in.NumberOfTimesPregnant, 0), 2)
		out.PlasmaGlucoseConcentration = support.RoundFloat64(fill(in.PlasmaGlucoseConcentration, 1), 2)
		out.DiastolicBloodPressure = support.RoundFloat64(fill(in.DiastolicBloodPressure, 2), 2)
		out.TricepsSkinfoldThickness = support.RoundFloat64(fill(in.TricepsSkinfoldThickness, 3), 2)
		out.SeriumInsulin = support.RoundFloat64(fill(in.SeriumInsulin, 4), 2)
		out.BodyMassIndex = support.RoundFloat64(fill(in.BodyMassIndex, 5), 2)
		out.DiabetesPedigreeFunction = support.RoundFloat64(fill(in.DiabetesPedigreeFunction, 6), 2)
		// Age is copied without rounding.
		out.Age = fill(in.Age, 7)

		// TestedPositive field may actually be zero, so it is never imputed.
		out.TestedPositive = in.TestedPositive
	}

	return resultSet, nil
}