forked from RadhiFadlillah/go-bayesian
-
Notifications
You must be signed in to change notification settings - Fork 1
/
classifier.go
182 lines (146 loc) · 4.4 KB
/
classifier.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
package bayesian
import (
"encoding/gob"
"math"
"os"
"io"
)
const (
// MultinomialTf is the model in which every occurrence of a token is
// counted, so how often a token appears affects the posterior probability.
MultinomialTf Model = 1
// MultinomialBoolean is like MultinomialTf, except that Learn and Classify
// first remove duplicate tokens, so each distinct token is counted only
// once per document.
MultinomialBoolean Model = 2
)
// Model is an integer type identifying which Naive-Bayes model a Classifier
// uses. It is a distinct defined type (not a type alias), so plain int
// variables must be converted explicitly.
type Model int
// Class is a string type naming the class (label) of a document. It is a
// distinct defined type (not a type alias), so plain string variables must be
// converted explicitly.
type Class string
// Document is a group of tokens labelled with the class they belong to.
type Document struct {
// Class is the label this document's tokens are counted under by Learn.
Class Class
// Tokens are the tokens that make up the document.
Tokens []string
}
// NewDocument builds a Document labelled with class that holds the given
// tokens. The variadic slice is stored as-is, without copying.
func NewDocument(class Class, tokens ...string) Document {
	var doc Document
	doc.Class = class
	doc.Tokens = tokens
	return doc
}
// Classifier holds the training state used to classify documents: per-token
// counts, per-class counts and per-class log priors. All fields are exported;
// the save and load helpers in this file encode the whole struct with
// encoding/gob.
type Classifier struct {
// Model selects how tokens are counted. When it is MultinomialBoolean,
// Learn and Classify remove duplicate tokens before counting.
Model Model
// LearningResults maps token -> class -> how many times Learn counted that
// token for that class. Its length is used as the vocabulary size.
LearningResults map[string]map[Class]int
// PriorProbabilities maps class -> natural logarithm of
// (documents of that class / all documents), as recomputed by Learn.
PriorProbabilities map[Class]float64
// NDocumentByClass maps class -> number of documents learned for it.
NDocumentByClass map[Class]int
// NFrequencyByClass maps class -> total number of tokens counted for it.
NFrequencyByClass map[Class]int
// NAllDocument is the total number of documents passed to Learn.
NAllDocument int
}
// NewClassifier builds an empty Classifier that uses the given model. Every
// map field is allocated, so the result can be passed to Learn right away;
// NAllDocument starts at zero.
func NewClassifier(model Model) Classifier {
	var fresh Classifier
	fresh.Model = model
	fresh.LearningResults = map[string]map[Class]int{}
	fresh.PriorProbabilities = map[Class]float64{}
	fresh.NDocumentByClass = map[Class]int{}
	fresh.NFrequencyByClass = map[Class]int{}
	return fresh
}
// NewClassifierFromFile opens the file at path and gob-decodes a Classifier
// from it.
//
// If the file cannot be opened, the zero Classifier and the open error are
// returned. Otherwise the decoded value is returned together with whatever
// error (possibly nil) the gob decoder reported.
func NewClassifierFromFile(path string) (Classifier, error) {
	var loaded Classifier

	file, openErr := os.Open(path)
	if openErr != nil {
		return loaded, openErr
	}
	defer file.Close()

	// Success and failure hand back the same pair, so no branch is needed.
	decodeErr := gob.NewDecoder(file).Decode(&loaded)
	return loaded, decodeErr
}
// NewClassifierFromFileStream returns a new Classifier decoded from the
// gob-encoded byte stream read from fl, for example an already opened file or
// an in-memory buffer.
//
// The returned error is whatever the gob decoder reported (nil on success).
// On failure the Classifier is returned as the decoder left it and should not
// be used.
func NewClassifierFromFileStream(fl io.Reader) (Classifier, error) {
	classifier := Classifier{}
	// Success and failure return the same pair, so no extra branch is needed.
	err := gob.NewDecoder(fl).Decode(&classifier)
	return classifier, err
}
// Learn updates the classifier with docs and then recomputes the prior of
// every class seen so far.
//
// For each document it increments the document count of the document's class
// and, for every token, both the class's total token count and the per-token
// count for that class. When the model is MultinomialBoolean, duplicate tokens
// within a document are removed first, so each distinct token counts once.
//
// Priors are stored as the natural logarithm of
// (documents of the class / all documents).
//
// Learn also works on a zero-value Classifier: any map that is still nil is
// allocated before use instead of panicking on the first write.
func (classifier *Classifier) Learn(docs ...Document) {
	// Writing to a nil map panics, and a Classifier built as Classifier{} or
	// decoded from incomplete data may have nil maps.
	if classifier.LearningResults == nil {
		classifier.LearningResults = make(map[string]map[Class]int)
	}
	if classifier.PriorProbabilities == nil {
		classifier.PriorProbabilities = make(map[Class]float64)
	}
	if classifier.NDocumentByClass == nil {
		classifier.NDocumentByClass = make(map[Class]int)
	}
	if classifier.NFrequencyByClass == nil {
		classifier.NFrequencyByClass = make(map[Class]int)
	}

	classifier.NAllDocument += len(docs)

	for _, doc := range docs {
		classifier.NDocumentByClass[doc.Class]++

		tokens := doc.Tokens
		if classifier.Model == MultinomialBoolean {
			tokens = classifier.removeDuplicate(doc.Tokens...)
		}

		for _, token := range tokens {
			classifier.NFrequencyByClass[doc.Class]++

			// Look the inner map up once and create it on first sight of
			// the token.
			countByClass, exist := classifier.LearningResults[token]
			if !exist {
				countByClass = make(map[Class]int)
				classifier.LearningResults[token] = countByClass
			}
			countByClass[doc.Class]++
		}
	}

	// Every prior depends on the total document count, so all of them are
	// refreshed, not only those of the classes touched in this call.
	for class, nDocument := range classifier.NDocumentByClass {
		classifier.PriorProbabilities[class] = math.Log(float64(nDocument) / float64(classifier.NAllDocument))
	}
}
// Classify scores tokens against every class that has a prior and reports the
// most probable one.
//
// It returns:
//   - the log posterior score of each class: its log prior plus, for every
//     token, log((count of token in class + 1) / (tokens in class + vocabulary
//     size)), i.e. add-one smoothed likelihoods;
//   - the class with the highest score (the empty Class if there are no
//     classes);
//   - whether that best class is certain, which is false when no class exists
//     or when another class has exactly the same highest score.
//
// When the model is MultinomialBoolean, duplicate tokens are removed first.
//
// Every class with a prior receives the likelihood terms, including classes
// whose learned documents contained no tokens. Without that, such a class
// would keep its bare prior and outrank classes that have real evidence.
func (classifier Classifier) Classify(tokens ...string) (map[Class]float64, Class, bool) {
	if classifier.Model == MultinomialBoolean {
		tokens = classifier.removeDuplicate(tokens...)
	}

	nVocabulary := len(classifier.LearningResults)
	posteriorProbabilities := make(map[Class]float64, len(classifier.PriorProbabilities))

	for class, priorProb := range classifier.PriorProbabilities {
		posterior := priorProb

		// With an empty vocabulary no token has ever been learned and the
		// denominator would be zero, so the score stays at the prior.
		if nVocabulary > 0 {
			// A class missing from NFrequencyByClass has seen zero tokens.
			denominator := float64(classifier.NFrequencyByClass[class] + nVocabulary)
			for _, token := range tokens {
				// Unknown tokens and classes read as 0 from the nested maps.
				nToken := classifier.LearningResults[token][class]
				posterior += math.Log(float64(nToken+1) / denominator)
			}
		}

		posteriorProbabilities[class] = posterior
	}

	var (
		bestClass   Class
		highestProb float64
		certain     bool
		// found records whether a candidate has been chosen yet; 0 is a
		// valid log-probability and cannot serve as the "unset" marker.
		found bool
	)
	for class, prob := range posteriorProbabilities {
		switch {
		case !found || prob > highestProb:
			found = true
			certain = true
			bestClass = class
			highestProb = prob
		case prob == highestProb:
			// Another class ties with the current best.
			certain = false
		}
	}

	return posteriorProbabilities, bestClass, certain
}
// SaveClassifierToFile gob-encodes the classifier into the file at path,
// creating the file or truncating an existing one.
//
// It returns the first error met while creating the file, encoding, or
// closing the file. The close error is reported because a write that failed
// to reach the disk may only surface at Close.
func (classifier Classifier) SaveClassifierToFile(path string) error {
	fl, err := os.Create(path)
	if err != nil {
		return err
	}

	if err := gob.NewEncoder(fl).Encode(&classifier); err != nil {
		// Best-effort cleanup: the encode error is the one worth reporting.
		_ = fl.Close()
		return err
	}

	return fl.Close()
}
// removeDuplicate returns tokens with repeated values dropped, keeping the
// first occurrence of each token in its original position.
//
// Keeping the input order makes the result deterministic, so callers that sum
// floating-point terms per token get the same value on every run. The
// returned slice is always non-nil and never aliases the input.
func (classifier *Classifier) removeDuplicate(tokens ...string) []string {
	seen := make(map[string]struct{}, len(tokens))
	newTokens := make([]string, 0, len(tokens))

	for _, token := range tokens {
		if _, duplicate := seen[token]; duplicate {
			continue
		}
		seen[token] = struct{}{}
		newTokens = append(newTokens, token)
	}

	return newTokens
}