/
cl-naive-bayes.lisp
116 lines (101 loc) · 3.62 KB
/
cl-naive-bayes.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
(in-package :cl-user)
(defpackage cl-naive-bayes
(:nicknames :nbayes)
(:use :cl)
(:import-from :anaphora
:aif
:sif
:slet
:it)
(:export :make-category-data
:make-learned-store))
(in-package :cl-naive-bayes)
(cl-annot:enable-annot-syntax)
@export
(defstruct category-data
(count 0)
(sum-word-count 0)
(word-count (make-hash-table :test #'equal)))
@export
(defstruct learned-store
(category-data-hash (make-hash-table :test #'equal))
(num-document 0)
(num-word-kind 0))
(defun count-word-in-category (store word category)
(aif (gethash category (learned-store-category-data-hash store))
(aif (gethash word (category-data-word-count it))
it
0)
0))
(defun count-category (store category)
(aif (gethash category (learned-store-category-data-hash store))
(category-data-sum-word-count it)
0))
(defun count-word-kind (store)
(learned-store-num-word-kind store))
; using Laplace Smoothing
(defun calc-logged-likelihood (store word-lst category)
(let ((denomi (max (+ (count-category store category)
(count-word-kind store))
1)))
(loop for word in word-lst
sum (log (/ (1+ (count-word-in-category store word category))
denomi)))))
(defun calc-logged-prior-prob (store category)
(with-slots (category-data-hash num-document) store
(log (/ (category-data-count (gethash category category-data-hash))
num-document))))
(defun sort-category-with-logged-prob (store word-lst)
(let (lst)
(maphash #'(lambda (category v)
(declare (ignore v))
(push (cons category
(+ (calc-logged-prior-prob store category)
(calc-logged-likelihood store word-lst category)))
lst))
(learned-store-category-data-hash store))
(sort lst #'> :key #'cdr)))
@export
(defun sort-category-by-prob (store word-lst)
(mapcar #'car (sort-category-with-logged-prob store word-lst)))
@export
(defun sort-category-with-post-prob (store word-lst)
(let ((sorted (sort-category-with-logged-prob store word-lst))
(sum-post-prob-no-norm 0)
(first-log nil))
(if (null sorted) (return-from sort-category-with-post-prob nil))
(setf first-log (cdar sorted))
; Ex. (log x, log y, log z) -> (0, log y/x, log z/y)
(dolist (elem sorted)
(let ((post-prob-no-norm (exp (- (cdr elem) first-log))))
(setf (cdr elem) post-prob-no-norm)
(incf sum-post-prob-no-norm post-prob-no-norm)))
; normalize
(dolist (elem sorted)
(setf (cdr elem) (/ (cdr elem) sum-post-prob-no-norm)))
sorted))
(defun contains-word (store word)
(maphash #'(lambda (k cat-data)
(declare (ignore k))
(aif (gethash word (category-data-word-count cat-data))
(return-from contains-word it)))
(learned-store-category-data-hash store))
nil)
(defun 1+plus (x)
(if (null x)
1
(1+ x)))
(define-modify-macro incf-plus () 1+plus)
@export
(defun learn-a-document (store word-lst category)
(with-slots (category-data-hash num-document num-word-kind) store
(incf num-document)
(slet (gethash category category-data-hash)
(if (null it)
(setf it (make-category-data)))
(incf (category-data-count it))
(dolist (word word-lst)
(incf (category-data-sum-word-count it))
(if (not (contains-word store word))
(incf num-word-kind))
(incf-plus (gethash word (category-data-word-count it)))))))