# coding: utf-8
'''
Bernoulli Naive Bayes classifier for spam filtering.
Inspired by Luis Munoz's MATLAB code for the Naive Bayes classifier model.
/!\ run with Python 3
'''
from __future__ import division
import numpy as np
from helpers.logging import tls, log
def process_parameters(p, tolerance=1e-10):
    '''
    Helper function for `fit`.
    Replaces NaNs in the estimated probabilities, and clamps values of
    exactly 0 or 1 to `tolerance` and `1 - tolerance` respectively, so
    that the log() calls in `predict` do not produce -inf or NaN.
    '''
p[np.isnan(p)] = tolerance
p[p == 0] = tolerance
p[p == 1] = 1 - tolerance
return p
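
## A quick illustration of the clamping (the values here are made up):
## a likelihood estimate of exactly 0 or 1 would make log(p) or
## log(1 - p) diverge in `predict`, hence the `tolerance` nudge.
##
##     >>> process_parameters(np.array([0.0, 0.5, 1.0, np.nan]))
##     # -> approximately [1e-10, 0.5, 1 - 1e-10, 1e-10]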


@log
def fit(features, labels,
        ## params
        ham_label,
        spam_label=1,
        **kwargs
        ):
    '''
    TRAINING PHASE
    Returns the parameters for a Bernoulli Naive Bayes model.
    Inputs:
    - features: N * D Numpy matrix of binary values (0 and 1)
        with N: the number of training examples
        and  D: the number of features for each example
    - labels:   N * 1 Numpy vector whose entries are `ham_label` and
        `spam_label`
    Outputs:
    - parameters: (prior_ham, prior_spam, likeli_ham, likeli_spam)
    '''
    ## setup
    X, Y = features, labels
    N, D = X.shape   ## N: number of training examples, D: number of features
    tls.logger.debug('X: (%s, %s)\tY: %s' % (N, D, str(Y.shape)))
    ## estimate the prior probability of each class
    prior_ham  = np.sum(Y == ham_label) / N
    prior_spam = 1 - prior_ham
    tls.logger.debug('- prior ham:  %s' % prior_ham)
    tls.logger.debug('- prior spam: %s' % prior_spam)
## estimate likelihood parameters for each class
## looks at presence of features in each class
indices_ham = np.ravel(np.where(Y == ham_label))
indices_spam = np.ravel(np.where(Y == spam_label))
N_ham = len(indices_ham)
N_spam = len(indices_spam)
likeli_ham = np.sum(X[indices_ham], axis=0) / N_ham
likeli_spam = np.sum(X[indices_spam], axis=0) / N_spam
likeli_ham, likeli_spam = map(lambda p: p.reshape((D, 1)), [likeli_ham, likeli_spam])
likeli_ham, likeli_spam = map(process_parameters, [likeli_ham, likeli_spam])
tls.logger.debug('- likelihood ham: %s' % np.ravel(likeli_ham))
tls.logger.debug('- likelihood spam: %s' % np.ravel(likeli_spam))
return prior_ham, prior_spam, likeli_ham, likeli_spam
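
## Toy illustration of the estimates computed in `fit` (numbers made up):
## if 3 out of N_ham = 4 ham examples contain feature d, then
## likeli_ham[d] = 3/4, an estimate of P(x_d = 1 | ham);
## `process_parameters` then keeps each estimate strictly inside (0, 1).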


@log
def predict(parameters, features,
            ## params
            ham_label,
            spam_label=1,
            ):
    '''
    TEST PHASE
    Works in log space: multiplying many small probabilities directly
    would under/overflow, so log-posteriors are summed instead.
    Inputs:
    - parameters: model parameters returned by `fit`, i.e.
        (prior_ham, prior_spam, likeli_ham, likeli_spam)
    - features:   N * D Numpy matrix of binary values (0 and 1)
    Outputs:
    - predicted:  N * 1 Numpy vector of predicted labels
    '''
## notation
prior_ham, prior_spam, likeli_ham, likeli_spam = parameters
X = features
N, D = X.shape
## apply model
    ## Bernoulli Naive Bayes: the absence of a feature is informative too
log_posterior_ham = np.log(prior_ham) + \
np.dot( X, np.log( likeli_ham)) + \
np.dot((1-X), np.log(1-likeli_ham))
log_posterior_spam = np.log(prior_spam) + \
np.dot( X, np.log( likeli_spam)) + \
np.dot((1-X), np.log(1-likeli_spam))
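    ## i.e. for each class y:
    ##   log P(y | x)  ∝  log P(y) + sum_d [ x_d * log(theta_yd)
    ##                                       + (1 - x_d) * log(1 - theta_yd) ]
    ## where theta_yd = P(x_d = 1 | y) is the per-class Bernoulli parameter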
tls.logger.debug('- log posterior for ham: %s' % np.ravel(log_posterior_ham))
tls.logger.debug('- log posterior for spam: %s' % np.ravel(log_posterior_spam))
## no need to normalise since we are just interested in which
## posterior is higher (ie. which label is most likely given the data)
log_posterior_ham, log_posterior_spam = map(np.ravel, [log_posterior_ham, log_posterior_spam])
    ## calculate output
    ## assign the class with the higher posterior
    ## the boolean comparison yields 0/1 (ham/spam); remap to -1/1 if needed
    predicted = (log_posterior_spam > log_posterior_ham)
    if ham_label == -1:
        predicted = predicted * 2 - 1
tls.logger.debug('Predicted: %s' % predicted)
return predicted
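

## Minimal usage sketch (not part of the original module): trains on a tiny
## hand-made dataset and predicts on the same examples. Assumes the project's
## `helpers.logging` module is importable and that its @log decorator sets up
## `tls.logger`, since `fit` and `predict` both rely on it.
if __name__ == '__main__':
    ## 4 examples, 3 binary features; label 0 = ham, 1 = spam
    X_train = np.array([[1, 0, 0],
                        [1, 1, 0],
                        [0, 1, 1],
                        [0, 0, 1]])
    y_train = np.array([0, 0, 1, 1])
    params = fit(X_train, y_train, ham_label=0, spam_label=1)
    ## expected output: [False False  True  True] (i.e. ham, ham, spam, spam)
    print(predict(params, X_train, ham_label=0))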