## Machine learning package based on Theano.
## The interface of the machine learning methods
## resembles the sklearn BaseEstimator.
import theano
import theano.tensor as T
import numpy as np
import scipy.optimize
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split


class LogisticRegression(BaseEstimator):

    def __init__(self, classes, validation_size=0.2,
                 optimizer='sgd', n_epochs=None, batch_size=None,
                 learning_rate=None, verbose=True):
        """
        classes = class labels, which must be coded as exactly range(n_classes)
        validation_size = fraction of the data held out for validation
        optimizer = {'sgd', 'cg'}; generally 'sgd' for large-scale data and 'cg' for smaller data
        n_epochs = number of training epochs; defaults to 1000 for sgd and 50 for cg
        batch_size = size of each minibatch; defaults to 600
        learning_rate = step size, only effective for sgd learning; defaults to 0.13
        """
        if not np.array_equal(classes, np.arange(len(classes))):
            raise ValueError('classes must be coded as exactly range(n_classes)')
        if optimizer not in ('sgd', 'cg'):
            raise ValueError('unimplemented optimization method: %r' % optimizer)
        self.classes = classes
        self.validation_size = validation_size
        self.optimize = self._sgd if optimizer == 'sgd' else self._cg
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.verbose = verbose
        self.W_, self.b_ = None, None
    def fit(self, X, y):
        ## (re-)initialize the parameters on every call to fit
        self.n_feats = X.shape[1]
        self.n_classes = len(self.classes)
        ## initialize parameters
        if self.optimize == self._cg:
            ## cg flattens W and b into a single parameter vector theta,
            ## as required by the scipy.optimize interface
            self.theta_ = theano.shared(
                value=np.zeros(self.n_feats * self.n_classes + self.n_classes,
                               dtype=theano.config.floatX),
                name='theta', borrow=True)
            self.W_ = self.theta_[:self.n_feats * self.n_classes].reshape(
                (self.n_feats, self.n_classes))
            self.b_ = self.theta_[self.n_feats * self.n_classes:]
        else:
            ## sgd keeps W and b as separate shared variables
            self.W_ = theano.shared(value=np.zeros((self.n_feats, self.n_classes),
                                                   dtype=theano.config.floatX),
                                    name='W', borrow=True)
            self.b_ = theano.shared(value=np.zeros((self.n_classes,),
                                                   dtype=theano.config.floatX),
                                    name='b', borrow=True)
        ## optimize
        self.optimize(X, y)
        return self
    def partial_fit(self, X, y):
        """
        incremental learning on new batches of data
        """
        ## only initialize the parameters on the first call
        if self.W_ is None or self.b_ is None:
            return self.fit(X, y)
        else:
            ## otherwise continue optimizing from the current parameters
            self.optimize(X, y)
            return self
    def predict(self, X):
        return self._predict(X)[0]

    def predict_proba(self, X):
        return self._predict(X)[1]

    def score(self, X, y):
        ## NOTE: returns the misclassification rate (lower is better),
        ## not the accuracy that sklearn estimators conventionally return
        cost, error = self._build_symbols(X, y)
        return error.eval()

    def _predict(self, X):
        v_X = self._share_data(X)
        predict_model = self._build_predict_model(v_X)
        y_pred, p_y_given_x = predict_model()
        return y_pred, p_y_given_x
    def _sgd(self, X, y):
        learning_rate = self.learning_rate or 0.13
        n_epochs = self.n_epochs or 1000
        batch_size = self.batch_size or 600
        print('Training LogisticRegression model with SGD ...')
        ## split the data and move it into shared variables
        train_X, validation_X, train_y, validation_y = train_test_split(
            X, y, test_size=self.validation_size)
        v_train_X = self._share_data(train_X)
        v_validation_X = self._share_data(validation_X)
        v_train_y = self._share_data(train_y, dtype='int32')
        v_validation_y = self._share_data(validation_y, dtype='int32')
        ## compile the symbolic model functions
        train_model = self._build_train_model(v_train_X, v_train_y,
                                              batch_size, learning_rate)
        validate_model = self._build_validate_model(v_validation_X,
                                                    v_validation_y, batch_size)
        ## number of batches
        n_train_batches = v_train_X.get_value(borrow=True).shape[0] // batch_size
        n_validation_batches = v_validation_X.get_value(borrow=True).shape[0] // batch_size
        ## iterative optimization with early stopping
        ## ALGORITHM: for each epoch, iterate over the training minibatches;
        ## check the validation error periodically; extend the patience limit
        ## whenever the validation error improves significantly; stop once we
        ## run out of patience
        ## ITERATION PARAMETERS
        ## look at this many minibatches regardless
        patience = 5000
        ## multiply the patience by this factor when a significant new best is found
        patience_increase = 2
        ## how much relative improvement is considered significant
        improvement_threshold = 0.995
        ## how often to validate during the iterative optimization;
        ## here at most once per epoch
        validation_frequency = min(n_train_batches, patience // 2)
        ## iterative process
        epoch = 0
        out_of_patience = False
        best_validation_error = np.inf
        ## copy the parameter values: the shared variables are updated in
        ## place, so keeping references to them would not preserve the best state
        best_params = (self.W_.get_value(borrow=False),
                       self.b_.get_value(borrow=False))
        ## run epochs until converged or out of patience
        while (epoch < n_epochs) and (not out_of_patience):
            epoch += 1
            ## each training minibatch
            for minibatch_index in range(n_train_batches):
                ## take one gradient step on the minibatch
                minibatch_cost = train_model(minibatch_index)
                ## total number of minibatches seen so far
                iter = (epoch - 1) * n_train_batches + minibatch_index
                ## validate when it is time
                if (iter + 1) % validation_frequency == 0:
                    ## current validation error rate
                    this_validation_error = np.mean([validate_model(i)
                                                     for i in range(n_validation_batches)])
                    if self.verbose:
                        print('epoch %i, minibatch %i/%i, validation error %f %%' % (
                            epoch, minibatch_index + 1, n_train_batches,
                            this_validation_error * 100.))
                    ## extend the patience if a significant improvement is found
                    if this_validation_error < best_validation_error:
                        if this_validation_error < best_validation_error * improvement_threshold:
                            patience = max(patience, iter * patience_increase)
                        best_validation_error = this_validation_error
                        best_params = (self.W_.get_value(borrow=False),
                                       self.b_.get_value(borrow=False))
                ## quit the iterative optimization when out of patience
                if patience <= iter:
                    out_of_patience = True
                    break
        if self.verbose:
            print('Optimization complete with best validation error %f %%' % (
                best_validation_error * 100.))
        ## restore the best parameters found on the validation set
        self.W_.set_value(best_params[0], borrow=False)
        self.b_.set_value(best_params[1], borrow=False)
    def _cg(self, X, y):
        n_epochs = self.n_epochs or 50
        batch_size = self.batch_size or 600
        print('Training LogisticRegression model with CG ...')
        ## split the data and move it into shared variables
        train_X, validation_X, train_y, validation_y = train_test_split(
            X, y, test_size=self.validation_size)
        v_train_X = self._share_data(train_X)
        v_validation_X = self._share_data(validation_X)
        v_train_y = self._share_data(train_y, dtype='int32')
        v_validation_y = self._share_data(validation_y, dtype='int32')
        ## compile the symbolic model functions
        validate_model = self._build_validate_model(v_validation_X,
                                                    v_validation_y, batch_size)
        train_cost = self._build_train_cost(v_train_X, v_train_y, batch_size)
        train_grad = self._build_train_grad(v_train_X, v_train_y, batch_size)
        ## number of batches
        n_train_batches = v_train_X.get_value(borrow=True).shape[0] // batch_size
        n_validation_batches = v_validation_X.get_value(borrow=True).shape[0] // batch_size
        ## helper functions for scipy.optimize: each loads the candidate theta
        ## into the shared variable (casting back to floatX, since scipy works
        ## in float64) and evaluates the compiled Theano function
        def set_theta(theta_value):
            self.theta_.set_value(np.asarray(theta_value,
                                             dtype=theano.config.floatX),
                                  borrow=True)

        def train_fn(theta_value):
            set_theta(theta_value)
            return np.mean([train_cost(i) for i in range(n_train_batches)])

        def train_fn_grad(theta_value):
            set_theta(theta_value)
            return sum(train_grad(i) for i in range(n_train_batches)) / n_train_batches

        def callback(theta_value):
            set_theta(theta_value)
            validation_loss = np.mean([validate_model(i)
                                       for i in range(n_validation_batches)])
            print('validation error %f' % validation_loss)

        best_theta = scipy.optimize.fmin_cg(
            f=train_fn,
            x0=np.zeros((self.n_feats + 1) * self.n_classes,
                        dtype=theano.config.floatX),
            fprime=train_fn_grad,
            callback=callback if self.verbose else None,
            disp=0,
            maxiter=n_epochs)
        set_theta(best_theta)
    def _build_train_model(self, v_train_X, v_train_y, batch_size, learning_rate):
        ## compile one sgd update step on the minibatch selected by `index`
        index = T.lscalar()
        X = v_train_X[index * batch_size: (index + 1) * batch_size]
        y = v_train_y[index * batch_size: (index + 1) * batch_size]
        cost, error = self._build_symbols(X, y)
        g_W, g_b = T.grad(cost=cost, wrt=[self.W_, self.b_])
        updates = [(self.W_, self.W_ - learning_rate * g_W),
                   (self.b_, self.b_ - learning_rate * g_b)]
        train_model = theano.function(inputs=[index],
                                      outputs=cost,
                                      updates=updates)
        return train_model

    def _build_train_cost(self, v_train_X, v_train_y, batch_size):
        ## compile the cost on the minibatch selected by `index`
        index = T.lscalar()
        X = v_train_X[index * batch_size: (index + 1) * batch_size]
        y = v_train_y[index * batch_size: (index + 1) * batch_size]
        cost, error = self._build_symbols(X, y)
        train_cost = theano.function(inputs=[index],
                                     outputs=cost)
        return train_cost

    def _build_train_grad(self, v_train_X, v_train_y, batch_size):
        ## compile the gradient of the cost w.r.t. theta on the minibatch
        ## selected by `index`
        index = T.lscalar()
        X = v_train_X[index * batch_size: (index + 1) * batch_size]
        y = v_train_y[index * batch_size: (index + 1) * batch_size]
        cost, error = self._build_symbols(X, y)
        train_grad = theano.function(inputs=[index],
                                     outputs=T.grad(cost, self.theta_))
        return train_grad

    def _build_validate_model(self, v_validation_X, v_validation_y, batch_size):
        ## compile the error rate on the minibatch selected by `index`
        index = T.lscalar()
        X = v_validation_X[index * batch_size: (index + 1) * batch_size]
        y = v_validation_y[index * batch_size: (index + 1) * batch_size]
        _, error = self._build_symbols(X, y)
        validate_model = theano.function(inputs=[index],
                                         outputs=error)
        return validate_model
    def _build_predict_model(self, v_X):
        X = T.matrix('X')
        p_y_given_x = T.nnet.softmax(T.dot(X, self.W_) + self.b_)
        ## map the argmax indices back to the class labels
        classes = theano.shared(value=np.asarray(self.classes), borrow=True)
        y_pred = classes[T.argmax(p_y_given_x, axis=1)]
        return theano.function(inputs=[],
                               outputs=[y_pred, p_y_given_x],
                               givens={X: v_X})
    def _build_symbols(self, X, y):
        ## class-membership probabilities: softmax(X.W + b)
        p_y_given_x = T.nnet.softmax(T.dot(X, self.W_) + self.b_)
        classes = theano.shared(value=np.asarray(self.classes), borrow=True)
        y_pred = classes[T.argmax(p_y_given_x, axis=1)]
        ## mean misclassification rate
        error = T.mean(T.neq(y_pred, y))
        ## NOTE: this indexing requires y to be coded as exactly 0..n_classes-1
        negative_log_likelihood = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
        return negative_log_likelihood, error
    def _share_data(self, data, dtype=theano.config.floatX):
        ## store the data as floatX (shared storage on the GPU must be floatX),
        ## then cast to the requested dtype (e.g. 'int32' for labels)
        shared_data = theano.shared(value=np.asarray(data,
                                                     dtype=theano.config.floatX),
                                    borrow=True)
        return T.cast(shared_data, dtype=dtype)
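

if __name__ == '__main__':
    ## A minimal usage sketch, not part of the original module: it assumes
    ## scikit-learn's make_classification is available and simply exercises
    ## fit / predict / score on synthetic data.
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=6000, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    X = X.astype(theano.config.floatX)
    clf = LogisticRegression(classes=np.arange(3), optimizer='sgd',
                             n_epochs=100, verbose=False)
    clf.fit(X, y)
    print('predicted labels:', clf.predict(X)[:10])
    print('misclassification rate: %f' % clf.score(X, y))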