L_BFGS.py
# coding=utf-8
import numpy as np
import crf


def twoloop(s_list, y_list, grad):
"""
Subroutine of L-BFGS: two-loop recursion.
:param s_list: delta_x list : s_k = x_k+1 - x_k.
:param y_list: delta_grad list : y_k = grad(f)_k+1 - grad(f)_k
:param grad: gradient of this iter.
:return: H_k * grad_k ( negitive descent direction)
"""
    m = len(s_list)  # length of the memorized s_list and y_list.
    if m >= 1:
        # h0 is a scalar standing in for the initial inverse Hessian H_0 = h0 * I;
        # gamma = s^T y / y^T y estimates the inverse curvature along the latest step.
        h0 = 1.0 * np.dot(s_list[-1], y_list[-1]) / np.dot(y_list[-1], y_list[-1])
    else:
        h0 = 1
    a = np.empty((m,))
    q = grad.copy()
    rho = np.zeros((m,))
    for i in range(m - 1, -1, -1):
        rho[i] = 1.0 / np.dot(s_list[i], y_list[i])
        a[i] = rho[i] * np.dot(s_list[i], q)
        q -= a[i] * y_list[i]
    r = h0 * q
    for i in range(m):
        b = rho[i] * np.dot(y_list[i], r)
        r += s_list[i] * (a[i] - b)
    return r
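

# A minimal sanity check (hypothetical helper, not part of the original module):
# the two-loop recursion should reproduce H_k * grad, where H_k is built by the
# explicit BFGS inverse-Hessian update H_{i+1} = V_i^T H_i V_i + rho_i * s_i s_i^T
# with V_i = I - rho_i * y_i s_i^T, from random pairs with s^T y > 0.
def _check_twoloop(n=4, hist=3, seed=0):
    rng = np.random.RandomState(seed)
    s_list = [rng.randn(n) for _ in range(hist)]
    y_list = [s_i + 0.1 * rng.randn(n) for s_i in s_list]  # keeps s^T y > 0 here
    grad = rng.randn(n)
    h0 = np.dot(s_list[-1], y_list[-1]) / np.dot(y_list[-1], y_list[-1])
    H = h0 * np.eye(n)  # same initial scaling as twoloop
    for s_i, y_i in zip(s_list, y_list):
        rho_i = 1.0 / np.dot(s_i, y_i)
        V = np.eye(n) - rho_i * np.outer(y_i, s_i)
        H = V.T.dot(H).dot(V) + rho_i * np.outer(s_i, s_i)
    assert np.allclose(H.dot(grad), twoloop(s_list, y_list, grad))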


def lbfgs(x, corps,
          featureTS, words2tagids,
          priorfeatureE, m=10, ITER_MAX=100, fun=crf.getnegloglikelihood, gfun=crf.getgradients):
"""
If you want to know the principle of L-BFGS, please reference:
"Numerical Optimization(2nd Edition)" chapter7
Algorithm 7.4 and Algorithm 7.5 in page 178.
:param x: variable to be optimized.
:param corps:
:param featureTS:
:param words2tagids:
:param priorfeatureE:
:param m: length of memerized list.
:param ITER_MAX: The maximum number of iterations
:param fun:
:param gfun:
:return:
"""
    beta = 0.2  # shrink factor for the backtracking line search; it controls how fast t decays.
    alpha = 0.4  # sufficient-decrease constant of the backtracking line search.
    epsilon = 1e-4  # stopping threshold of the optimization.
    s, y = [], []
    iter = 0
    while iter < ITER_MAX:
        grad = gfun(priorfeatureE, x, corps, featureTS, words2tagids)
        if np.linalg.norm(grad) < epsilon:
            break
        delta_x = -1.0 * twoloop(s, y, grad)  # descent direction
        # Backtracking line search (Armijo condition).
        t = 1.0  # step length
        funcvalue = fun(x, corps, featureTS, words2tagids)
        # The backtracking line search is costly: every trial of x_new requires
        # evaluating fun (the likelihood), which recomputes log_Phi.
        while (fun(x + t * delta_x, corps, featureTS, words2tagids) > funcvalue + alpha * t * np.dot(
                grad, delta_x)):
            t *= beta
        x_new = x + t * delta_x
        sk = x_new - x
        yk = gfun(priorfeatureE, x_new, corps, featureTS, words2tagids) - grad
        if np.dot(sk, yk) > 0:  # the curvature condition holds: memorize sk and yk.
            s.append(sk)
            y.append(yk)
            if len(s) > m:  # discard the oldest sk, yk.
                s.pop(0)
                y.pop(0)
        iter += 1
        x = x_new
        print('iterations:%d, func_value:%f' % (iter, funcvalue))
    return x, fun(x, corps, featureTS, words2tagids)
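

if __name__ == '__main__':
    # A minimal standalone sketch (hypothetical, not part of the original module):
    # run the optimizer on the quadratic f(x) = 0.5 * ||x - 1||^2, passing None
    # for the CRF-specific arguments, which the toy fun/gfun below simply ignore.
    # Note that `import crf` at the top must still resolve for the module to load.
    target = np.ones(5)

    def toy_fun(x, corps, featureTS, words2tagids):
        return 0.5 * np.sum((x - target) ** 2)

    def toy_gfun(priorfeatureE, x, corps, featureTS, words2tagids):
        return x - target

    x_opt, f_opt = lbfgs(np.zeros(5), None, None, None, None,
                         fun=toy_fun, gfun=toy_gfun)
    print('distance to optimum: %e' % np.linalg.norm(x_opt - target))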