In [1]:
import numpy as np
np.set_printoptions(suppress=True)

# Algorithm

In [2]:
# define hypothesis function
def hypothesis(P, X):
    """Placeholder hypothesis h(P, X); as written it ignores both arguments.

    Later cells in this notebook redefine `hypothesis` with concrete models.

    Returns:
        The constant 0.
    """
    # define your hypothesis function here...
    return 0

In [3]:
# define loss function
def loss(P, X, Y):
    """Placeholder loss; as written it ignores all three arguments.

    Later cells in this notebook redefine `loss` with a concrete squared loss.

    Returns:
        The constant 0.
    """
    # define your loss function here...
    return 0

In [4]:
# estimate the gradient numerically
def gradient(P, X, Y):
    """Approximate the gradient of `loss` with respect to P by forward differences.

    Each component is (loss(P + dh * e_i) - loss(P)) / dh with dh = 1e-7,
    where e_i is the i-th unit vector.

    Args:
        P: 1-D parameter vector (anything supporting len() and `P + ndarray`).
        X: inputs, passed through to `loss` unchanged.
        Y: targets, passed through to `loss` unchanged.

    Returns:
        A float ndarray of length len(P) holding the estimated partial derivatives.
    """
    nP = len(P)
    dh = 1e-7  # finite-difference step size
    # The unperturbed loss does not depend on the loop index, so evaluate it
    # once instead of once per parameter (n + 1 loss calls instead of 2n).
    base_loss = loss(P, X, Y)
    D = np.zeros(nP)
    for _i in range(nP):
        h = np.zeros(nP)
        h[_i] = dh  # perturb only the _i-th parameter
        D[_i] = (loss(P + h, X, Y) - base_loss) / dh
    return D

In [5]:
# BGD Algorithm
def BGD(lr, niter, init_P, X, Y, vbose=1000):
    """Run batch gradient descent on `loss` using the numerical `gradient`.

    Args:
        lr: learning rate (step size multiplier).
        niter: number of update steps to perform.
        init_P: initial parameter vector. NOTE: it is updated IN PLACE, so
            after the call the caller's array holds the final parameters.
            It must be a float ndarray for the in-place subtraction to work.
        X: inputs, forwarded to `gradient` and `loss`.
        Y: targets, forwarded to `gradient` and `loss`.
        vbose: print the loss every `vbose` iterations. A falsy value
            (0 or None) disables progress output instead of raising
            ZeroDivisionError.

    Returns:
        The same array object as `init_P`, holding the final parameters.
    """
    P = init_P  # deliberate alias: callers read the result back from init_P
    for _i in range(niter):
        P -= lr * gradient(P, X, Y)
        # Guard against vbose == 0 / None before taking the modulo.
        if vbose and _i % vbose == 0:
            print("iter:%s, loss: %s" % (_i, loss(P, X, Y)))
    return P

# Example 1: linear regression fitted with BGD

In [6]:
def hypothesis(P, X):
    """Linear model h(X) = [X with a ones column] . P.

    A column of ones is inserted along the last axis of X *before the last
    feature column* (np.insert at index -1), so the augmented row is
    [x_0, ..., x_{k-2}, 1, x_{k-1}]. Consequently the second-to-last entry
    of P acts as the intercept.

    Args:
        P: parameter vector with one more entry than X has features.
        X: samples with features on the last axis.

    Returns:
        The dot product of the augmented samples with P.
    """
    ones_column = np.ones(X.shape[:-1])
    augmented = np.insert(X, -1, ones_column, axis=-1)
    return augmented.dot(P)

def loss(P, X, Y):
    """Squared loss: sum of squared residuals divided by the number of rows.

    Args:
        P: parameters forwarded to `hypothesis`.
        X: inputs forwarded to `hypothesis`.
        Y: targets compared against the predictions.

    Returns:
        sum((Y - hypothesis(P, X)) ** 2) / len(residuals).
    """
    residual = Y - hypothesis(P, X)
    squared = residual ** 2
    return np.sum(squared) / len(squared)

In [7]:
# Build a synthetic regression problem from known parameters.
X = np.random.rand(2000, 3)                                # random samples, shape (2000, 3)
para_t = np.array([115, -12, -3, 44],dtype=np.float64)     # true_parameters
# np.random.rand(1) draws a single value, so the same offset is added to every
# target (it is not independent per-sample noise).
Y = hypothesis(para_t, X) + np.random.rand(1)
print(X)
print(Y)

[[ 0.0209773   0.55442478  0.40230261]
 [ 0.91941811  0.93079949  0.891273  ]
 [ 0.92775917  0.36862968  0.77294295]
 ..., 
 [ 0.54159876  0.86101946  0.66561922]
 [ 0.34669842  0.76764868  0.41479685]
 [ 0.3830076   0.2927962   0.87878888]]
[  11.40233849  131.72123201  134.21997016 ...,   79.1806016    46.8513268
   77.14076235]


In [8]:
# Fit from an all-zero start. BGD updates `para` in place and also returns it.
para = np.array([0, 0, 0, 0],dtype=np.float64)              # init_para
print("Para_h: %s" % BGD(0.01, 10000, para, X, Y))          # fitted parameters
print("Para_t: %s" % para_t)                                # true parameters, for comparison

iter:0, loss: 6051.55206953
iter:1000, loss: 51.0304688772
iter:2000, loss: 5.0273756556
iter:3000, loss: 0.68016847629
iter:4000, loss: 0.101467038909
iter:5000, loss: 0.0154896290322
iter:6000, loss: 0.00237660362235
iter:7000, loss: 0.000365047416709
iter:8000, loss: 5.60845545895e-05
iter:9000, loss: 8.61695483353e-06
Para_h: [ 114.99777458  -12.0023768    -2.05467269   43.99780741]
Para_t: [ 115.  -12.   -3.   44.]


# Example 2: small ReLU neural network fitted with BGD

In [9]:
def reLU(X):
    """Rectified linear unit: element-wise max(X, 0).

    Uses np.maximum rather than `X * (X > 0)`: the product form yields
    NaN for -inf (since -inf * 0 is NaN) and -0.0 for negative inputs.

    Args:
        X: scalar or array-like of numbers.

    Returns:
        X with every negative entry replaced by 0.
    """
    return np.maximum(X, 0)

def hypothesis(P, X):
    """Feed-forward network with layer sizes 4 -> 5 -> 3 -> 1 and ReLU after every layer.

    The flat parameter vector P (47 entries) is laid out as:
        P[0:20]   weights of layer 1, reshaped to (4, 5)
        P[20:25]  bias of layer 1,    reshaped to (1, 5)
        P[25:40]  weights of layer 2, reshaped to (5, 3)
        P[40:43]  bias of layer 2,    reshaped to (1, 3)
        P[43:46]  weights of layer 3, reshaped to (3, 1)
        P[46:]    bias of layer 3,    reshaped to (1, 1)

    Args:
        P: flat parameter vector as described above.
        X: inputs with 4 features on the last axis (channel_last).

    Returns:
        Network outputs reshaped to X.shape[:-1]. The output layer is also
        passed through ReLU, so predictions are never negative.
    """
    w1, c1, w2, c2, w3, c3 = np.split(P, [20, 25, 40, 43, 46])
    layers = [
        (np.reshape(w1, [4, 5]), np.reshape(c1, [1, 5])),
        (np.reshape(w2, [5, 3]), np.reshape(c2, [1, 3])),
        (np.reshape(w3, [3, 1]), np.reshape(c3, [1, 1])),
    ]
    activation = X
    for weights, bias in layers:
        activation = reLU(np.dot(activation, weights) + bias)
    return np.reshape(activation, X.shape[:-1])

def loss(P, X, Y):
    """Squared loss of the network: summed squared error over the row count.

    Args:
        P: parameters forwarded to `hypothesis`.
        X: inputs forwarded to `hypothesis`.
        Y: targets compared against the predictions.

    Returns:
        sum((Y - hypothesis(P, X)) ** 2) / len(errors).
    """
    squared_error = np.square(Y - hypothesis(P, X))
    n_rows = len(squared_error)
    return np.sum(squared_error) / n_rows

In [10]:
# Load Data
# Column 0 of data.csv is used as the target; the remaining columns are the features.
data = np.loadtxt(r'data.csv', delimiter=',')
Y = data[:, 0]
X = data[:, 1:]
X.shape

(145L, 4L)

In [11]:
# Positional split: the first N rows are the training set, the rest the test set.
N = 100
Ytrain = Y[:N]
Xtrain = X[:N, :]
Ytest = Y[N:]
Xtest = X[N:, :]

In [12]:
# Random start for the 47 network parameters (unseeded, so each run differs).
para = np.random.rand(47)
print(para)
print(BGD(1e-3, 5000, para, Xtrain, Ytrain, 1000))
# BGD modified `para` in place, so this evaluates the trained parameters on the
# test split; the square root of the squared loss is reported as RMSE.
print("RMSE:%s" % loss(para, Xtest, Ytest) ** 0.5)

[ 0.11502993  0.7358873   0.84195528  0.15264337  0.70938728  0.40116462
  0.29715343  0.01161423  0.98751459  0.51898138  0.3052794   0.97377811
  0.24570355  0.27332767  0.41296655  0.6270404   0.01407165  0.94889511
  0.92389007  0.21228204  0.40835712  0.73435473  0.47494257  0.96420357
  0.16536048  0.58040284  0.71845432  0.02250436  0.39780797  0.52894973
  0.6271247   0.24378116  0.70566794  0.75150505  0.52303977  0.46572041
  0.38570375  0.82973294  0.22769555  0.00956479  0.88959746  0.08604776
  0.74092044  0.39916155  0.48975517  0.16620989  0.08976887]
iter:0, loss: 32.43822935
iter:1000, loss: 32.43822935
iter:2000, loss: 32.43822935
iter:3000, loss: 32.43822935
iter:4000, loss: 32.43822935
[-1499.87007729 -1332.71624756 -1449.43622863 -1279.39304842 -1134.08999165
    -2.30927045    -2.11235968    -2.60899944    -1.32458917    -1.5315744
   -35.5721882    -30.92046242   -34.44284464   -30.33154564   -26.72979193
  -239.70794445  -213.63817917  -231.42180092  -204.09121 

In [13]:
# Show prediction, target and residual (target - prediction) side by side,
# one row per test sample.
print("Y_hat, Y, dY")
print(np.stack([hypothesis(para, Xtest), Ytest, Ytest - hypothesis(para, Xtest)], axis=1))

Y_hat, Y, dY
[[  -0.      11.982   11.982]
 [  -0.      16.674   16.674]
 [  -0.      12.62    12.62 ]
 [  -0.      12.905   12.905]
 [  -0.      11.615   11.615]
 [  -0.       9.321    9.321]
 [  -0.      12.962   12.962]
 [  -0.      16.932   16.932]
 [  -0.       9.648    9.648]
 [  -0.      18.35    18.35 ]
 [  -0.      17.333   17.333]
 [  -0.      12.015   12.015]
 [  -0.      11.32    11.32 ]
 [  -0.      22.337   22.337]
 [  -0.      19.035   19.035]
 [  -0.      12.205   12.205]
 [  -0.      17.078   17.078]
 [  -0.      25.528   25.528]
 [  -0.      24.021   24.021]
 [  -0.      32.197   32.197]
 [  -0.      26.652   26.652]
 [  -0.      20.164   20.164]
 [  -0.      14.132   14.132]
 [  -0.      21.41    21.41 ]
 [  -0.      23.244   23.244]
 [  -0.      29.845   29.845]
 [  -0.      32.318   32.318]
 [  -0.      21.988   21.988]
 [  -0.      35.229   35.229]
 [  -0.      17.467   17.467]
 [  -0.      22.828   22.828]
 [  -0.      33.154   33.154]
 [  -0.      32.228   32.22

In [14]:
# Inspect the numerical gradient of the test loss at the current parameters.
print(gradient(para, Xtest, Ytest))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [15]:
# Retry from a fixed, hard-coded starting point and a much smaller learning
# rate (1e-7 instead of 1e-3).
para = np.array([0.18031512, 0.26144132, 0.38982893, 0.24511579, 0.05005886,
                 0.54133903, 0.24155313, 0.19897874, 0.75746696, 0.15247064,
                 0.98798449, 0.16585703, 0.56516654, 0.74297450, 0.17421394,
                 0.05613916, 0.63075679, 0.97097814, 0.21706297, 0.93147248,
                 0.57844359, 0.33599461, 0.92718726, 0.64963381, 0.94634624,
                 0.12168908, 0.74619497, 0.91831590, 0.97285490, 0.51888180,
                 0.83496196, 0.32444122, 0.35395017, 0.83409773, 0.23632133,
                 0.03673830, 0.03378825, 0.35617706, 0.21709822, 0.78136090,
                 0.02825722, 0.46167223, 0.76209731, 0.06966091, 0.38056810,
                 0.15605312, 0.53215890])
print(para)
print(BGD(1e-7, 5000, para, Xtrain, Ytrain, 1000))
# `para` was updated in place by BGD, so this is the test RMSE of the trained parameters.
print("RMSE:%s" % loss(para, Xtest, Ytest) ** 0.5)

[ 0.18031512  0.26144132  0.38982893  0.24511579  0.05005886  0.54133903
  0.24155313  0.19897874  0.75746696  0.15247064  0.98798449  0.16585703
  0.56516654  0.7429745   0.17421394  0.05613916  0.63075679  0.97097814
  0.21706297  0.93147248  0.57844359  0.33599461  0.92718726  0.64963381
  0.94634624  0.12168908  0.74619497  0.9183159   0.9728549   0.5188818
  0.83496196  0.32444122  0.35395017  0.83409773  0.23632133  0.0367383
  0.03378825  0.35617706  0.21709822  0.7813609   0.02825722  0.46167223
  0.76209731  0.06966091  0.3805681   0.15605312  0.5321589 ]
iter:0, loss: 56115.2138837
iter:1000, loss: 2.58377124655
iter:2000, loss: 2.20030671483
iter:3000, loss: 1.88540868509
iter:4000, loss: 1.6693258371
[ 0.09804242  0.21426132  0.35534215  0.2449763   0.0293073   0.5412685
  0.24145624  0.19890934  0.75745327  0.15240756  0.98741808  0.16454579
  0.56423266  0.74273672  0.17327861  0.0499122   0.62212443  0.96479593
  0.21583607  0.9258397   0.57840696  0.335945    0.92715172

In [16]:
# Show prediction, target and residual (target - prediction) side by side,
# one row per test sample.
print("Y_hat, Y, dY")
print(np.stack([hypothesis(para, Xtest), Ytest, Ytest - hypothesis(para, Xtest)], axis=1))

Y_hat, Y, dY
[[  12.94779536   11.982        -0.96579536]
 [  13.91592103   16.674         2.75807897]
 [  14.50662844   12.62         -1.88662844]
 [  13.98164501   12.905        -1.07664501]
 [  13.58468011   11.615        -1.96968011]
 [  13.53883282    9.321        -4.21783282]
 [  15.09846598   12.962        -2.13646598]
 [  16.56636125   16.932         0.36563875]
 [  14.16351877    9.648        -4.51551877]
 [  16.20534652   18.35          2.14465348]
 [  16.09374572   17.333         1.23925428]
 [  15.1957621    12.015        -3.1807621 ]
 [  16.6266333    11.32         -5.3066333 ]
 [  19.59711161   22.337         2.73988839]
 [  20.05470698   19.035        -1.01970698]
 [  19.70285744   12.205        -7.49785744]
 [  20.83927246   17.078        -3.76127246]
 [  22.58207847   25.528         2.94592153]
 [  22.71320348   24.021         1.30779652]
 [  25.21393977   32.197         6.98306023]
 [  24.89956644   26.652         1.75243356]
 [  25.49845204   20.164        -5.3344520

In [17]:
# Inspect the numerical gradient of the test loss at the trained parameters.
print(gradient(para, Xtest, Ytest))

[  186.86534503    -4.51131314     1.61126238   -28.17512424   -36.96292964
     0.6940455     -0.01675858     0.00598718    -0.1046908     -0.13734322
     4.77533902    -0.11532421     0.04118206    -0.72026893    -0.94493657
    76.01309647    -1.83569966     0.65563199   -11.46518059   -15.04137941
     0.48372556    -0.01168431     0.00417174    -0.07295952    -0.09571693
   -17.87732032    59.51485861   -15.15835308   -57.52956412   191.52660911
   -48.77993092   -93.02748936   309.72683106   -78.87918528   -43.29830503
   144.1567025    -36.71317131   -50.25134854   167.28076076   -42.60851398
    -0.31470066     1.04758399    -0.26683352  1489.33174643  1046.11175971
  2328.47006998     4.02859825]
