##Not pretty

In [None]:

import numpy as np

class mult:
  "Слой перемножения матриц: y = X @ W"
  def __init__(self,w,x):
    self.w,self.x=w,x
  def __call__(self):
    self.res=self.x@self.w
    return self.res
  def backward(self,grad):
    self.w_grad=self.x.T@grad
    self.x_grad=grad@self.w.T
    return self.x_grad

class addC:
  "Добавление константы (смещения)"
  def __init__(self,b):
    self.b=b
  def __call__(self,res):
    self.res=res+self.b
    return self.res
  def backward(self,grad):
    self.b_grad=np.sum(grad,axis=0)
    return grad

class LL:
  "Линейный слой"
  def __init__(self,w,b):
    self.w,self.b=w*np.sqrt(2/w.shape[0]),b
    self.grad=None
    self.l,self.x=None,None
  def forward(self,l):
    if isinstance(l,LL):
      self.l=l
      self.x=self.l.y
    else:
      self.x=l
    self.mult=mult(self.w,self.x)
    self.add=addC(self.b)
    self.y=self.add(self.mult())
    return self
  def backward(self,grad):
    grad=self.add.backward(grad)
    grad=self.mult.backward(grad)
    self.w_grad=self.mult.w_grad
    self.b_grad=self.add.b_grad
    #print(f'LAYER w grad: {self.wgrad}, b grad: {self.bgrad}')
    return grad

class relu:
  "Нелинейность, ReLU"
  def __init__(self):
    self.grad,self.x=None,None
  def forward(self,x):
    self.x=x
    self.y=np.clip(self.x,0,np.inf)
    return self
  def backward(self,grad):
    self.grad=(self.x > 0).astype(float)
    #print(f'RELU grad: {self.grad}')
    return grad*self.grad

class mse:
  "Ошибка по формуле MSE"
  def __init__(self,ytrue):
    self.ytrue=ytrue
    self.grad=None
  def forward(self,ypred):
    self.ypred=ypred
    self.n=len(self.ytrue)
    self.sq_er=(self.ypred-self.ytrue)**2
    self.sum_ll=sumE()
    self.loss=self.sum_ll(self.sq_er)/self.n
    return self
  def backward(self):
    self.grad=2*(self.ypred-self.ytrue)/self.n
    #print(f'MSE grad: {self.grad}')
    return self.grad[None,:]

class sumE:
  "Сложение элементов одной матрицы"
  def __init__(self):
    self.shape=None
  def forward(self,x):
    self.shape=x.shape
    self.y=x.sum()
    return self.y
  def __call__(self,x):
    return self.forward(x)
  def backward(self,grad):
    self.grad=np.ones(self.shape)
    return self.grad*grad

class sumM:
  "Сложение матриц"
  def __init__(self):
    self.shapes=None
  def forward(self,tensors):
    self.shapes=[t.shape for t in tensors]
    self.y=tensors[0]
    for t in tensors[1:]:
      self.y+=t
    return self
  def backward(self,grad):
    return [grad]*len(self.shapes)

class concat:
  "Конкатенация тензоров"
  def __init__(self,axis):
    self.shapes=None
    self.axis=axis
  def forward(self,*tensors):
    self.shapes=[t.shape for t in tensors]
    self.y=np.concatenate(tensors,axis=self.axis)
    return self
  def __call__(self,*tensors):
    return self.forward(*tensors)
  def backward(self,grad):
    grads=[]
    start_idx=0
    for shape in self.shapes:
      end_idx=start_idx+shape[self.axis]
      slices=[]
      for dim in range(grad.ndim):
        if dim==self.axis:
          slices.append(slice(start_idx,end_idx))
        else:
          slices.append(slice(None))
      grads.append(grad[tuple(slices)])
      start_idx=end_idx
    return grads

In [None]:
class Optimizer:
  def __init__(self,x,y,layers,lr=0.001,type='adamw',betas=(0.9,0.99),wd=0.001,eps=10**(-8)):
    self.x,self.y=x,y
    self.type=type.lower()
    self.layers=layers
    self.denses=layers.getLLs()
    self.lr,self.wd=lr,wd
    self.mom1,self.mom2=betas
    self.eps=eps
    self.params=layers.params()
    self.step_count=0
    self.m=[np.zeros_like(p) for p in self.params]
    self.v=[np.zeros_like(p) for p in self.params]
  def epoch(self,x,y,i):
    for layer in self.layers.lls:
        x=layer.forward(x).y

    self.loss=mse(y).forward(x)
    print(f'{i+1}. loss:',self.loss.loss)

    self.grad=self.loss.backward()
    for layer in reversed(self.layers.lls):
      self.grad=layer.backward(self.grad)

  def step(self):
    self.step_count+=1
    if self.type=='sgd':
      for i, ll in enumerate(self.denses):
        grad_mean_w=ll.w_grad.mean(axis=0)
        grad_mean_b=ll.b_grad.mean(axis=0)
        ll.w-=self.lr*grad_mean_w
        ll.b-=self.lr*grad_mean_b
    elif self.type=='adamw':
      for i, ll in enumerate(self.denses):
        grad_mean_w=ll.w_grad.mean(axis=0)
        self.m[i]=self.mom1*self.m[i]+(1-self.mom1)*grad_mean_w
        self.v[i]=self.mom2*self.v[i]+(1-self.mom2)*(grad_mean_w**2)
        m_corr=self.m[i]/(1-self.mom1**self.step_count)
        v_corr=self.v[i]/(1-self.mom2**self.step_count)
        ll.w-=self.lr*(m_corr / (np.sqrt(v_corr) + self.eps) + self.wd*ll.w)

        grad_mean_b=ll.b_grad.mean(axis=0)
        ll.b-=self.lr*grad_mean_b
  def fit(self,steps):
    for i in range(steps):
      self.epoch(self.x,self.y,i)
      self.step()

In [None]:
class Layers:
  def __init__(self,lls):
    self.lls=lls
    self.parameters=[]
  def getLLs(self):
    self.dense_lls=[]
    for ll in self.lls:
      if type(ll)==LL:
        self.dense_lls.append(ll)
    return self.dense_lls
  def params(self):
    self.parameters=[]
    for ll in self.lls:
      if type(ll)==LL:
        self.parameters.append(ll.w)
    return self.parameters

In [None]:
ll1=LL(np.random.rand(20,15),np.random.rand(15))
ll2=relu()
ll3=LL(np.random.rand(15,10),np.random.rand(10))
ll4=relu()
ll5=LL(np.random.rand(10,5),np.random.rand(5))
ll6=relu()
ll7=LL(np.random.rand(5,1),np.random.rand(1))
layers=[ll1,ll2,ll3,ll4,ll5,ll6,ll7]
layers=Layers(layers)
x,y=np.random.rand(5,20),np.random.rand(5).reshape(-1,1)
opt=Optimizer(x,y,layers,lr=0.1,betas=(0.9,0.99))
opt.fit(4)

1. loss: 479.7989441964023
2. loss: 0.22695112268683335
3. loss: 0.5507728725686756
4. loss: 0.6964999874120906


###Iris classifying

In [None]:
from sklearn import datasets

# Load the Iris dataset
iris = datasets.load_iris()

# Access features (X) and target (y)
X = iris.data  # Features (sepal length, sepal width, petal length, petal width)
y = iris.target # Target (species: 0 for setosa, 1 for versicolor, 2 for virginica)

# Access feature names and target names
feature_names = iris.feature_names
target_names = iris.target_names

print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)
print("Feature names:", feature_names)
print("Target names:", target_names)

Features (X) shape: (150, 4)
Target (y) shape: (150,)
Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target names: ['setosa' 'versicolor' 'virginica']


In [None]:
y=y[:,None]

In [None]:
ll1=LL(np.random.rand(4,15),np.random.rand(15))
ll2=relu()
ll3=LL(np.random.rand(15,10),np.random.rand(10))
ll4=relu()
ll5=LL(np.random.rand(10,5),np.random.rand(5))
ll6=relu()
ll7=LL(np.random.rand(5,1),np.random.rand(1))
layers=[ll1,ll2,ll3,ll4,ll5,ll6,ll7]
layers=Layers(layers)
opt=Optimizer(X,y,layers,lr=0.1,betas=(0.5,0.5))
opt.fit(50)

1. loss: 819454.0391322155
2. loss: 1590.162177188977
3. loss: 93.03753718906862
4. loss: 6.8514885148656735
5. loss: 2.076232445048056
6. loss: 2.8053036626956978
7. loss: 3.5277286962525576
8. loss: 3.998147827762829
9. loss: 4.298224286985275
10. loss: 4.489563474382215
11. loss: 4.622065489470174
12. loss: 4.71481285108289
13. loss: 4.779711645410926
14. loss: 4.826985584508983
15. loss: 4.859029159165757
16. loss: 4.881541040250126
17. loss: 4.895723989499301
18. loss: 4.902883091403396
19. loss: 4.906568804543523
20. loss: 4.907037143144102
21. loss: 4.903682464937312
22. loss: 4.896529746267512
23. loss: 4.88660409162969
24. loss: 4.873331661348823
25. loss: 4.85634394473958
26. loss: 4.834620234537198
27. loss: 4.805915241973571
28. loss: 4.766364689604884
29. loss: 4.701626669165847
30. loss: 4.579219579674438
31. loss: 4.2447794608406335
32. loss: 3.3404093742031575
33. loss: 1.5476209543377437
34. loss: 0.5567548623235404
35. loss: 0.499567036567186
36. loss: 0.6569693226592

In [None]:
ll7.y[:50].mean(),ll7.y[50:100].mean(),ll7.y[100:].mean()

(np.float64(-0.09704974280592289),
 np.float64(0.9624718185878698),
 np.float64(1.7032070809668485))

###Housing

In [None]:
from sklearn.datasets import fetch_california_housing

# Load the dataset
housing = fetch_california_housing()

# Access features and target
X = housing.data  # Features
y = housing.target[:,None]  # Target (median house value)

In [None]:
ll1=LL(np.random.rand(8,150),np.random.rand(150))
ll2=relu()
ll3=LL(np.random.rand(150,100),np.random.rand(100))
ll4=relu()
ll5=LL(np.random.rand(100,50),np.random.rand(50))
ll6=relu()
ll7=LL(np.random.rand(50,1),np.random.rand(1))
layers=[ll1,ll2,ll3,ll4,ll5,ll6,ll7]
layers=Layers(layers)
opt=Optimizer(X,y,layers,lr=0.01,betas=(0.9,0.99))
opt.fit(40)

1. loss: 20125856734.039833
2. loss: 7511471032.612467
3. loss: 2631984486.001109
4. loss: 878349412.7770873
5. loss: 276030781.8890592
6. loss: 78761380.25008124
7. loss: 18965288.410181224
8. loss: 3277109.2206771155
9. loss: 276219.62045823276
10. loss: 6448.968111263429
11. loss: 144.06329621914762
12. loss: 10.145576309419875
13. loss: 2.015314044121069
14. loss: 2.1112561333893787
15. loss: 2.2060471999775104
16. loss: 2.297279697382872
17. loss: 2.3843438864131365
18. loss: 2.4670683569830794
19. loss: 2.5452726579384026
20. loss: 2.6189822083361918
21. loss: 2.68814855965768
22. loss: 2.7528260351475904
23. loss: 2.813175095290644
24. loss: 2.8693582084340776
25. loss: 2.92157476541411
26. loss: 2.9700206760535677
27. loss: 3.0148964164700436
28. loss: 3.056408784857749
29. loss: 3.0947656496905465
30. loss: 3.130165677696078
31. loss: 3.1628049977557633
32. loss: 3.192870004118795
33. loss: 3.2205401704327166
34. loss: 3.245984678068353
35. loss: 3.269365310043478
36. loss: 3.

idk if it works but numbers do go down

##Pretty

In [183]:
import numpy as np

# ===== Ядро =====

class Mult:
  """Слой перемножения матриц: y = X @ W"""
  def __init__(self, W: np.ndarray):
    self.W = W
    self.x = None
    self.W_grad = None
    self.x_grad = None

  def __call__(self, x: np.ndarray) -> np.ndarray:
    self.x = x
    return x @ self.W

  def backward(self, grad: np.ndarray) -> np.ndarray:
    self.W_grad = self.x.T @ grad
    self.x_grad = grad @ self.W.T
    return self.x_grad


class AddC:
  """Добавление константы (смещения): y = X + b"""
  def __init__(self, b: np.ndarray):
    self.b = b
    self.b_grad = None

  def __call__(self,x: np.ndarray) -> np.ndarray:
    return x + self.b

  def backward(self, grad: np.ndarray) -> np.ndarray:
    self.b_grad = grad.sum(axis=0)
    return grad


class Linear:
  """Линейный слой: y = X @ W + b"""
  def __init__(self, in_features: int, out_features: int):
    # инициализация Кайминга
    self.W = np.random.randn(in_features, out_features) * np.sqrt(2.0 / in_features)
    self.b = np.random.rand(out_features)
    self.mult = Mult(self.W)
    self.add = AddC(self.b)

  def __call__(self,  x: np.ndarray) -> np.ndarray:
    return self.forward(x)

  def forward(self, x: np.ndarray) -> np.ndarray:
    self.x = x
    z = self.mult(x)
    self.y = self.add(z)
    return self.y

  def backward(self, grad: np.ndarray) -> np.ndarray:
    grad = self.add.backward(grad)
    grad = self.mult.backward(grad)
    self.W_grad = self.mult.W_grad
    self.b_grad = self.add.b_grad
    return grad


class ReLU:
  """Функция активации ReLU"""
  def __call__(self, x: np.ndarray) -> np.ndarray:
    self.x = x
    return np.clip(x, 0, np.inf)

  def backward(self, grad: np.ndarray) -> np.ndarray:
    self.grad = (self.x > 0).astype(float)
    return grad * self.grad


class MSE:
  """Фукнция потерь по формуле MSE"""
  def __call__(self, y_pred: np.ndarray, y_true: np.ndarray) -> float:
    n = y_true.shape[0]
    sq_er = (y_pred - y_true) ** 2
    self.sum_layer = SumE()
    loss = self.sum_layer(sq_er) / n
    return loss

  def backward(self, y_pred, y_true) -> np.ndarray:
    n = y_true.shape[0]
    grad = 2 * (y_pred - y_true) / n
    return self.sum_layer.backward(grad)

class SumE:
  """Сложение элементов одной матрицы"""
  def __init__(self):
    self.shape = None

  def forward(self, x):
    self.shape = x.shape
    self.y = x.sum()
    return self.y

  def __call__(self, x):
    return self.forward(x)

  def backward(self, grad):
    self.grad = np.ones(self.shape)
    return self.grad * grad


class SumM:
  """Сложение матриц"""
  def __init__(self):
    self.shapes=None

  def forward(self,*tensors):
    self.shapes=[t.shape for t in tensors]
    y = sum(tensors)
    return y

  def __call__(self,*x):
    return self.forward(*x)

  def backward(self,grad):
    return [grad]*len(self.shapes)


class Concat:
  """Конкатенация тензоров"""
  def __init__(self,axis):
    self.shapes=None
    self.axis=axis

  def forward(self,*tensors):
    self.shapes=[t.shape for t in tensors]
    y=np.concatenate(tensors,axis=self.axis)
    return y

  def __call__(self,*tensors):
    return self.forward(*tensors)

  def backward(self,grad):
    grads=[]
    start_idx=0
    for shape in self.shapes:
      end_idx=start_idx+shape[self.axis]
      slices=[]
      for dim in range(grad.ndim):
        if dim==self.axis:
          slices.append(slice(start_idx,end_idx))
        else:
          slices.append(slice(None))
      grads.append(grad[tuple(slices)])
      start_idx=end_idx
    return grads

In [4]:
# ===== Оптимизатор =====
from tqdm.auto import tqdm

class Optimizer:
  def __init__(self, x, y, model, form = 'adamw', loss_fn = MSE(), val=(None, None), lr=1e-3, betas=(0.9, 0.999), wd=1e-2, eps=1e-8):
    self.x, self.y = x, y
    self.form = form.lower()
    self.model = model
    self.loss_fn = loss_fn
    self.val_x, self.val_y = val
    self.lr, self.wd = lr, wd
    self.mom1, self.mom2 = betas
    self.eps=eps
    self.step_count=0

    #параметры
    self.layers = [l for l in model.layers if isinstance(l, Linear)]

    self.m_w = [np.zeros_like(l.W) for l in self.layers]
    self.v_w = [np.zeros_like(l.W) for l in self.layers]
    self.m_b = [np.zeros_like(l.b) for l in self.layers]
    self.v_b = [np.zeros_like(l.b) for l in self.layers]

  def step(self):
    self.step_count += 1

    if self.form == 'sgd':
      for i, layer in enumerate(self.layers):
        grad_w = layer.W_grad
        grad_b = layer.b_grad

        layer.W -= self.lr * grad_w
        layer.b -= self.lr * grad_b

    elif self.form == 'adamw':
      for i, layer in enumerate(self.layers):
        grad_w = layer.W_grad
        grad_b = layer.b_grad

        #инерция
        self.m_w[i] = self.mom1 * self.m_w[i] + (1 - self.mom1) * grad_w
        self.v_w[i] = self.mom2 * self.v_w[i] + (1 - self.mom2) * (grad_w ** 2)
        self.m_b[i] = self.mom1 * self.m_b[i] + (1 - self.mom1) * grad_b
        self.v_b[i] = self.mom2 * self.v_b[i] + (1 - self.mom2) * (grad_b ** 2)

        m_w_corr = self.m_w[i] / (1 - self.mom1 ** self.step_count)
        v_w_corr = self.v_w[i] / (1 - self.mom2 ** self.step_count)
        m_b_corr = self.m_b[i] / (1 - self.mom1 ** self.step_count)
        v_b_corr = self.v_b[i] / (1 - self.mom2 ** self.step_count)

        layer.W -= self.lr * self.wd * layer.W
        layer.W -= self.lr * (m_w_corr / (np.sqrt(v_w_corr) + self.eps))

        layer.b -= self.lr * (m_b_corr / (np.sqrt(v_b_corr) + self.eps))

  def epoch(self, x, y):
    pred = self.model.forward(x)
    loss = self.loss_fn(pred, y)

    grad = self.loss_fn.backward(pred, y)
    self.model.backward(grad)
    return loss

  def fit(self, steps):
    progress_bar = tqdm(range(steps))
    for i in progress_bar:
      loss = self.epoch(self.x, self.y)

      if self.val_x is not None:
        val_pred = self.model.forward(self.val_x)
        val_loss = self.loss_fn(val_pred, self.val_y)
        progress_bar.write(f"{self.step_count}. Train loss: {loss:.6f}, validation loss: {val_loss:.6f}")
      else:
        progress_bar.write(f"{self.step_count}. Train loss: {loss:.6f}")

      self.step()

In [5]:
class Model:
  def __init__(self, layers):
    self.layers=layers

  def forward(self, x):
    for layer in self.layers:
      x = layer(x)
    return x

  def backward(self, grad):
    for layer in reversed(self.layers):
      if hasattr(layer, 'backward'):
        grad = layer.backward(grad)
    return grad

###Validation test

In [154]:
model = Model([
        Linear(20, 15), ReLU(),
        Linear(15, 10), ReLU(),
        Linear(10, 5), ReLU(),
        Linear(5, 2)
])

#случайные данные с разным распределением, ошибка на валидации должна оставаться
X = np.random.rand(5,20)
y = np.random.rand(5,2)
X_val = np.random.rand(2,20) * 100 + 50
y_val = np.random.rand(2,2) * 0.1 - 5

opt = Optimizer(X,y,model, val=(X_val,y_val))
opt.fit(30)

  0%|          | 0/30 [00:00<?, ?it/s]

0. Train loss: 1.136572, validation loss: 44.829052
1. Train loss: 1.034230, validation loss: 48.447307
2. Train loss: 0.938136, validation loss: 51.807335
3. Train loss: 0.848309, validation loss: 53.104535
4. Train loss: 0.765176, validation loss: 54.505295
5. Train loss: 0.689312, validation loss: 55.823606
6. Train loss: 0.623331, validation loss: 57.125582
7. Train loss: 0.567421, validation loss: 58.512243
8. Train loss: 0.527502, validation loss: 59.980287
9. Train loss: 0.496438, validation loss: 61.575485
10. Train loss: 0.468988, validation loss: 62.281105
11. Train loss: 0.444185, validation loss: 62.302628
12. Train loss: 0.421614, validation loss: 62.324059
13. Train loss: 0.400970, validation loss: 62.345391
14. Train loss: 0.381982, validation loss: 62.366616
15. Train loss: 0.364370, validation loss: 62.387721
16. Train loss: 0.347893, validation loss: 62.408696
17. Train loss: 0.332356, validation loss: 62.429528
18. Train loss: 0.317599, validation loss: 62.450205
19.

In [155]:
#валидация на той же функции, что и тренировка, ошибка на валидации должна опускаться
def real_func(x):
    return np.sin(x[:, 0:1] * 2 * np.pi) + 0.1 * np.random.randn(x.shape[0], 2)

X = np.random.rand(100, 20)
y = real_func(X)
X_val = np.random.rand(20,20)
y_val = real_func(X_val)

opt = Optimizer(X,y,model, val=(X_val,y_val))
opt.fit(15)

  0%|          | 0/15 [00:00<?, ?it/s]

0. Train loss: 1.383958, validation loss: 1.516581
1. Train loss: 1.362474, validation loss: 1.502840
2. Train loss: 1.342718, validation loss: 1.489635
3. Train loss: 1.324666, validation loss: 1.476133
4. Train loss: 1.307909, validation loss: 1.463433
5. Train loss: 1.292184, validation loss: 1.451298
6. Train loss: 1.277444, validation loss: 1.439743
7. Train loss: 1.263388, validation loss: 1.428967
8. Train loss: 1.249910, validation loss: 1.418717
9. Train loss: 1.237126, validation loss: 1.408589
10. Train loss: 1.224595, validation loss: 1.399285
11. Train loss: 1.213394, validation loss: 1.390724
12. Train loss: 1.203499, validation loss: 1.384070
13. Train loss: 1.194322, validation loss: 1.378504
14. Train loss: 1.186267, validation loss: 1.375091


In [149]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()

X = housing.data
y = housing.target[:, None]
X_train, X_val, y_train, y_val = train_test_split(X,y)
model = Model([
        Linear(8, 150), ReLU(),
        Linear(150, 50), ReLU(),
        Linear(50, 1)
])
opt=Optimizer(X_train,y_train,model,val=(X_val, y_val),lr=0.1)
opt.fit(20)

  0%|          | 0/20 [00:00<?, ?it/s]

0. Train loss: 1059012.679459, validation loss: 1008847.347886
1. Train loss: 305131235.167306, validation loss: 292202523.508772
2. Train loss: 5495440.187411, validation loss: 5260216.855171
3. Train loss: 5.759236, validation loss: 5.886300
4. Train loss: 8.590898, validation loss: 8.551802
5. Train loss: 504.252582, validation loss: 491.130208
6. Train loss: 504629.727727, validation loss: 482634.333950
7. Train loss: 13.426520, validation loss: 13.189935
8. Train loss: 3.947811, validation loss: 3.986313
9. Train loss: 3.861136, validation loss: 3.898829
10. Train loss: 3.784998, validation loss: 3.821969
11. Train loss: 3.717715, validation loss: 3.754060
12. Train loss: 1158.454052, validation loss: 1100.317638
13. Train loss: 3.590584, validation loss: 3.626147
14. Train loss: 3.557855, validation loss: 3.592603
15. Train loss: 3.515477, validation loss: 3.549798
16. Train loss: 3.477409, validation loss: 3.511342
17. Train loss: 3.443144, validation loss: 3.476726
18. Train lo

In [152]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()

X = diabetes.data
y = diabetes.target[:, None]
X_train, X_val, y_train, y_val = train_test_split(X,y)

model = Model([
        Linear(10, 150), ReLU(),
        Linear(150, 100), ReLU(),
        Linear(100, 50), ReLU(),
        Linear(50, 1)
])
opt=Optimizer(X_train,y_train,model,lr=0.01,val=(X_val,y_val),betas=(0.5,0.5))
opt.fit(40)

  0%|          | 0/40 [00:00<?, ?it/s]

0. Train loss: 28850.023342, validation loss: 30569.992933
1. Train loss: 26608.318587, validation loss: 28245.809289
2. Train loss: 24550.597916, validation loss: 26105.619468
3. Train loss: 22043.216669, validation loss: 23492.286841
4. Train loss: 18914.178992, validation loss: 20221.188744
5. Train loss: 15419.609668, validation loss: 16547.808494
6. Train loss: 11759.160966, validation loss: 12662.463660
7. Train loss: 8393.893194, validation loss: 9020.671579
8. Train loss: 6102.273544, validation loss: 6399.026542
9. Train loss: 5875.959575, validation loss: 5816.776855
10. Train loss: 6242.119917, validation loss: 6085.205222
11. Train loss: 5585.835289, validation loss: 5612.449171
12. Train loss: 5534.040467, validation loss: 5763.644087
13. Train loss: 5468.631228, validation loss: 5715.549566
14. Train loss: 5277.807576, validation loss: 5401.393754
15. Train loss: 5228.885471, validation loss: 5324.259860
16. Train loss: 5127.192361, validation loss: 5358.732524
17. Train 

###Concat, SumM

In [167]:
# Concat
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[10, 11, 12]])
c1, c2 = Concat(0), Concat(1)
print(c1(a,b),'\n\n',c2(a,b.T))

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]] 

 [[ 1  2  3 10]
 [ 4  5  6 11]
 [ 7  8  9 12]]


In [179]:
print(c1.backward(np.random.rand(4, 3).round(1)), '\n\n', c2.backward(np.random.rand(3, 4).round(1)))

[array([[0.7, 0.3, 0.6],
       [0.4, 0.3, 0.7],
       [0.3, 0.5, 0.9]]), array([[0.9, 1. , 0.3]])] 

 [array([[0.5, 0.2, 0.3],
       [0.6, 0.2, 0.3],
       [1. , 0. , 0.9]]), array([[0.4],
       [0.9],
       [0.2]])]


In [184]:
# SumM
s = SumM()
s(a,b)

array([[11, 13, 15],
       [14, 16, 18],
       [17, 19, 21]])

In [185]:
s.backward(np.random.rand(3,3).round(1))

[array([[0.8, 0.5, 0.3],
        [0.3, 0.4, 0.2],
        [0.5, 0.7, 0.5]]),
 array([[0.8, 0.5, 0.3],
        [0.3, 0.4, 0.2],
        [0.5, 0.7, 0.5]])]