In [None]:
import matplotlib.pyplot as plt

def compute_y(x, W, bias):
  # dreapta de decizie: [x, y] * [W[0], W[1]] + b = 0
  # => x * W[0] + y * W[1] + b = 0
  # => y = (-x * W[0] - b) / W[1]
  return (-x * W[0] - bias) / (W[1] + 1e-10) # Adaugam 1e-10 ca să evitam impartirea la zero daca W[1] este exact 0.

def plot_decision_boundary(X, y , W, b, current_x, current_y):

  # Alegem doua puncte ca sa putem desena o linie intre ele
  x1 = -0.5
  y1 = compute_y(x1, W, b)
  x2 = 0.5
  y2 = compute_y(x2, W, b)

  # Stergem continutul ferestrei actuale (clf = clear figure)
  plt.clf()

  # Alegem culoarea (pentru clasa -1 o sa avem albastru, altfel rosu)
  color = 'r'
  if(current_y == -1):
    color = 'b'

  # Setam limitele plotului (mini-zoom)
  plt.ylim((-1, 2))
  plt.xlim((-1, 2))

  # Plotam toate punctele de train, separate pe clase:
  plt.plot(X[y == -1, 0], X[y == -1, 1], 'b+')
  plt.plot(X[y == 1, 0], X[y == 1, 1], 'r+')

  # Plotam exemplul curent - cel pe care modelul il procesează acum
  plt.plot(current_x[0], current_x[1], color+'s')

  # Desenam dreapta de decizie a modelului
  plt.plot([x1, x2] ,[y1, y2], 'black')
  plt.show(block=False)
  plt.pause(0.3)

In [None]:
import torch
torch.cuda.is_available()

In [None]:
!nvidia-smi

#### Exercitiul 2

Implementarea algoritmului widrow-hoff:
  - Initializam weight-urile cu 0
  - Amestecam datele de antrenare
  - Pentru fiecare exemplu din setul de antrenare updatam weight-urile perceptronului cu ajutorul gradientilor.
  - Repetam ultimii 2 pasi pentru un anumit numar de epoci.

In [97]:
import numpy as np

X = [2, 3, 3, 4]

W = [[5, 4, 6, 7]]

print(np.subtract(W[0], np.multiply(4, X)))

print(round(-0.6))

[-3 -8 -6 -9]
-1


In [126]:
import numpy as np
from sklearn.utils import shuffle

def widrow_hoff(X, Y, num_epochs, learning_rate):
  '''
  Params:
  X = Train inputs
  Y = Train Labels
  num_epochs = number of epochs the neural network will train for

  This method initializes and trains a neural network with two weights using the widrow-hoff method.
  '''
  X = np.array(X)
  Y = np.array(Y)
  
  features_count = len(X[0])
  input_count = len(X)

  W = np.ones((1, features_count))
  bias = 0

  for i in range(1, num_epochs + 1):
    X, Y = shuffle(X, Y, random_state=0)
    accuracy = []

    for t in range(0, input_count):
      temp_Y = np.add(X[t] @ W.T, bias)
      curr_y = temp_Y[0]
      loss = ((curr_y - Y[t]) ** 2) / 2
      accuracy.append(np.sign(curr_y) == Y[t])
      print(curr_y, Y[t])

      W[0] = np.subtract(W[0], np.multiply(learning_rate * (curr_y - Y[t]), X[t]))
      bias -= learning_rate * (curr_y - Y[t])
      
      # plot_decision_boundary(X, Y, W[0], bias, X[t], curr_y)
    
    print(f"Accuracy: {np.mean(accuracy)}")
    

In [127]:
x = np.array([[0,0], [0,1], [1,0], [1,1]])
y = np.array([-1, 1, 1, 1])
widrow_hoff(x, y, 70, 0.1)

# Does it work ?

1.0 1
2.0 1
0.8 1
-0.08000000000000002 -1
Accuracy: 1.0
0.748 1
-0.1468 -1
1.61308 1
0.545264 1
Accuracy: 1.0
1.5201032 1
0.5321905599999999 1
-0.25318377599999997 -1
0.5040162816 1
Accuracy: 1.0
-0.27826702655999996 -1
0.531039727936 1
0.5753919273024 1
1.5886896208883199 1
Accuracy: 1.0
0.542575617664256 1
1.5035676110889729 1
0.4945835814566758 1
-0.2740251325376625 -1
Accuracy: 1.0
0.5230693784191068 1
-0.29892955712580693 -1
1.4062622047534712 1
0.4076247039416845 1
Accuracy: 1.0
1.4028586025390928 1
0.44552804264552903 1
-0.335263956801204 -1
0.43473541801075244 1
Accuracy: 1.0
-0.34521110292215884 -1
0.4823094447008178 1
0.5327654538176025 1
1.5709808559147609 1
Accuracy: 1.0
0.5120161918711298 1
1.4972833607661067 1
0.46771654785560746 1
-0.3169971781225455 -1
Accuracy: 1.0
0.5058729560967405 1
-0.33588475591996503 -1
1.4186686451500563 1
0.39435179532268744 1
Accuracy: 1.0
1.4141976925405018 1
0.4326418977500496 1
-0.368282283404298 -1
0.4258424319644334 1
Accuracy: 1.0
-0.374

#### Exercitiul 3

Pentru exercitiul 3 putem vedea ca perceptronul nu mai este capabil sa invete o functie de decizie suficient de complexa pentru a discrimina setul de antrenare.

In [128]:
x = np.array([[0,0], [0,1], [1,0], [1,1]])
y = np.array([-1, 1, 1, -1])
widrow_hoff(x, y, 70, 0.1)

# Is it true ? ^

1.0 1
2.0 -1
0.3999999999999999 1
-0.24000000000000005 -1
Accuracy: 0.75
0.44399999999999995 1
-0.2604000000000001 -1
1.1812399999999998 -1
-0.07060800000000012 1
Accuracy: 0.5
0.7409895999999998 -1
-0.20468432000000014 1
-0.49905372800000003 -1
-0.12577131520000007 1
Accuracy: 0.25
-0.43657122368 -1
0.04304007020799999 1
0.1380881636671999 1
0.9421206954329601 -1
Accuracy: 0.75
-0.07795360815283214 1
0.5750752084336382 -1
-0.2750205801583565 1
-0.427449096255061 -1
Accuracy: 0.25
-0.07727155450117901 1
-0.376977031179437 -1
0.45345368557890375 -1
-0.3523968391153708 1
Accuracy: 0.25
0.28789694772830676 -1
-0.3394968608379581 1
-0.4442250213968815 -1
-0.25877779500942066 1
Accuracy: 0.25
-0.3739247397562513 -1
-0.06962976203191135 1
0.04305824314908002 1
0.7083120744711698 -1
Accuracy: 0.5
-0.10721582037497002 1
0.4172616162048129 -1
-0.2744027900381367 1
-0.30827062191863064 -1
Accuracy: 0.25
-0.08869516983864623 1
-0.268574042742903 -1
0.322387189784879 -1
-0.3397081550440828 1
Accur

#### Exercitiul 4

Definim in prima faza functiile de activare ale retelei, tanh si sigmoid si derivata tanh.

In [129]:
def tanh(x):
    return np.tanh(x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def derivative_tanh(x):
    return 1 - tanh(x) ** 2

Urmatoarea functie trece un input(X) prin retea si salveaza rezultatele intermediare:
- z_1 este rezultatul celui de-al 2-lea strat inainte de functia de activare
- a_1 este rezultatul functiei de activare(deci a_1=tanh(z_1)).
- z_2 este rezultatul ultimului strat inainte de functia de activare
- a_2 este rezultatul dupa aceasta (a_2=sigmoid(z_2))

In [140]:
def forward(X, W_1, b_1, W_2, b_2):

  z_1 = X @ W_1 + b_1
  a_1 = tanh(z_1)
  z_2 = a_1 @ W_2 + b_2
  a_2 = sigmoid(z_2)

  return z_1, a_1, z_2, a_2

In [135]:
def plot_decision(X_, W_1, W_2, b_1, b_2):

  # Sterge continutul ferestrei
  plt.clf()

  # Setam limitele pentru axelor X si Y
  plt.ylim((-0.5, 1.5))
  plt.xlim((-0.5, 1.5))

  # Generam multe puncte aleatoare in spatiul 2D
  xx = np.random.normal(0, 1, (100000))
  yy = np.random.normal(0, 1, (100000))

  # Combinam datele random cu cele reale (X_)
  X = np.array([xx, yy]).transpose()
  X = np.concatenate((X, X_))

  # Forward pass: rulam toate punctele prin retea sa vedem in ce clasa le clasifica
  _, _, _, output = forward(X, W_1, b_1, W_2, b_2)

  # Transformam output-ul in etichete binare (0 sau 1)
  y = np.squeeze(np.round(output)) # squeeze() elimina dimensiuni redundante (ex: (100000, 1) devine (100000,))

  # Punctele din clasa 0 (cu albastru) si din clasa 1 (cu rosu)
  plt.plot(X[y == 0, 0], X[y == 0, 1], 'b+')
  plt.plot(X[y == 1, 0], X[y == 1, 1], 'r+')

  plt.show(block=False)
  plt.pause(0.1)

Functia urmatoare se foloseste de rezultatele intermediare salvate dupa apelul functiei forward pentru a calcula derivata functiei de pierdere in functie de weight-urile retelei (w_1,b_1,w_2,b_2).

Formulele aplicate in functia de mai jos sunt obtinute prin chain rule:

h(x) = f(g(x))
h'(x) = g'(x)*f'(g(x))

In [144]:
def backward(a_1, a_2, z_1, W_2, X, Y, num_samples):
  """
  Params:

  X = inputs = datele de intrare
  Y = labels/ground-truth
  num_samples = batch_size (numărul de exemple)

  z_1 = hidden_input      (inputul in stratul ascuns, inainte de tanh: z1 = w1 * X + b1)
  a_1 = hidden_activation (iesirea stratului ascuns dupa aplicarea activarii tanh)

  W_2 = output_weights    (ponderile intre stratul ascuns si cel de iesire)
  a_2 = y_hat             (predictia finala a retelei dupa aplicarea functiei sigmoid)
  """

  dz_2 = a_2 - y # derivata functiei de pierdere (logistic loss) in functie de z
  dw_2 = (a_1.T @ dz_2) / num_samples
  # der(L/w_2) = der(L/z_2) * der(dz_2/w_2) = dz_2 * der((a_1 * W_2 + b_2)/ W_2)
  db_2 = sum(dz_2) / num_samples
  # der(L/b_2) = der(L/z_2) * der(z_2/b_2) = dz_2 * der((a_1 * W_2 + b_2)/ b_2)
  # primul strat
  da_1 = dz_2 @ W_2.T
  # der(L/a_1) = der(L/z_2) * der(z_2/a_1) = dz_2 * der((a_1 * W_2 + b_2)/ a_1)
  dz_1 = da_1 * derivative_tanh(z_1)
  # der(L/z_1) = der(L/a_1) * der(a_1/z1) = da_1 .* der((tanh(z_1))/ z_1)

  dw_1 = X.T @ dz_1 / num_samples
  # der(L/w_1) = der(L/z_1) * der(z_1/w_1) = dz_1 * der((X * W_1 + b_1)/ W_1)
  db_1 = sum(dz_1) / num_samples
  # der(L/b_1) = der(L/z_1) * der(z_1/b_1) = dz_1 * der((X * W_1 + b_1)/ b_1)
  return dw_1, db_1, dw_2, db_2

Urmeaza initializarea weight-urilor retelei, care se face random

In [134]:
import numpy.random as random

num_hidden_neurons = 5
miu = 0
sigma = 1

W_1 = random.normal(miu, sigma, (2, num_hidden_neurons))
b_1 = np.zeros(num_hidden_neurons)
W_2 = random.normal(miu, sigma, (num_hidden_neurons, 1))
b_2 = np.zeros(1)

Algoritmul gradient descent:
- Folosim functia forward pentru a calcula valorile intermediare (z_1, a_1, z_2 a_2)
- Cu functia backward calculam gradientii
- Updatam weight-urile cu gradientii obtinuti si learning_rate-ul stabilit

In [None]:
# Datele de train pentru XOR
x = np.array([[0,0], [0,1], [1,0], [1,1]])
y = np.array([[0], [1], [1], [0]]) # Folosim forma de matrice coloana pentru a putea matematic sa ne potrivim cu forma predicției a_2, care are si ea formă de coloana ([batch_size × 1] = [4 x 1])

# Hiperparametri
epochs = 70
learning_rate = 0.5

for i in range(epochs):

  # Afișăm dreapta de decizie curenta
  plot_decision(x, W_1, W_2, b_1, b_2)

  z_1, a_1, z_2, a_2 = forward(x, W_1, b_1, W_2, b_2)

  loss = (-y * np.log(a_2) - (1 - y) * np.log(1 - a_2)).mean()
  accuracy = (np.round(a_2) == y).mean()
  print(accuracy)
  
  dw_1, db_1, dw_2, db_2 = backward(a_1, a_2, z_1, W_2, x, y, len(x))

  W_1 -= learning_rate * dw_1
  b_1 -= learning_rate * db_1
  W_2 -= learning_rate * dw_2
  b_2 -= learning_rate * db_2