## 1. Perceptron Supporting Code

## 1.1 Perceptron Learning Ts vs Js - Step Through Manually
Manually walk through the training procedure of Figure 1.4

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Hand-coded 4x4 training images; the +1 entries trace the letter, everything else is -1
t_left = [[ 1,  1,  1, -1],    # T Shifted Left
          [-1,  1, -1, -1],
          [-1,  1, -1, -1],
          [-1,  1, -1, -1]]
t_right = [[-1,  1,  1,  1],   # T Shifted Right
           [-1, -1,  1, -1],
           [-1, -1,  1, -1],
           [-1, -1,  1, -1]]
j_left = [[-1, -1,  1, -1],    # J Shifted Left
          [-1, -1,  1, -1],
          [ 1, -1,  1, -1],
          [ 1,  1,  1, -1]]
j_right = [[-1, -1, -1,  1],   # J Shifted Right
           [-1, -1, -1,  1],
           [-1,  1, -1,  1],
           [-1,  1,  1,  1]]

# Stack the four images into a single (4, 4, 4) array: example x row x column
examples = np.array([t_left, t_right, j_left, j_right])

In [None]:
# Show the four training images in a 2x2 grid, one titled panel per example.
# plt.subplots creates a fresh figure each time (instead of reusing figure number 0)
# and hands back the axes directly, so each panel can be labelled explicitly.
titles = ["T Shifted Left", "T Shifted Right", "J Shifted Left", "J Shifted Right"]
fig, axes = plt.subplots(2, 2, figsize=(6, 6))
for ax, image, title in zip(axes.flat, examples, titles):
    ax.imshow(image)
    ax.set_title(title)   # Titles follow the order in which the examples were coded
fig.tight_layout()        # Keep the titles from overlapping neighbouring panels

In [None]:
# Target outputs: the machine should produce a positive voltage for T shapes.
# Examples 0 and 1 are Ts, so their label is +1; examples 2 and 3 are Js, so
# their label is -1.
y = np.array([1,1,-1,-1])

# Flatten each 4x4 image into a 16-value row, then append a 17th column of ones.
# That constant column is the bias input: a switch that is "always on", giving the
# model one extra parameter that does not depend on the image and helps it learn.
pixels = examples.reshape(-1, 16)
bias_column = np.ones((len(y), 1))
X = np.concatenate((pixels, bias_column), axis=1)

In [None]:
# Sanity-check the array shapes: X should be (4, 17) - 4 examples, each with
# 16 pixels plus 1 bias input - and y should be (4,), one label per example.
X.shape, y.shape

In [None]:
# Display the full input matrix.
X # 1 row for each example: 16 pixel values followed by the always-on bias input (1)

In [None]:
lr = 1.0               # Learning rate: scales how far each update moves the weights

# All weights start at zero, which is equivalent to turning each knob to 12 o'clock.
# There are 16 pixel weights plus 1 bias weight.
w = np.zeros(16 + 1)

In [None]:
# Begin with the example at index 1; this converges a little faster than
# beginning at index 0.
i = 1

# The perceptron's output is the dot product of the example's inputs and the weights.
yhat = X[i] @ w

In [None]:
# Show the machine's output next to the target label for the current example.
yhat, y[i] # Machine outputs 0 (weights are still all zero), but we want it to output +1 (Case 1)

In [None]:
# Perceptron learning rule, Case 1 (output too low): add the example, scaled by the
# learning rate, to the weights. Inputs that are on (+1) turn their dial up and
# inputs that are off (-1) turn their dial down.
w = lr * X[i] + w

In [None]:
# The weights started at zero and the learning rate is 1, so after this single update
# the 16 pixel weights are an exact copy of the example we just trained on
# (index 1, the T shifted right). View them as a 4x4 image; the bias weight is left out.
np.reshape(w[:16], (4, 4))

In [None]:
# Advance the counter to the next example and display it
i = i + 1
i

In [None]:
# Perceptron output for the current example: inputs dotted with the weights
yhat = X[i].dot(w)

In [None]:
# Show the machine's output next to the target label for the current example.
yhat, y[i] # Machine outputs + (a positive value), but we want it to output - (Case 2)

In [None]:
# Case 2: the machine output a + but we wanted a -, so *subtract* the example,
# scaled by the learning rate, from the weights.
w = w - X[i] * lr

In [None]:
# View the 16 pixel weights as a 4x4 image (the 17th, bias, weight is left out)
w[:16].reshape((4, 4))

In [None]:
# Advance the counter to the next example and display it
i = i + 1
i

In [None]:
# Perceptron output for the current example: inputs dotted with the weights
yhat = np.inner(X[i], w)

In [None]:
# Show the machine's output next to the target label for the current example.
yhat, y[i] # Machine outputs + (a positive value), but we want it to output - (Case 2)

In [None]:
# Case 2 again: the machine output a + but we wanted a -, so subtract the example,
# scaled by the learning rate, from the weights.
w = np.subtract(w, lr * X[i])

In [None]:
# View the 16 pixel weights as a 4x4 image (the 17th, bias, weight is left out)
np.reshape(w[:16], (4, 4))

In [None]:
i = 0 # We've reached the end of our examples (index 3), so start over at index 0

In [None]:
# Perceptron output for the current example: inputs dotted with the weights
yhat = X[i] @ w

In [None]:
# Show the machine's output next to the target label for the current example.
yhat, y[i] # Machine outputs +, and we want a +, so do not update weights. 

In [None]:
# Final check: for every example, print the machine's output beside its target label
for i, target in enumerate(y):
    yhat = X[i] @ w   # Perceptron output for example i
    print(yhat, target)

Signs match in each case! Our perceptron is correctly classifying all examples. 

## 1.2 Perceptron Learning Ts vs Js - Step through in automated loop

In [None]:
# Hand-coded 4x4 training images; the +1 entries trace the letter, everything else is -1
examples = np.array([
    # T Shifted Left
    [[ 1,  1,  1, -1],
     [-1,  1, -1, -1],
     [-1,  1, -1, -1],
     [-1,  1, -1, -1]],
    # T Shifted Right
    [[-1,  1,  1,  1],
     [-1, -1,  1, -1],
     [-1, -1,  1, -1],
     [-1, -1,  1, -1]],
    # J Shifted Left
    [[-1, -1,  1, -1],
     [-1, -1,  1, -1],
     [ 1, -1,  1, -1],
     [ 1,  1,  1, -1]],
    # J Shifted Right
    [[-1, -1, -1,  1],
     [-1, -1, -1,  1],
     [-1,  1, -1,  1],
     [-1,  1,  1,  1]],
])

In [None]:
# Show the four training images in a 2x2 grid, one titled panel per example.
# plt.subplots creates a fresh figure each time (instead of reusing figure number 0)
# and hands back the axes directly, so each panel can be labelled explicitly.
titles = ["T Shifted Left", "T Shifted Right", "J Shifted Left", "J Shifted Right"]
fig, axes = plt.subplots(2, 2, figsize=(6, 6))
for ax, image, title in zip(axes.flat, examples, titles):
    ax.imshow(image)
    ax.set_title(title)   # Titles follow the order in which the examples were coded
fig.tight_layout()        # Keep the titles from overlapping neighbouring panels

In [None]:
# Target outputs: the machine should produce a positive voltage for T shapes.
# The first 2 examples are Ts, so their label is +1; the second 2 examples are
# Js, so their label is -1.
y = np.array([1,1,-1,-1])

# Flatten each 4x4 image into a 16-value row and append a 17th column of ones
# for the bias term.
X = np.column_stack((examples.reshape(-1, 16), np.ones(len(y))))

# Learning rate
lr = 1.0

# Initialize all 17 weights to zero, which is equivalent to turning each knob to 12 o'clock
w = np.zeros(16 + 1)

In [None]:
# Cycle through the examples repeatedly, applying the perceptron learning rule.
n_iterations = 10                        # Number of training iterations to run
# range's stop value is exclusive, so starting at 1 needs n_iterations + 1 as the stop
# to really get n_iterations passes through the loop body.
for i in range(1, n_iterations + 1):
    idx = i % len(y)                     # % wraps around: only 4 samples but more iterations
    yhat = np.dot(X[idx], w)             # Perceptron output for the current example
    if yhat <= 0 and y[idx] > 0:         # Case 1: output not positive, target positive
        print(f"output is {yhat} but we want it to be {y[idx]}, updating weights.")
        w = w + lr*X[idx]
    elif yhat > 0 and y[idx] <= 0:       # Case 2: output positive, target not positive
        print(f"output is {yhat} but we want it to be {y[idx]}, updating weights.")
        w = w - lr*X[idx]
    else:                                # Output and target agree in sign: leave weights alone
        print(f"output is {yhat}, which has the same sign as our target {y[idx]}, "
              f"machine is correct, not updating weights.")

## 1.3 Two input perceptron - solvable case (OR gate)
Replicates results of Figure 1.10, Task 1
(baby perceptron learning OR gate)

In [None]:
# The four possible settings of the two switches, one 1x2 example each
examples = np.array([[[-1, -1]], [[-1, 1]], [[1, -1]], [[1, 1]]])

# OR gate targets: we want our machine to output + when either or both switches are on
y = np.array([-1, 1, 1, 1])

# One row per example: the 2 switch values followed by a 3rd column of ones for the bias term
switch_values = examples.reshape(-1, 2)
X = np.concatenate((switch_values, np.ones((len(y), 1))), axis=1)

# Learning rate
lr = 1.0

# Initialize weights to zeros, this is equivalent to turning each knob to 12 o'clock
w = np.zeros(2 + 1)

In [None]:
# Inspect the inputs: one row per switch setting; the columns are the two switch
# values followed by the always-on bias input (1).
X

In [None]:
# Inspect the starting weights (all zeros) and the target output for each example.
w, y

In [None]:
# Apply the perceptron learning rule for 11 steps, cycling through the 4 examples.
# The steps start at index 1 instead of 0 because the results are a little clearer that way.
for i in range(1, 12):
    idx = i % len(y)                  # Wrap around once we run past the last example
    x_i, target = X[idx], y[idx]
    yhat = np.dot(x_i, w)             # Perceptron output for this example
    print(f"step: {i}, current example: {x_i[:2]}, current weights = {w}")

    too_low = yhat <= 0 and target > 0     # Case 1
    too_high = yhat > 0 and target <= 0    # Case 2
    if too_low:
        print(f"output is {yhat} but we want it to be {target}, updating weights.")
        w = w + lr*x_i
    elif too_high:
        print(f"output is {yhat} but we want it to be {target}, updating weights.")
        w = w - lr*x_i
    else:
        print(f"output is {yhat}, which has the same sign as our target {target}, "
              f"machine is correct, not updating weights.")

## 1.4 - Two input perceptron - unsolvable XOR case
Replicates Results of Figure 1.10, Task 2

In [None]:
# The four possible settings of the two switches, one 1x2 example each
examples = np.array([
    [[-1, -1]],
    [[-1,  1]],
    [[ 1, -1]],
    [[ 1,  1]],
])

# XOR targets: machine should output + when either, but not both, switches are on
y = np.array([-1, 1, 1, -1])

# One row per example: the 2 switch values followed by a 3rd column of ones for the bias term
X = np.column_stack((examples.reshape(-1, 2), np.ones(len(y))))

# Initialize weights to zeros, this is equivalent to turning each knob to 12 o'clock
w = np.zeros((3,))

# Learning rate
lr = 1.0

In [None]:
# Inspect the inputs: one row per switch setting; the columns are the two switch
# values followed by the always-on bias input (1).
X

In [None]:
# Inspect the starting weights (all zeros) and the target output for each example.
w, y

In [None]:
# Apply the perceptron learning rule for 13 steps, cycling through the 4 examples.
for i in range(1, 14): # Starting at index 1, instead of 0, results are a little more clear this way.
    idx = i % len(y)              # Wrap around once we run past the last example
    yhat = np.dot(X[idx], w)      # Perceptron output for this example
    print(f"step: {i}, current example: {X[idx][:2]}, current weights = {w}")
    if yhat <= 0 and y[idx] > 0:       # Case 1: output not positive, target positive
        print(f"output is {yhat} but we want it to be {y[idx]}, updating weights.")
        w = w + lr*X[idx]
    elif yhat > 0 and y[idx] <= 0:     # Case 2: output positive, target not positive
        print(f"output is {yhat} but we want it to be {y[idx]}, updating weights.")
        w = w - lr*X[idx]
    else:
        # Two adjacent f-strings are joined into one message. A backslash continuation
        # inside a single string would also embed the next line's indentation in the
        # printed text.
        print(f"output is {yhat}, which has the same sign as our target {y[idx]}, "
              f"machine is correct, not updating weights.")

Note that our weights are stuck in a loop!

## 1.6 Compute Perceptron Error Across a Range of Values
Reproduces bowl-shaped error surface in Figure 1.24

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Linearly-separable case
X = np.array([[-1, -1], [-1, 1], [1, -1], [1, 1]])
y = np.array([[-1], [1], [1], [1]])  # OR operation: only the (-1, -1) input maps to -1
w0_range = np.arange(-1.9, 2.0, 0.2)
w1_range = np.arange(-1.9, 2.0, 0.2)
b = 1   # Bias term, held fixed while the two weights are swept

targets = y.ravel()   # Flatten the (4, 1) labels to line up with the 4 outputs

# Grid coordinates of every (w0, w1) combination, with w0 varying slowest
w0_points = np.repeat(w0_range, len(w1_range))
w1_points = np.tile(w1_range, len(w0_range))

# Mean squared error over all 4 examples at each grid point
error_points = np.array([
    np.mean((targets - (X[:, 0]*w0 + X[:, 1]*w1 + b))**2)
    for w0, w1 in zip(w0_points, w1_points)
])

# 3D scatter of the error surface: one dot per (w0, w1) pair, height = error
fig, ax = plt.subplots(figsize=(12, 8), subplot_kw={'projection': '3d'})

# Each dot is also coloured by its error value
scatter = ax.scatter(
    w0_points, w1_points, error_points,
    s=20, alpha=0.6,
    cmap='viridis', c=error_points,
)

# Axis labels and title
ax.set_zlabel('Mean Squared Error')
ax.set_ylabel('Weight w1')
ax.set_xlabel('Weight w0')
ax.set_title('Perceptron Error Surface\n(Linearly Separable Case)')

## 1.7 Train Small Network to Solve XOR Using LMS

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# XOR dataset: every combination of two binary inputs; the target is 1 when
# exactly one of the inputs is 1
X = torch.tensor([[a, b] for a in (0, 1) for b in (0, 1)], dtype=torch.float32)
y = torch.tensor([[a ^ b] for a in (0, 1) for b in (0, 1)], dtype=torch.float32)

# Simple 2-layer network: 2 inputs -> 2 hidden -> 1 output (similar to Fig. 1.30)
class XORNet(nn.Module):
    """Two-layer network with a sigmoid after each linear layer."""

    def __init__(self):
        """Create the hidden layer, the output layer and the shared sigmoid."""
        super().__init__()
        self.hidden = nn.Linear(in_features=2, out_features=2)  # 2 inputs -> 2 hidden neurons
        self.output = nn.Linear(in_features=2, out_features=1)  # 2 hidden neurons -> 1 output
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Feed x through the hidden layer, then the output layer, applying the sigmoid after each."""
        hidden_activations = self.sigmoid(self.hidden(x))
        return self.sigmoid(self.output(hidden_activations))

# Loss function: the same squared error that Widrow and Hoff used, just averaged
# across all 4 examples. This is known as "batch" or "minibatch" gradient descent.
criterion = nn.MSELoss()

# Network: its weights are chosen randomly, so training does not always converge -
# networks with more hidden units will converge more often.
model = XORNet()

# Optimizer: Adam instead of vanilla SGD, because SGD gets stuck when the model is
# only 2 neurons wide.
optimizer = optim.Adam(params=model.parameters(), lr=0.1)

# Training loop: 4000 epochs, each one a single update computed from all 4 examples
for epoch in range(4000):
    optimizer.zero_grad()         # Clear the gradients left over from the previous epoch
    outputs = model(X)            # Forward pass on every example
    loss = criterion(outputs, y)  # Mean squared error against the targets
    loss.backward()               # Backpropagate to get the gradients
    optimizer.step()              # Adjust the weights

    if epoch % 1000 != 0:         # Only report the loss every 1000 epochs
        continue
    print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# Evaluate the trained network on each example, with gradient tracking switched off
print("\nResults:")
with torch.no_grad():
    for features, target in zip(X, y):
        output = model(features.unsqueeze(0))   # Add a batch dimension: (2,) -> (1, 2)
        print(f"Input: {features.numpy()}, Target: {target.item()}, Output: {output.item():.4f}")