## Question 1


In [4]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
class Perceptron:
    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state
    def fit(self, X, y):
        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=X.shape[1])
        self.b_ = np.float64(0.)
        self.errors_ = []
        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                update = self.eta * (target - self.predict(xi))
                self.w_ += update * xi
                self.b_ += update
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self
    def net_input(self, X):
        return np.dot(X, self.w_) + self.b_
    def predict(self, X):
        return np.where(self.net_input(X) >= 0.0, 1, 0)
s='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
df=pd.read_csv(s, header=None, encoding='utf-8')
y = df.iloc[0:100, 4].values
y = np.where(y == 'Iris-setosa', 0, 1)
X = df.iloc[0:100, [0, 2]].values
ppn = Perceptron(eta=0.001, n_iter=10)
ppn.fit(X, y)
print('Perceptron Loss:',ppn.errors_)
print('Number of updates in Perceptron:',ppn.n_iter-ppn.errors_.count(0))
margin=abs(np.min((np.dot(X, ppn.w_)+ppn.b_)/np.linalg.norm(ppn.w_)))
print('Perceptron Margin:',margin)
print('For Perceptron converged in weight ',ppn.w_,' & bias ',ppn.b_)
class AdalineGD:
    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state
    def fit(self, X, y):
        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=X.shape[1])
        self.b_ = np.float64(0.)
        self.losses_ = []
        for i in range(self.n_iter):
            net_input = self.net_input(X)
            output = self.activation(net_input)
            errors = (y - output)
            self.w_ += self.eta * 2.0 * X.T.dot(errors) / X.shape[0]
            self.b_ += self.eta * 2.0 * errors.mean()
            loss = (errors**2).mean()
            self.losses_.append(loss)
        return self
    def net_input(self, X):
        return np.dot(X, self.w_) + self.b_
    def activation(self, X):
        return X
    def predict(self, X):
        return np.where(self.activation(self.net_input(X))>= 0.5, 1, 0)
s='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
df=pd.read_csv(s, header=None, encoding='utf-8')
y = df.iloc[0:100, 4].values
y = np.where(y == 'Iris-setosa', 0, 1)
X = df.iloc[0:100, [0, 2]].values
agd = AdalineGD(eta=0.001, n_iter=10)
agd.fit(X, y)
df = pd.DataFrame(agd.losses_)
print('AdalineGD Loss:',df)
print('Number of updates in AdalineGD:',agd.n_iter-agd.losses_.count(0))
margin=abs(np.min((np.dot(X, agd.w_)+agd.b_)/np.linalg.norm(agd.w_)))
print('AdalineGD Margin:',margin)
print('For AdalineGD converged in weight ',agd.w_,' & bias ',agd.b_)


Perceptron Loss: [4, 3, 2, 3, 2, 3, 1, 0, 0, 0]
Number of updates in Perceptron: 7
Perceptron Margin: 1.718363763129416
For Perceptron converged in weight  [-0.00175655  0.00538244]  & bias  -0.006
AdalineGD Loss:           0
0  0.434769
1  0.394333
2  0.360073
3  0.331027
4  0.306386
5  0.285465
6  0.267688
7  0.252565
8  0.239685
9  0.228699
Number of updates in AdalineGD: 10
AdalineGD Margin: 4.472517997318707
For AdalineGD converged in weight  [0.05138196 0.02315132]  & bias  0.005647942224158802


Loss function

In perceptron learning, Loss = $\sum_{n=1}^{n=100}{(y_i - y\hat{i})} $

In Adaline, Loss = $\frac{1}{n}\sum_{n=1}^{n=100}{(y_i - \hat{y}_i)^2} $

Number of updates

Number of updates = Number of iteration - Number of non-zero loss function. 
In perceptron number of nonzero loss function is 3, resulting number of updates = 10-3 =7

In case of Adaline, there is no nonzero loss function resulting number of update = 10-0=10

Margin

The margin refers to the distance between the decision boundary and the nearest data point. A larger margin typically indicates better generalization.
So, margin = $\min {\frac{W^Tx+b}{|W|}} $. 
In case of Perceptron, $ W=W_j+\Delta W_j=W_j+\eta(y_i-\hat{y}_i)x_j $. So, The Perceptron does not try to find the decision boundary that maximizes the margin.

In case of Adaline, $ W=W_j+\Delta W_j=W_j-\eta\frac{\partial L}{\partial W_j} $. As Adaline directly minimizes the error across all data points, the margin is higher. 