In [36]:
import numpy as np
from sklearn.datasets import load_iris

In [62]:
def create_data():
    """Load Iris and turn it into a binary "class 0 vs. rest" problem.

    Returns:
        tuple: ``(X, y)`` where ``X`` is ``iris.data`` exactly as loaded (every
        feature column is kept, not a subset) and ``y`` is a float32 column
        vector that is 1.0 where ``iris.target == 0`` and 0.0 elsewhere.
    """
    dataset = load_iris()
    features = dataset.data  # all feature columns are used

    # Class 0 is the positive class; every other class is the negative class.
    is_class_zero = dataset.target == 0
    labels = is_class_zero.astype(np.float32).reshape(-1, 1)

    return features, labels

In [63]:
# Load the features and binary labels once; the training cell further down
# calls create_data() again before normalizing X.
X,y = create_data()

In [64]:
def sigmoid(z):
    """Element-wise logistic function ``1 / (1 + exp(-z))`` without overflow.

    The textbook formula evaluates ``exp(-z)``, which overflows (and emits a
    RuntimeWarning) for large negative ``z``. Here the exponential is only
    ever taken of a non-positive number, so it always lies in [0, 1].

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        Values in [0, 1] with the same shape as ``z``; scalar input yields a
        NumPy scalar.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # exp of a non-positive value: cannot overflow
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- algebraically equal.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    return result[()]  # unwrap 0-d results so scalar input gives a scalar

In [65]:
def loss_fn(y_pred, y_true, theta, l1_lambda, l2_lambda):
    """Binary cross-entropy plus L1 and L2 penalties on the weights.

    Args:
        y_pred: Predicted probabilities; clipped to [1e-7, 1 - 1e-7] so the
            logarithms stay finite.
        y_true: Ground-truth labels (0 or 1), broadcast-compatible with y_pred.
        theta: Parameter array whose last entry along axis 0 is the bias; the
            bias is left out of both penalties.
        l1_lambda: Multiplier for the sum of absolute weights.
        l2_lambda: Multiplier for the sum of squared weights.

    Returns:
        Scalar: mean cross-entropy + L1 term + L2 term.
    """
    tiny = 1e-7
    probs = np.clip(y_pred, tiny, 1 - tiny)

    # Per-sample log-likelihood of the observed label.
    log_lik = y_true * np.log(probs) + (1 - y_true) * np.log(1 - probs)

    weights = theta[:-1]  # everything except the bias
    l1_term = l1_lambda * np.sum(np.abs(weights))
    l2_term = l2_lambda * np.sum(weights**2)

    return -np.mean(log_lik) + l1_term + l2_term

In [66]:
def predict(X, theta):
    """Return the logistic-model probability for every row of ``X``.

    Args:
        X: Feature matrix with one sample per row.
        theta: Parameter array; all entries but the last along axis 0 are the
            feature weights, the last one is the bias.

    Returns:
        ``sigmoid(X . weights + bias)``.
    """
    weights = theta[:-1]
    bias = theta[-1]
    linear_score = np.dot(X, weights) + bias
    return sigmoid(linear_score)

In [67]:
def calc_grad(X, y, theta, l1_lambda, l2_lambda):
    """Numerically estimate the gradient of the regularized loss w.r.t. theta.

    Central differences are used: each entry ``theta[k, 0]`` is shifted by
    ``+step`` and ``-step``, the loss is evaluated at both points, and the
    slope between them is stored. This costs two loss evaluations per
    parameter.

    Args:
        X: Feature batch passed through to ``predict``.
        y: Labels for the batch, passed through to ``loss_fn``.
        theta: Parameter array indexed as ``theta[k, 0]``; it is not modified.
        l1_lambda: L1 strength forwarded to ``loss_fn``.
        l2_lambda: L2 strength forwarded to ``loss_fn``.

    Returns:
        Array shaped like ``theta`` holding the estimated partial derivatives.
    """
    step = 1e-6

    def loss_at(params):
        # Regularized loss of the model described by `params` on this batch.
        return loss_fn(predict(X, params), y, params, l1_lambda, l2_lambda)

    grad = np.zeros_like(theta)
    for k in range(len(theta)):
        forward = theta.copy()
        forward[k, 0] += step

        backward = theta.copy()
        backward[k, 0] -= step

        grad[k, 0] = (loss_at(forward) - loss_at(backward)) / (2 * step)

    return grad

In [68]:
def optimizer(X, y, theta, learning_rate, n_epoch, batch_size, l1_lambda=0.0, l2_lambda=0.0):
    """Fit ``theta`` with shuffled mini-batch gradient descent.

    Every epoch draws a fresh permutation of the samples (from NumPy's global
    random state), walks through it in chunks of ``batch_size`` and takes one
    gradient step per chunk using ``calc_grad``. The full-data loss is printed
    at the end of each epoch.

    Args:
        X: Feature matrix, one sample per row.
        y: Labels aligned with the rows of ``X``.
        theta: Initial parameters. The array passed in is NOT modified; the
            trained parameters are returned as a new array.
        learning_rate: Step size applied to each gradient.
        n_epoch: Number of passes over the data.
        batch_size: Samples per gradient step; must be positive.
        l1_lambda: L1 regularization strength.
        l2_lambda: L2 regularization strength.

    Returns:
        The trained parameter array (same shape as ``theta``).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make the batch loop empty and silently skip training.
        raise ValueError("batch_size must be a positive integer")

    # Work on a private copy: the in-place update below would otherwise
    # overwrite the caller's initial parameters behind their back.
    theta = theta.copy()
    n = len(X)

    for epoch in range(n_epoch):
        print(f"Epoch {epoch+1}")

        # Shuffle data so each epoch sees the batches in a different order
        indices = np.random.permutation(n)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        for i in range(0, n, batch_size):
            xi = X_shuffled[i:i+batch_size]
            yi = y_shuffled[i:i+batch_size]

            grad = calc_grad(xi, yi, theta, l1_lambda, l2_lambda)
            theta -= learning_rate * grad

        # Report progress on the whole (unshuffled) data set
        loss = loss_fn(predict(X, theta), y, theta, l1_lambda, l2_lambda)
        print(f"Epoch Loss: {loss:.4f}")

    return theta

In [69]:
# --- Hyperparameters ---
learning_rate = 0.1
n_epoch = 5
batch_size = 16
l1_lambda = 0.01   # L1 regularization strength
l2_lambda = 0.01   # L2 regularization strength

# Seed NumPy's global RNG: both the random initialization of theta here and
# the per-epoch shuffling inside optimizer() draw from it, so without a seed
# every run of this cell gives different numbers.
np.random.seed(42)

X, y = create_data()

# Normalization: zero mean and unit variance per feature column
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# One weight per feature column plus one bias, derived from the data
# instead of being hard-coded.
theta = np.random.randn(X.shape[1] + 1, 1)

theta = optimizer(X, y, theta, learning_rate, n_epoch, batch_size, l1_lambda, l2_lambda)

print("\nFinal Theta:\n", theta)

# Evaluate model on the training data (there is no held-out split in this cell)
preds = predict(X, theta)
preds_cls = (preds > 0.5).astype(int)  # threshold the probabilities at 0.5
acc = np.mean(preds_cls == y)
print(f"Training Accuracy: {acc:.4f}")

Epoch 1
Epoch Loss: 0.5088
Epoch 2
Epoch Loss: 0.3834
Epoch 3
Epoch Loss: 0.3072
Epoch 4
Epoch Loss: 0.2596
Epoch 5
Epoch Loss: 0.2283

Final Theta:
 [[-0.01040935]
 [-0.55521137]
 [-2.09605162]
 [-0.96860695]
 [-1.21877507]]
Training Accuracy: 0.9733


In [None]:
# Multinomial logistic regression: write the code for this and explain it

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# --- your functions from before --- 

def one_hot(y, num_classes):
    """One-hot encode integer class labels.

    Args:
        y: Array-like of integer labels in ``[0, num_classes)``. Lists are
            accepted; multi-dimensional input such as an ``(n, 1)`` column is
            flattened.
        num_classes: Number of columns in the encoding.

    Returns:
        Float array of shape ``(number of labels, num_classes)`` with a single
        1 per row at the label's column.

    Raises:
        ValueError: If any label is negative. Negative indices would otherwise
            wrap around and silently mark the wrong class.
        IndexError: If a label is ``>= num_classes`` (raised by NumPy indexing).
    """
    labels = np.asarray(y).ravel()
    encoded = np.zeros((labels.size, num_classes))
    if labels.size == 0:
        return encoded

    if (labels < 0).any():
        raise ValueError("class labels must be non-negative")

    # Vectorized equivalent of "for each row i: encoded[i, labels[i]] = 1".
    encoded[np.arange(labels.size), labels] = 1
    return encoded

def softmax(z, axis=1):
    """Numerically stable softmax that leaves its input untouched.

    Args:
        z: Array-like of scores (logits).
        axis: Axis along which the probabilities are normalized. The default
            of 1 normalizes each row of a 2-D array.

    Returns:
        Array with the same shape as ``z`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    z = np.asarray(z)
    # Subtracting the maximum keeps exp() from overflowing and does not change
    # the result. A new array is created on purpose: an in-place `z -= ...`
    # would overwrite the caller's data.
    shifted = z - np.max(z, axis=axis, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / exp_z.sum(axis=axis, keepdims=True)

def compute_loss(y_true, y_pred, theta=None, l1_lambda=0.0, l2_lambda=0.0):
    """Mean categorical cross-entropy with optional L1/L2 weight penalties.

    Args:
        y_true: One-hot labels, one row per sample.
        y_pred: Predicted class probabilities, same shape as ``y_true``.
        theta: Optional parameter array whose last column is the bias (the
            layout ``predict`` uses). When given, the penalties are applied to
            every column except that bias column, matching how the binary
            ``loss_fn`` leaves its bias unregularized.
        l1_lambda: Multiplier for the sum of absolute weights.
        l2_lambda: Multiplier for the sum of squared weights.

    Returns:
        Scalar: data loss plus regularization loss (0 when ``theta`` is None).
    """
    m = y_true.shape[0]
    log_likelihood = -np.log(y_pred + 1e-15)  # small offset avoids log(0)
    data_loss = np.sum(y_true * log_likelihood) / m

    reg_loss = 0
    if theta is not None:
        weights = theta[..., :-1]  # drop the bias column before penalizing
        reg_loss = l1_lambda * np.sum(np.abs(weights)) + l2_lambda * np.sum(weights ** 2)

    return data_loss + reg_loss

def predict(X, theta):
    """Return softmax class probabilities for every row of ``X``.

    Args:
        X: Feature matrix, one sample per row.
        theta: 2-D parameter array with one row per class; all columns but the
            last are feature weights and the last column is the class bias.

    Returns:
        ``softmax`` of the per-class linear scores, one row per sample.
    """
    weights = theta[:, :-1]
    bias = theta[:, -1]
    logits = X @ weights.T + bias
    return softmax(logits)

def calc_grad(X, y, theta, l1_lambda=0.0, l2_lambda=0.0):
    """Numerically estimate the gradient of ``compute_loss`` w.r.t. theta.

    Each entry ``theta[i, j]`` is shifted by ``+step`` and ``-step`` in turn
    and the central-difference slope of the loss is recorded, i.e. two loss
    evaluations per parameter.

    Args:
        X: Feature batch passed through to ``predict``.
        y: One-hot labels for the batch.
        theta: 2-D parameter array (rows: classes; columns: features + bias).
            It is not modified.
        l1_lambda: L1 strength forwarded to ``compute_loss``.
        l2_lambda: L2 strength forwarded to ``compute_loss``.

    Returns:
        Array shaped like ``theta`` with the estimated partial derivatives.
    """
    step = 1e-5  # perturbation size picked by the original author for stability

    def loss_at(params):
        # Regularized loss of the model described by `params` on this batch.
        return compute_loss(y, predict(X, params), params, l1_lambda, l2_lambda)

    grad = np.zeros_like(theta)

    # Visit every (class, column) position in row-major order.
    for pos in np.ndindex(theta.shape[0], theta.shape[1]):
        raised = theta.copy()
        raised[pos] += step

        lowered = theta.copy()
        lowered[pos] -= step

        grad[pos] = (loss_at(raised) - loss_at(lowered)) / (2 * step)

    return grad

def optimizer(X, y, theta, learning_rate, epochs, batch_size, l1_lambda=0.01, l2_lambda=0.01):
    """Fit ``theta`` with mini-batch gradient descent.

    The rows of ``X``/``y`` are consumed in the order given (no shuffling), in
    chunks of ``batch_size``; each chunk produces one gradient step via
    ``calc_grad``.

    Args:
        X: Feature matrix, one sample per row.
        y: One-hot labels aligned with the rows of ``X``.
        theta: Initial parameters. The array passed in is NOT modified; the
            trained parameters are returned as a new array.
        learning_rate: Step size applied to each gradient.
        epochs: Number of passes over the data.
        batch_size: Samples per gradient step; must be positive.
        l1_lambda: L1 regularization strength.
        l2_lambda: L2 regularization strength.

    Returns:
        The trained parameter array (same shape as ``theta``).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make the batch loop empty and silently skip training.
        raise ValueError("batch_size must be a positive integer")

    # Work on a private copy: the in-place update below would otherwise
    # overwrite the caller's initial parameters.
    theta = theta.copy()
    n = X.shape[0]

    for epoch in range(epochs):
        for i in range(0, n, batch_size):
            xi = X[i:i+batch_size]
            yi = y[i:i+batch_size]
            grad = calc_grad(xi, yi, theta, l1_lambda, l2_lambda)
            theta -= learning_rate * grad

    return theta

# --- Now MAIN FUNCTION --- 

def main():
    """Train the softmax classifier on Iris and print test-set results.

    Steps: load Iris, one-hot encode the labels, make a seeded 80/20
    train/test split, initialize small random parameters from a seeded
    generator, train with ``optimizer``, then print the unregularized test
    loss, the predicted probabilities, and predicted vs. true classes.
    """
    # Load dataset
    iris = load_iris()
    X = iris.data
    y = iris.target

    # One-hot encode the labels
    num_classes = len(np.unique(y))
    y_one_hot = one_hot(y, num_classes)

    # Split the dataset (optional but good practice)
    X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)

    # Initialize theta: one row per class, (features + bias).
    # A local seeded generator makes the initialization reproducible (the
    # split above is already seeded) without touching NumPy's global RNG.
    num_features = X.shape[1]
    rng = np.random.default_rng(42)
    theta = rng.standard_normal((num_classes, num_features + 1)) * 0.01

    # Train the model
    theta = optimizer(X_train, y_train, theta,
                      learning_rate=0.01,
                      epochs=10,
                      batch_size=16,
                      l1_lambda=0.001,
                      l2_lambda=0.001)

    # Predict on test set; theta is not passed, so this loss has no penalty term
    y_pred = predict(X_test, theta)
    loss = compute_loss(y_test, y_pred)

    # Print results
    print("\nFinal Test Loss:", loss)
    print("\nPredicted class probabilities:\n", y_pred)
    print("\nPredicted classes:\n", np.argmax(y_pred, axis=1))
    print("\n True Classes:\n",  np.argmax(y_test, axis=1))

# Run the demo only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


1.0819815199225056
1.0819991924761856
1.081981354147634
1.0819993577947997
1.0819924010874398
1.0819883108271018
1.0819919300793537
1.0819887816242808
1.0819882671725678
1.0819924445282876
1.0819895853137442
1.0819911270675906
1.0819922418212828
1.0819884701163671
1.0819860456566923
1.0819946662496962
1.0819886697215881
1.0819920419810916
1.0819906104048493
1.0819901012955229
1.0819999633284452
1.081980749020905
1.0819974719336196
1.0819832399913665
1.0819926211150213
1.0819880907823518
1.0819904677530903
1.0819902439486597
1.081992189972862
1.0819885217264682
1.130849013577036
1.130834823636922
1.1308426622454375
1.1308411743350324
1.1308522675267265
1.1308315692785804
1.1308457398363254
1.1308380965635856
1.1308426132686555
1.1308412231078653
1.130844120772402
1.1308397163942634
1.1308438200669415
1.1308400165018886
1.1308423537212893
1.1308414830571991
1.1308420818017146
1.130841754595345
1.1308421743429853
1.1308416620323118
1.1308326213827804
1.130851215701017
1.1308392725358916
1