# Neural Network Prediction Model using IRIS Dataset

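This notebook demonstrates a basic Multi-Layer Perceptron (MLP) with three layers, trained to classify the Iris dataset. The layers are purely linear: no activation functions are applied between them, and only a softmax is applied at the output to turn the final scores into class probabilities.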

## Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report

## Load and Prepare Data

In [None]:
# Fetch the bundled Iris data: feature matrix X and integer class labels y
iris = load_iris()
X, y = iris.data, iris.target

# Summarize what was loaded (one line per fact)
print(
    f"Dataset shape: {X.shape}",
    f"Number of classes: {len(np.unique(y))}",
    f"Feature names: {iris.feature_names}",
    f"Target names: {iris.target_names}",
    sep="\n",
)

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training features only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the one-hot encoder on the training labels, then encode both label sets
encoder = OneHotEncoder(sparse_output=False).fit(y_train.reshape(-1, 1))
y_train_encoded = encoder.transform(y_train.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.reshape(-1, 1))

print(
    f"Training set size: {X_train.shape[0]}",
    f"Test set size: {X_test.shape[0]}",
    sep="\n",
)

## Define MLP Model (3 Layers, No Activation Functions)

In [None]:
class MLP:
    """Three-layer perceptron with purely linear layers and a softmax output.

    The forward pass computes ``softmax(((X @ W1 + b1) @ W2 + b2) @ W3 + b3)``;
    no activation function is applied between the layers. Training is
    full-batch gradient descent on the cross-entropy loss.

    Attributes:
        W1, W2, W3: Weight matrices of shape (input_size, hidden1_size),
            (hidden1_size, hidden2_size) and (hidden2_size, output_size).
        b1, b2, b3: Bias row vectors of shape (1, layer_width), initialized to zero.
        z1, z2, z3: Layer outputs cached by the most recent ``forward`` call.
        output: Softmax probabilities cached by the most recent ``forward`` call.
    """

    def __init__(self, input_size, hidden1_size, hidden2_size, output_size, seed=42):
        """Initialize weights with Xavier (Glorot) scaling and biases with zeros.

        Args:
            input_size: Number of input features.
            hidden1_size: Width of the first hidden layer.
            hidden2_size: Width of the second hidden layer.
            output_size: Number of output classes.
            seed: Seed for the private ``RandomState`` that draws the weights,
                so two models built with the same arguments start identical.
        """
        rng = np.random.RandomState(seed)

        # Layer 1: Input -> Hidden1 (standard normal scaled by sqrt(2 / (fan_in + fan_out)))
        self.W1 = rng.randn(input_size, hidden1_size) * np.sqrt(2.0 / (input_size + hidden1_size))
        self.b1 = np.zeros((1, hidden1_size))

        # Layer 2: Hidden1 -> Hidden2 (same scaling rule)
        self.W2 = rng.randn(hidden1_size, hidden2_size) * np.sqrt(2.0 / (hidden1_size + hidden2_size))
        self.b2 = np.zeros((1, hidden2_size))

        # Layer 3: Hidden2 -> Output (same scaling rule)
        self.W3 = rng.randn(hidden2_size, output_size) * np.sqrt(2.0 / (hidden2_size + output_size))
        self.b3 = np.zeros((1, output_size))

    @staticmethod
    def _check_same_shape(targets, predictions, caller):
        """Raise ValueError unless targets and predictions have identical shapes."""
        if targets.shape != predictions.shape:
            raise ValueError(
                f"{caller}: targets have shape {targets.shape} but predictions have "
                f"shape {predictions.shape}; pass one-hot targets with one row per sample."
            )

    def forward(self, X):
        """Run the forward pass and cache every intermediate result on ``self``.

        Args:
            X: Input batch with one sample per row.

        Returns:
            Array of softmax probabilities with one row per sample.
        """
        # Layer 1
        self.z1 = np.dot(X, self.W1) + self.b1

        # Layer 2
        self.z2 = np.dot(self.z1, self.W2) + self.b2

        # Layer 3 (Output)
        self.z3 = np.dot(self.z2, self.W3) + self.b3

        # Softmax; subtracting the row-wise max keeps np.exp from overflowing
        exp_z3 = np.exp(self.z3 - np.max(self.z3, axis=1, keepdims=True))
        self.output = exp_z3 / np.sum(exp_z3, axis=1, keepdims=True)

        return self.output

    def backward(self, X, y, learning_rate):
        """Apply one gradient-descent step using the values cached by ``forward``.

        ``forward(X)`` must have been called on the same batch immediately before,
        because the gradients are built from ``self.output``, ``self.z2`` and ``self.z1``.

        Args:
            X: The input batch that was passed to ``forward``.
            y: One-hot targets with the same shape as ``self.output``.
            learning_rate: Step size applied to every parameter.

        Raises:
            ValueError: If ``y`` is not shaped like the cached predictions (NumPy
                would otherwise broadcast it and silently yield wrong gradients),
                or if the batch is empty (the mean gradient would be 0/0 and
                overwrite every parameter with NaN).
        """
        y = np.asarray(y)
        self._check_same_shape(y, self.output, "backward")

        m = X.shape[0]
        if m == 0:
            raise ValueError("backward: cannot take a gradient step on an empty batch.")

        # Output layer gradient (softmax combined with cross-entropy)
        dz3 = self.output - y
        dW3 = np.dot(self.z2.T, dz3) / m
        db3 = np.sum(dz3, axis=0, keepdims=True) / m

        # Hidden layer 2 gradient (no activation, so no derivative factor)
        dz2 = np.dot(dz3, self.W3.T)
        dW2 = np.dot(self.z1.T, dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        # Hidden layer 1 gradient
        dz1 = np.dot(dz2, self.W2.T)
        dW1 = np.dot(X.T, dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        # Update only after all gradients are computed, so each one uses the pre-step weights
        self.W3 -= learning_rate * dW3
        self.b3 -= learning_rate * db3
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1

    def compute_loss(self, y_true, y_pred):
        """Compute the mean cross-entropy loss over the batch.

        Args:
            y_true: One-hot targets.
            y_pred: Predicted probabilities with the same shape as ``y_true``.

        Returns:
            The summed cross-entropy divided by the number of rows in ``y_true``.

        Raises:
            ValueError: If the two arrays differ in shape (broadcasting would
                otherwise produce a meaningless loss without any error).
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        self._check_same_shape(y_true, y_pred, "compute_loss")

        m = y_true.shape[0]
        epsilon = 1e-15
        # Clip so that log() never sees exactly 0
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        loss = -np.sum(y_true * np.log(y_pred)) / m
        return loss

    def train(self, X, y, epochs, learning_rate, verbose=True):
        """Train with full-batch gradient descent.

        Args:
            X: Training inputs with one sample per row.
            y: One-hot training targets.
            epochs: Number of gradient steps to take.
            learning_rate: Step size for every update.
            verbose: When true, print the loss every 100 epochs.

        Returns:
            List with one loss per epoch; each value is measured before that
            epoch's weight update.

        Raises:
            ValueError: Propagated from ``compute_loss`` / ``backward`` when ``y``
                does not match the prediction shape or the batch is empty.
        """
        losses = []

        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward(X)

            # Compute loss
            loss = self.compute_loss(y, y_pred)
            losses.append(loss)

            # Backward pass
            self.backward(X, y, learning_rate)

            if verbose and (epoch + 1) % 100 == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")

        return losses

    def predict(self, X):
        """Return the most probable class index for each row of ``X``.

        Note that this calls ``forward`` and therefore overwrites the cached
        activations used by ``backward``.
        """
        probabilities = self.forward(X)
        return np.argmax(probabilities, axis=1)

## Train the Model

In [None]:
# Build the network: 4 input features -> 8 hidden -> 6 hidden -> 3 output classes
model = MLP(
    input_size=4,
    hidden1_size=8,
    hidden2_size=6,
    output_size=3,
)

# Run full-batch gradient descent and keep the per-epoch loss history
print("Training the MLP...\n")
losses = model.train(
    X_train,
    y_train_encoded,
    epochs=1000,
    learning_rate=0.1,
)

## Evaluate the Model

In [None]:
# Predicted class index for every held-out sample
y_pred = model.predict(X_test)

# Share of test samples whose predicted class matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Per-class metrics, labelled with the species names
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=iris.target_names),
    sep="\n",
)

## Visualize Training Loss

In [None]:
# Plot the per-epoch training loss on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(losses)
ax.set_title('Training Loss over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid(True)
plt.show()

## Summary

This notebook implemented a 3-layer MLP without activation functions for the IRIS classification task:

- **Layer 1**: Input (4 features) → Hidden (8 neurons)
- **Layer 2**: Hidden (8 neurons) → Hidden (6 neurons)
- **Layer 3**: Hidden (6 neurons) → Output (3 classes)

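Note: Without activation functions between layers, the network is essentially a linear model, because a composition of linear (affine) transformations is itself linear (affine). The softmax applied at the output only converts the final scores into a probability distribution over the classes, so the model behaves like multinomial logistic (softmax) regression.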