In [5]:
import pandas as pd
import numpy as np 

In [7]:
# Quick look at the raw MNIST training table: one 'label' column followed by
# the flattened pixel columns (pixel0 ... pixel783).
mdf = pd.read_csv("Assignment-10/MNIST_data/train.csv")

# Preview the first five rows as plain text ...
print(mdf.head(n=5))

# ... and let the cell echo the (rows, columns) tuple as its result.
mdf.shape

   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         0         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   pixel780  pixel781  pixel782  pixel783  
0         0         0         

(42000, 785)

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import io

# Suppress warnings (e.g., NumPy's RuntimeWarning from log(0)) for cleaner output.
# NOTE: filterwarnings('ignore') is called without a category or module, so it
# silences *every* Python warning raised after this point in the session,
# not only the numerical ones mentioned above.
import warnings
warnings.filterwarnings('ignore')

################################################################################
#
# PART 0: EVALUATION METRICS (FROM SCRATCH)
#
# These functions will be used for both Part 1 and Part 2.
#
################################################################################

def accuracy_score(y_true, y_pred):
    """
    Computes accuracy (the fraction of exactly matching labels) from scratch.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels (list, tuple, NumPy array or pandas Series).
    y_pred : array-like of shape (n_samples,)
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    float
        Share of positions where ``y_true`` and ``y_pred`` agree.
        Returns 0.0 when there are no samples.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Convert up front: comparing two plain Python lists with `==` yields a
    # single bool rather than an element-wise mask, which would silently
    # produce a wrong score for list inputs.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Refuse mismatched inputs instead of letting NumPy broadcast them.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}."
        )

    total_samples = y_true.size
    if total_samples == 0:
        # Nothing to score; avoid a division by zero.
        return 0.0

    correct_predictions = np.sum(y_true == y_pred)
    return correct_predictions / total_samples

def precision_recall_f1_score(y_true, y_pred, average='macro'):
    """
    Computes macro-averaged Precision, Recall, and F1-Score from scratch.

    Works for binary and multi-class problems. The per-class metrics are
    computed one-vs-rest and then averaged with equal weight per class.
    The set of classes is taken from ``y_true`` only, so a label that is
    predicted but never occurs in ``y_true`` does not get its own entry in
    the average.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels (list, tuple, NumPy array or pandas Series).
    y_pred : array-like of shape (n_samples,)
        Predicted labels, in the same order as ``y_true``.
    average : str, default 'macro'
        Averaging mode. Only 'macro' is implemented.

    Returns
    -------
    tuple of (precision, recall, f1)
        Macro-averaged scores. A class whose denominator is zero contributes
        0 for that metric. Returns (0.0, 0.0, 0.0) for empty input.

    Raises
    ------
    ValueError
        If ``average`` is not 'macro', or if the inputs differ in shape.
    """
    # Fail fast on an unsupported mode instead of after doing all the work.
    if average != 'macro':
        # We could implement 'micro' or 'weighted' here, but 'macro' is sufficient
        raise ValueError("Only 'macro' average is supported.")

    # Element-wise comparisons below need arrays; with plain lists
    # `y_true == c` would be a single False and every metric would be 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}."
        )

    # Get all unique classes from true labels
    classes = np.unique(y_true)
    if classes.size == 0:
        return 0.0, 0.0, 0.0

    precisions = []
    recalls = []
    f1_scores = []

    for c in classes:
        is_true_c = (y_true == c)
        is_pred_c = (y_pred == c)

        # True Positives: true label is c and predicted label is c
        tp = np.sum(is_true_c & is_pred_c)
        # False Positives: true label is NOT c but predicted label is c
        fp = np.sum(~is_true_c & is_pred_c)
        # False Negatives: true label is c but predicted label is NOT c
        fn = np.sum(is_true_c & ~is_pred_c)

        # Guard every ratio against an empty denominator.
        precision_c = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall_c = tp / (tp + fn) if (tp + fn) > 0 else 0

        if (precision_c + recall_c) > 0:
            f1_c = 2 * (precision_c * recall_c) / (precision_c + recall_c)
        else:
            f1_c = 0

        precisions.append(precision_c)
        recalls.append(recall_c)
        f1_scores.append(f1_c)

    # Macro average: unweighted mean of the per-class metrics.
    return np.mean(precisions), np.mean(recalls), np.mean(f1_scores)

def print_evaluation_metrics(y_true, y_pred, title="Evaluation Metrics"):
    """
    Prints a small report (accuracy plus macro precision/recall/F1) to stdout.

    The report starts with a blank line and a ``--- title ---`` header and
    ends with a dashed rule. Nothing is returned.
    """
    print(f"\n--- {title} ---")

    # The assignment PDF listed 'Accuracy' twice, so the reported set is
    # Accuracy, Precision, Recall and F1-Score.
    acc = accuracy_score(y_true, y_pred)
    macro_scores = precision_recall_f1_score(y_true, y_pred, average='macro')
    precision, recall, f1 = macro_scores

    report_lines = (
        f"Accuracy:  {acc:.4f}",
        f"Precision (Macro): {precision:.4f}",
        f"Recall (Macro):    {recall:.4f}",
        f"F1-Score (Macro):  {f1:.4f}",
        "---------------------------------",
    )
    for report_line in report_lines:
        print(report_line)


################################################################################
#
# PART 1: LOGISTIC (SOFTMAX) REGRESSION ON MNIST
#
################################################################################

class SoftmaxRegression:
    """
    Implements Softmax Regression (Multinomial Logistic Regression) from scratch,
    trained with full-batch gradient descent on the cross-entropy loss.

    This is for multi-class classification (e.g., MNIST digits 0-9). Labels do
    not have to be the contiguous integers 0..K-1: the sorted unique training
    labels are stored in ``classes_``, training works on their column indices,
    and ``predict`` maps indices back to the original labels.

    Attributes set by ``fit``:
        weights      -- array of shape (n_features, n_classes)
        bias         -- array of shape (1, n_classes)
        classes_     -- sorted unique labels seen during training
        n_classes    -- number of distinct labels
        model_params -- dict with the 'weights' and 'bias' arrays
    """
    def __init__(self, learning_rate=0.1, n_iterations=300, random_state=42):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.random_state = random_state
        self.weights = None
        self.bias = None
        self.classes_ = None
        self.n_classes = None
        # Private generator for reproducible weights. Seeding a local
        # RandomState yields the same draws as seeding the global NumPy RNG
        # with the same value, but does not reset the global stream as a
        # hidden side effect of constructing a model.
        self._rng = np.random.RandomState(random_state)
        print(f"SoftmaxRegression initialized (lr={learning_rate}, iterations={n_iterations})")

    def _one_hot(self, y, n_classes):
        """
        One-hot encodes integer class indices.

        ``y`` must hold integers in [0, n_classes); the result has shape
        (n_samples, n_classes) with a single 1 per row.
        """
        y_hot = np.zeros((y.shape[0], n_classes))
        y_hot[np.arange(y.shape[0]), y] = 1
        return y_hot

    def _softmax(self, z):
        """Computes the row-wise softmax of the score matrix ``z``."""
        # Subtract the row maximum for numerical stability (prevents overflow);
        # softmax is invariant to adding a constant per row.
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def _cross_entropy_loss(self, y_true_hot, y_pred_proba):
        """Computes the mean categorical cross-entropy loss."""
        # Add a small epsilon to avoid log(0)
        epsilon = 1e-9
        n_samples = y_true_hot.shape[0]
        loss = - (1 / n_samples) * np.sum(y_true_hot * np.log(y_pred_proba + epsilon))
        return loss

    def fit(self, X, y):
        """
        Trains the model using gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; any set of sortable values is accepted.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` disagree on the number of samples.
        """
        print("Starting model training...")

        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()

        # 1. Initialize parameters
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels."
            )

        # Map labels to column indices 0..K-1. Using the labels themselves as
        # indices breaks as soon as they are not exactly 0..K-1 (e.g. {3, 7}).
        self.classes_, y_index = np.unique(y, return_inverse=True)
        self.n_classes = len(self.classes_)

        # Initialize weights (n_features, n_classes) and bias (1, n_classes)
        self.weights = self._rng.randn(n_features, self.n_classes) * 0.01
        self.bias = np.zeros((1, self.n_classes))

        # 2. One-hot encode the true labels
        y_true_hot = self._one_hot(y_index, self.n_classes)

        # 3. Gradient Descent
        for i in range(self.n_iterations):
            # Scores (logits) and predicted probabilities
            z = X.dot(self.weights) + self.bias
            y_pred_proba = self._softmax(z)

            # Gradient of the loss w.r.t. the scores: (probabilities - one-hot)
            dz = y_pred_proba - y_true_hot

            # dW = (1/N) * X.T dot dz ; db = (1/N) * sum(dz)
            dw = (1 / n_samples) * X.T.dot(dz)
            db = (1 / n_samples) * np.sum(dz, axis=0, keepdims=True)

            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            if (i + 1) % 50 == 0:
                # Reported loss uses the probabilities computed before this
                # iteration's parameter update.
                loss = self._cross_entropy_loss(y_true_hot, y_pred_proba)
                print(f"  Iteration {i+1}/{self.n_iterations}, Loss: {loss:.4f}")

        print("Training complete.")
        self.model_params = {'weights': self.weights, 'bias': self.bias}

    def predict_proba(self, X):
        """
        Returns class probabilities of shape (n_samples, n_classes); column k
        corresponds to ``classes_[k]``.

        Raises RuntimeError if the model has not been fitted.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("Model not trained yet. Call fit() first.")
        z = np.asarray(X, dtype=float).dot(self.weights) + self.bias
        return self._softmax(z)

    def predict(self, X):
        """Returns, for each sample, the training label with the highest probability."""
        probabilities = self.predict_proba(X)
        best_column = np.argmax(probabilities, axis=1)
        # Translate column indices back to the original labels.
        return self.classes_[best_column]

# --- Q1: Required Helper Functions ---

def convert_array_to_image(arr, display=True):
    """
    Converts a 784-element array into a 28x28 image array.

    Parameters
    ----------
    arr : array-like with exactly 784 elements
        Flattened pixel values (list, tuple, NumPy array or pandas Series).
    display : bool, default True
        When True, the image is rendered with matplotlib and written to
        "converted_image.png" in the current working directory; the figure is
        then closed, so nothing is shown inline.

    Returns
    -------
    numpy.ndarray of shape (28, 28)
        The reshaped pixel array, which can be considered the "image".

    Raises
    ------
    ValueError
        If ``arr`` does not contain exactly 784 elements.
    """
    # Accept any array-like: plain lists have no .reshape method.
    pixels = np.asarray(arr)
    if pixels.size != 784:
        raise ValueError(
            f"Expected 784 pixel values for a 28x28 image, got {pixels.size}."
        )

    # Reshape the flat array (784) into a 2D array (28, 28)
    image_array = pixels.reshape(28, 28)

    if display:
        plt.imshow(image_array, cmap='gray')
        plt.title("Converted Image")
        plt.axis('off')
        # Save the plot to a file
        plt.savefig("converted_image.png")
        print("Saved converted_image.png")
        plt.close() # Close the plot to avoid showing it inline

    return image_array

def predict_single_image(array_784, model):
    """
    Classifies one flattened 28x28 image and saves a labelled picture of it.

    ``array_784`` holds the raw (0-255) pixel values; ``model`` is a trained
    model exposing ``predict``. The pixels are scaled to 0-1 (as during
    training) before prediction, while the saved picture uses the raw values.
    Returns ``(image_2d, predicted_label)`` where ``image_2d`` is the 28x28
    view of the input.
    """
    print("Predicting for a single image...")

    # Scale to 0-1 and shape as a single-row batch: (1 sample, 784 features).
    model_input = (array_784 / 255.0).reshape(1, 784)

    # The model returns one label per row; take the only one.
    predicted_label = model.predict(model_input)[0]

    # The picture is built from the original, non-normalized pixels.
    image_2d = array_784.reshape(28, 28)

    print(f"Predicted Label: {predicted_label}")

    # Render the image with its prediction, write it to disk, then close the
    # figure so it is not shown inline.
    plt.imshow(image_2d, cmap='gray')
    plt.title(f"Predicted Label: {predicted_label}")
    plt.axis('off')
    plt.savefig(f"predicted_image_label_{predicted_label}.png")
    print(f"Saved predicted_image_label_{predicted_label}.png")
    plt.close()

    return image_2d, predicted_label

# --- Q1: Main Execution ---

def run_part_1():
    """
    Driver for Part 1: softmax regression on the MNIST CSV files.

    Loads train.csv/test.csv, scales pixels to 0-1, makes a seeded 80/20
    train/validation split of train.csv, trains SoftmaxRegression, prints
    validation metrics, predicts test.csv, and finally demonstrates the two
    image helper functions on one test row. Returns None; if either CSV is
    missing, an error line is printed and the function returns early.
    """
    print("===========================================")
    print("         Running Part 1: MNIST             ")
    print("===========================================")

    # 1. Load Data
    print("Loading data...")
    try:
        train_df = pd.read_csv("Assignment-10/MNIST_data/train.csv")
        test_df = pd.read_csv("Assignment-10/MNIST_data/test.csv")
    except FileNotFoundError:
        print("Error: train.csv or test.csv not found.")
        return
    else:
        print("train.csv and test.csv loaded.")

    # 2. Prepare Data
    print("Preparing data...")

    # train.csv carries the target in 'label'; everything else is pixels.
    # Pixel values are normalized from 0-255 to 0-1.
    labels_all = train_df['label'].to_numpy()
    pixels_all = train_df.drop(columns='label').to_numpy() / 255.0

    # The provided test.csv has no labels.
    X_test_final = test_df.to_numpy() / 255.0

    # 3. Create Validation Split
    # test.csv has no labels, so metrics come from a held-out slice of
    # train.csv. The seed is fixed for reproducible results.
    print("Creating 80/20 train/validation split...")
    n_total = pixels_all.shape[0]
    np.random.seed(42)
    shuffled_order = np.arange(n_total)
    np.random.shuffle(shuffled_order)

    cut = int(0.8 * n_total)
    train_idx = shuffled_order[:cut]
    val_idx = shuffled_order[cut:]

    X_train, y_train = pixels_all[train_idx], labels_all[train_idx]
    X_val, y_val = pixels_all[val_idx], labels_all[val_idx]

    print(f"Training set size:   {X_train.shape[0]} samples")
    print(f"Validation set size: {X_val.shape[0]} samples")

    # 4. Train the Model
    # Training from scratch is slow; 300 iterations is a compromise.
    # For higher accuracy, increase n_iterations (e.g., to 1000).
    model = SoftmaxRegression(learning_rate=0.1, n_iterations=300)
    model.fit(X_train, y_train)

    # 5. Compute Evaluation Metrics on Validation Set
    print("Computing metrics on validation set...")
    y_pred_val = model.predict(X_val)
    print_evaluation_metrics(
        y_val,
        y_pred_val,
        title="Q1: Softmax Regression Metrics (Validation Set)",
    )

    # 6. Predict on the 'test.csv' file
    print("Predicting labels for test.csv...")
    final_predictions = model.predict(X_test_final)
    print(f"First 10 predictions for test.csv: {final_predictions[:10]}")

    # 7. Demonstrate required helper functions
    print("\n--- Demonstrating Helper Functions (as required) ---")

    # One raw (non-normalized) row of the test set serves as the demo input.
    sample_index = 5
    sample_array = test_df.iloc[sample_index].to_numpy()

    # a) convert_array_to_image
    print("\nDemonstrating convert_array_to_image...")
    convert_array_to_image(sample_array, display=True)

    # b) predict_single_image -- it normalizes the raw array itself
    print("\nDemonstrating predict_single_image...")
    image, label = predict_single_image(sample_array, model)
    print(f"Function returned: label={label}, image_shape={image.shape}")


################################################################################
#
# PART 2: NAIVE BAYES ON BANK DATA
#
################################################################################

class MixedNaiveBayes:
    """
    Implements Naive Bayes from scratch for pandas DataFrames.

    Handles mixed data types:
    - Gaussian distribution for numerical features.
    - Categorical distribution (with Laplace/additive smoothing) for every
      other column (object, category, bool, string, ...).

    State populated by ``fit``:
        classes          -- sorted unique class labels
        class_priors     -- {class: P(class)}
        gaussian_params  -- {class: {column: (mean, std)}}
        likelihoods      -- {class: {column: {value: P(value | class)}}}, with
                            the extra key '_unknown_' holding the smoothed
                            probability of a value not seen for that class
        numerical_cols / categorical_cols -- column Index objects
    """
    def __init__(self, laplace=1):
        self.laplace = laplace # for Laplace/additive smoothing
        self.classes = None
        self.class_priors = {}
        self.likelihoods = {} # For categorical features
        self.gaussian_params = {} # For numerical features
        self.categorical_cols = None
        self.numerical_cols = None
        print(f"MixedNaiveBayes initialized (Laplace smoothing alpha={laplace})")

    def _gaussian_pdf(self, x, mean, std):
        """Calculates the Gaussian Probability Density Function."""
        # Add a small epsilon to std to prevent division by zero
        epsilon = 1e-9
        std = std + epsilon
        exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
        return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

    def _gaussian_log_pdf(self, x, mean, std):
        """
        Calculates the log of the Gaussian density directly.

        Working in log space avoids the underflow of exp() for values far
        from the mean, so distant values still rank classes correctly.
        """
        # Same epsilon as _gaussian_pdf to prevent division by zero
        epsilon = 1e-9
        std = std + epsilon
        return -0.5 * np.log(2 * np.pi * std ** 2) - ((x - mean) ** 2) / (2 * std ** 2)

    def fit(self, X, y):
        """
        Trains the Naive Bayes model.

        Parameters
        ----------
        X : pandas.DataFrame of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels, matched to the rows of ``X`` by position.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` disagree on the number of samples.
        """
        print("Starting model training...")

        # Work on a plain array so rows are matched by position, independent
        # of whatever index a pandas Series `y` carries.
        y_values = np.asarray(y)
        n_samples = X.shape[0]
        if y_values.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} rows but y has {y_values.shape[0]} labels."
            )

        self.classes = np.unique(y_values)

        # Start from a clean slate so a refit never keeps stale entries.
        self.class_priors = {}
        self.likelihoods = {}
        self.gaussian_params = {}

        # Identify column types from the DataFrame. Every non-numeric column
        # is treated as categorical so that bool / string-dtype columns are
        # not silently left out of the model.
        self.numerical_cols = X.select_dtypes(include=['number']).columns
        self.categorical_cols = X.columns.difference(self.numerical_cols, sort=False)

        print(f"  Found {len(self.numerical_cols)} numerical features.")
        print(f"  Found {len(self.categorical_cols)} categorical features.")

        for c in self.classes:
            # 1. Calculate Class Priors: P(y)
            X_c = X[y_values == c]
            total_count = X_c.shape[0]
            self.class_priors[c] = total_count / n_samples

            # 2. Calculate Gaussian Parameters (Mean, Std) for numerical features
            self.gaussian_params[c] = {}
            for col in self.numerical_cols:
                mean = X_c[col].mean()
                std = X_c[col].std()
                if pd.isna(std):
                    # The sample std is undefined for a single observation;
                    # treat it as zero spread (epsilon is added at use time).
                    std = 0.0
                self.gaussian_params[c][col] = (mean, std)

            # 3. Calculate Likelihoods for categorical features: P(x_i | y)
            self.likelihoods[c] = {}
            for col in self.categorical_cols:
                counts = X_c[col].value_counts()

                # Number of distinct values of this feature in the *entire*
                # training set: the denominator term of Laplace smoothing.
                n_unique_values = len(X[col].unique())
                denominator = total_count + (n_unique_values * self.laplace)

                # Smoothed probability of each value observed for this class
                self.likelihoods[c][col] = {}
                for val, count in counts.items():
                    self.likelihoods[c][col][val] = (count + self.laplace) / denominator

                # Probability used for any value that was not seen for this
                # class 'c' during training (i.e. a count of zero).
                self.likelihoods[c][col]['_unknown_'] = self.laplace / denominator

        print("Training complete.")

    def predict(self, X):
        """
        Makes predictions for new data.

        Parameters
        ----------
        X : pandas.DataFrame containing the columns seen during ``fit``.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            For each row, the class with the highest log-posterior (the first
            such class in sorted order on ties).

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if self.classes is None:
            raise RuntimeError("Model not trained yet. Call fit() first.")

        n_rows = X.shape[0]
        if n_rows == 0:
            return self.classes[:0]

        # One column of log-posteriors per class. Logs are used to prevent
        # numerical underflow from multiplying many small probabilities.
        log_posteriors = np.empty((n_rows, len(self.classes)), dtype=float)

        for k, c in enumerate(self.classes):
            # Start with the log of the class prior
            log_post = np.full(n_rows, np.log(self.class_priors[c]), dtype=float)

            # Add log-likelihood for numerical features (whole column at once)
            for col in self.numerical_cols:
                mean, std = self.gaussian_params[c][col]
                values = np.asarray(X[col], dtype=float)
                log_post += self._gaussian_log_pdf(values, mean, std)

            # Add log-likelihood for categorical features
            for col in self.categorical_cols:
                table = self.likelihoods[c][col]
                unseen_prob = table['_unknown_']
                # Fall back to the '_unknown_' probability for unseen values
                probs = np.array(
                    [table.get(val, unseen_prob) for val in X[col]],
                    dtype=float,
                )
                log_post += np.log(probs)

            log_posteriors[:, k] = log_post

        # The class with the highest log-posterior is the prediction
        return self.classes[np.argmax(log_posteriors, axis=1)]

# --- Q2: Main Execution ---

def run_part_2():
    """
    Driver for Part 2: Naive Bayes on the bank marketing data.

    Loads bank-full.csv (semicolon separated), encodes the 'y' target as
    0/1, makes a seeded 80/20 train/test split, trains MixedNaiveBayes and
    prints the evaluation metrics on the test split. Returns None; if the CSV
    is missing, an error line is printed and the function returns early.

    Raises
    ------
    ValueError
        If the 'y' column contains anything other than 'no' / 'yes'.
    """
    print("\n===========================================")
    print("         Running Part 2: Bank Data         ")
    print("===========================================")

    # 1. Load Data
    print("Loading data...")
    try:
        # The bank-full.csv file uses semicolons as delimiters
        data = pd.read_csv("Assignment-10/bank-full.csv", sep=';')
        print("bank-full.csv loaded.")
    except FileNotFoundError:
        print("Error: bank-full.csv not found.")
        return

    # 2. Prepare Data
    print("Preparing data...")
    # Convert target 'y' to numerical (0 or 1)
    # This makes it easier for our from-scratch metrics
    encoded_target = data['y'].map({'no': 0, 'yes': 1})
    if encoded_target.isna().any():
        # .map() turns unmapped values into NaN; fail loudly instead of
        # training on a target that silently contains NaN.
        unexpected = data.loc[encoded_target.isna(), 'y'].unique().tolist()
        raise ValueError(f"Unexpected values in target column 'y': {unexpected}")
    data['y'] = encoded_target

    # 3. Create Train/Test Split
    # *** ASSUMPTION ***
    # As Assignment 9 is not available, we create a standard 80/20 split.
    # We use random_state=42 for reproducibility.
    print("Creating 80/20 train/test split (as per assumption)...")

    # Simple split without scikit-learn: shuffle all rows, then cut at 80%.
    data_shuffled = data.sample(frac=1, random_state=42)
    split_point = int(0.8 * len(data_shuffled))

    train_data = data_shuffled[:split_point]
    test_data = data_shuffled[split_point:]

    X_train = train_data.drop('y', axis=1)
    y_train = train_data['y']
    X_test = test_data.drop('y', axis=1)
    y_test = test_data['y']

    print(f"Training set size: {X_train.shape[0]} samples")
    print(f"Test set size:     {X_test.shape[0]} samples")

    # 4. Train the Model
    nb_model = MixedNaiveBayes(laplace=1)
    nb_model.fit(X_train, y_train)

    # 5. Compute Evaluation Metrics on Test Set
    print("Computing metrics on test set...")
    y_pred_test = nb_model.predict(X_test)

    # Use the same from-scratch metrics from Part 0
    print_evaluation_metrics(y_test, y_pred_test, title="Q2: Naive Bayes Metrics (Test Set)")


################################################################################
#
#                           MAIN EXECUTION
#
################################################################################

# Top-level driver: runs both assignment parts in order inside one try block,
# so Part 2 only starts if Part 1 finishes without raising. Every exception is
# turned into a printed message, which means this cell itself never fails.
try:
    # Run Part 1 (MNIST)
    run_part_1()
    
    # Run Part 2 (Bank Data)
    run_part_2()
    
except FileNotFoundError as e:
    # run_part_1/run_part_2 already catch FileNotFoundError around their own
    # pd.read_csv calls (they print an error and return), so this branch only
    # fires for a FileNotFoundError raised elsewhere inside those functions.
    print(f"\nExecution failed: {e}")
    print("Please ensure 'train.csv', 'test.csv', and 'bank-full.csv' are uploaded.")
except Exception as e:
    # Catch-all: only the exception message is printed, not the traceback.
    print(f"\nAn unexpected error occurred: {e}")

         Running Part 1: MNIST             
Loading data...
train.csv and test.csv loaded.
Preparing data...
Creating 80/20 train/validation split...
Training set size:   33600 samples
Validation set size: 8400 samples
SoftmaxRegression initialized (lr=0.1, iterations=300)
Starting model training...
  Iteration 50/300, Loss: 0.8000
  Iteration 100/300, Loss: 0.6066
  Iteration 150/300, Loss: 0.5287
  Iteration 200/300, Loss: 0.4847
  Iteration 250/300, Loss: 0.4556
  Iteration 300/300, Loss: 0.4347
Training complete.
Computing metrics on validation set...

--- Q1: Softmax Regression Metrics (Validation Set) ---
Accuracy:  0.8871
Precision (Macro): 0.8861
Recall (Macro):    0.8856
F1-Score (Macro):  0.8854
---------------------------------
Predicting labels for test.csv...
First 10 predictions for test.csv: [2 0 9 7 3 7 0 3 0 3]

--- Demonstrating Helper Functions (as required) ---

Demonstrating convert_array_to_image...
Saved converted_image.png

Demonstrating predict_single_image...
